* alloc_pages_bulk
@ 2002-07-22 18:40 Martin J. Bligh
  2002-07-22 18:56 ` alloc_pages_bulk Benjamin LaHaise
  2002-07-22 20:26 ` alloc_pages_bulk Andrew Morton
  0 siblings, 2 replies; 6+ messages in thread
From: Martin J. Bligh @ 2002-07-22 18:40 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Bill Irwin, linux-mm

Below is a first cut at a bulk page allocator. This has had no testing
whatsoever - it hasn't even been compiled. I just want to get some feedback
on the approach, so if I get slapped, I'm not too far down a path I have to
back out of. The __alloc_pages cleanup is also tacked on the end because
I'm too lazy to create separate diff trees - sorry ;-)

Comments, opinions, abuse?

M.
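
(Illustration only, not part of the patch - roughly how a caller might use
the proposed interface. As posted the return types are still in flux
(alloc_pages_bulk() is declared to return struct page * while _alloc_pages()
returns a count), so this sketch assumes the call ends up reporting how many
entries of the array were filled:)

	struct page *pages[16];
	unsigned long i, got;

	/* ask for up to 16 order-0 pages in one call */
	got = alloc_pages_bulk(GFP_KERNEL, 0, 16, pages);

	/* ... use pages[0] .. pages[got - 1] ... */

	/* give them back one at a time for now */
	for (i = 0; i < got; i++)
		__free_page(pages[i]);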

diff -urN virgin-2.5.25/include/linux/gfp.h 2.5.25-A04-alloc_pages_bulk/include/linux/gfp.h
--- virgin-2.5.25/include/linux/gfp.h	Fri Jul  5 16:42:19 2002
+++ 2.5.25-A04-alloc_pages_bulk/include/linux/gfp.h	Mon Jul 22 11:27:49 2002
@@ -45,12 +45,23 @@
 
 static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
 {
+	struct page *page;
 	/*
 	 * Gets optimized away by the compiler.
 	 */
 	if (order >= MAX_ORDER)
 		return NULL;
-	return _alloc_pages(gfp_mask, order);
+	return _alloc_pages(gfp_mask, order, 1, &page);
+}
+
+static inline struct page * alloc_pages_bulk(unsigned int gfp_mask, unsigned int order, unsigned long count, struct page **pages)
+{
+	/*
+	 * Gets optimized away by the compiler.
+	 */
+	if (order >= MAX_ORDER)
+		return NULL;
+	return _alloc_pages(gfp_mask, order, count, pages);
 }
 
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
diff -urN virgin-2.5.25/mm/numa.c 2.5.25-A04-alloc_pages_bulk/mm/numa.c
--- virgin-2.5.25/mm/numa.c	Fri Jul  5 16:42:20 2002
+++ 2.5.25-A04-alloc_pages_bulk/mm/numa.c	Mon Jul 22 11:14:53 2002
@@ -31,32 +31,29 @@
 
 #endif /* !CONFIG_DISCONTIGMEM */
 
-struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order)
+struct page * alloc_pages_node(int nid, unsigned int gfp_mask, 
+		unsigned int order)
 {
-#ifdef CONFIG_NUMA
-	return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK));
-#else
-	return alloc_pages(gfp_mask, order);
-#endif
+	struct page *page;
+
+	return __alloc_pages(gfp_mask, order, 
+		NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK),
+		1, &page);
 }
 
 #ifdef CONFIG_DISCONTIGMEM
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
-static spinlock_t node_lock = SPIN_LOCK_UNLOCKED;
-
 void show_free_areas_node(pg_data_t *pgdat)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&node_lock, flags);
 	show_free_areas_core(pgdat);
-	spin_unlock_irqrestore(&node_lock, flags);
 }
 
 /*
- * Nodes can be initialized parallely, in no particular order.
+ * Nodes can be initialized in parallel, in no particular order.
  */
 void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
 	unsigned long *zones_size, unsigned long zone_start_paddr, 
@@ -82,49 +79,69 @@
 	memset(pgdat->valid_addr_bitmap, 0, size);
 }
 
-static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask,
-	unsigned int order)
+/* 
+ * Walk the global list of pg_data_t's, starting at the specified start point 
+ */
+static inline unsigned long _alloc_pages_pgdat_walk(pg_data_t *start,
+		unsigned int gfp_mask, unsigned int order, 
+		unsigned long count, struct page **pages)
 {
-	return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
+	unsigned long page_count = 0;
+	pg_data_t *pgdat = start;
+	
+	/* walk the second part of the list from start to the list tail */
+	for (pgdat = start; pgdat != NULL; pgdat = pgdat->node_next) {
+		if ((page_count = __alloc_pages(gfp_mask, order, 
+						pgdat->node_zonelists + 
+						(gfp_mask & GFP_ZONEMASK),
+						count, pages)))
+			return(page_count);
+	}
+	/* walk the first part of the list from the list head to start */
+	for (pgdat = pgdat_list; pgdat != start; pgdat = pgdat->node_next) {
+		if ((page_count = __alloc_pages(gfp_mask, order, 
+						pgdat->node_zonelists + 
+						(gfp_mask & GFP_ZONEMASK),
+						count, pages)))
+			return(page_count);
+	}
+	return(0);
 }
 
+#ifdef CONFIG_NUMA
+
 /*
  * This can be refined. Currently, tries to do round robin, instead
  * should do concentratic circle search, starting from current node.
  */
-struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
+unsigned long _alloc_pages(unsigned int gfp_mask, unsigned int order,
+		unsigned long count, struct page **pages)
+{
+	/* start at the current node, then do round robin */	
+	return _alloc_pages_pgdat_walk(NODE_DATA(numa_node_id()), 
+		gfp_mask, order, count, pages);
+}
+
+#else /* !CONFIG_NUMA */
+
+unsigned long _alloc_pages(unsigned int gfp_mask, unsigned int order,
+		unsigned long count, struct page **pages)
 {
-	struct page *ret = 0;
-	pg_data_t *start, *temp;
-#ifndef CONFIG_NUMA
 	unsigned long flags;
 	static pg_data_t *next = 0;
-#endif
+	pg_data_t *temp;
 
-	if (order >= MAX_ORDER)
-		return NULL;
-#ifdef CONFIG_NUMA
-	temp = NODE_DATA(numa_node_id());
-#else
-	spin_lock_irqsave(&node_lock, flags);
+	/* 
+	 * As the next ptr is static, it saves position between calls
+	 * and we round robin between memory segments to balance pressure
+	 */
 	if (!next) next = pgdat_list;
 	temp = next;
 	next = next->node_next;
-	spin_unlock_irqrestore(&node_lock, flags);
-#endif
-	start = temp;
-	while (temp) {
-		if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
-			return(ret);
-		temp = temp->node_next;
-	}
-	temp = pgdat_list;
-	while (temp != start) {
-		if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
-			return(ret);
-		temp = temp->node_next;
-	}
-	return(0);
+
+	return _alloc_pages_pgdat_walk(temp, gfp_mask, order, count, pages);
 }
+
+#endif /* CONFIG_NUMA */
 
 #endif /* CONFIG_DISCONTIGMEM */
diff -urN virgin-2.5.25/mm/page_alloc.c 2.5.25-A04-alloc_pages_bulk/mm/page_alloc.c
--- virgin-2.5.25/mm/page_alloc.c	Fri Jul  5 16:42:03 2002
+++ 2.5.25-A04-alloc_pages_bulk/mm/page_alloc.c	Mon Jul 22 11:15:00 2002
@@ -187,16 +187,13 @@
 	set_page_count(page, 1);
 }
 
-static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
-static struct page * rmqueue(zone_t *zone, unsigned int order)
+static inline struct page * __rmqueue(zone_t *zone, unsigned int order)
 {
 	free_area_t * area = zone->free_area + order;
 	unsigned int curr_order = order;
 	struct list_head *head, *curr;
-	unsigned long flags;
 	struct page *page;
 
-	spin_lock_irqsave(&zone->lock, flags);
 	do {
 		head = &area->free_list;
 		curr = head->next;
@@ -213,21 +210,40 @@
 			zone->free_pages -= 1UL << order;
 
 			page = expand(zone, page, index, order, curr_order, area);
-			spin_unlock_irqrestore(&zone->lock, flags);
-
-			if (bad_range(zone, page))
-				BUG();
-			prep_new_page(page);
 			return page;	
 		}
 		curr_order++;
 		area++;
 	} while (curr_order < MAX_ORDER);
-	spin_unlock_irqrestore(&zone->lock, flags);
 
 	return NULL;
 }
 
+static FASTCALL(unsigned long rmqueue(zone_t *zone, unsigned int order, 
+			unsigned long count, struct page *pages[]));
+static unsigned long rmqueue(zone_t *zone, unsigned int order, 
+		unsigned long count, struct page **pages)
+{
+	unsigned long flags;
+	int i, allocated = 0;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	for (i = 0; i < count; ++i) {
+		pages[i] = __rmqueue(zone, order);
+		if (pages[i] == NULL)
+			break;
+		++allocated;
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+	
+	for (i = 0; i < allocated; ++i) {
+		if (bad_range(zone, pages[i]))
+			BUG();
+		prep_new_page(pages[i]);
+	}
+	return allocated;	
+}
+
 #ifdef CONFIG_SOFTWARE_SUSPEND
 int is_head_of_free_region(struct page *page)
 {
@@ -253,10 +269,12 @@
 #endif /* CONFIG_SOFTWARE_SUSPEND */
 
 #ifndef CONFIG_DISCONTIGMEM
-struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
+unsigned long _alloc_pages(unsigned int gfp_mask, unsigned int order,
+		unsigned long count, struct page **pages)
 {
 	return __alloc_pages(gfp_mask, order,
-		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
+		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK),
+		count, pages);
 }
 #endif
 
@@ -316,26 +334,27 @@
 /*
  * This is the 'heart' of the zoned buddy allocator:
  */
-struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
+unsigned long __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist, unsigned long count, struct page **pages)
 {
 	unsigned long min;
-	zone_t **zone, * classzone;
+	zone_t **zones, * classzone;
 	struct page * page;
-	int freed;
+	int freed, i;
 
-	zone = zonelist->zones;
-	classzone = *zone;
-	if (classzone == NULL)
+	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+	classzone = zones[0]; 
+	if (classzone == NULL)    /* no zones in the zonelist */
 		return NULL;
-	min = 1UL << order;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
 
+	/* Go through the zonelist once, looking for a zone with enough free */
+	min = count << order;
+	for (i = 0; zones[i] != NULL; i++) {
+		zone_t *z = zones[i];
+
+		/* the incremental min is allegedly to discourage fallback */
 		min += z->pages_low;
 		if (z->free_pages > min) {
-			page = rmqueue(z, order);
+			page = rmqueue(z, order, count, pages);
 			if (page)
 				return page;
 		}
@@ -343,23 +362,22 @@
 
 	classzone->need_balance = 1;
 	mb();
+	/* we're somewhat low on memory, failed to find what we needed */
 	if (waitqueue_active(&kswapd_wait))
 		wake_up_interruptible(&kswapd_wait);
 
-	zone = zonelist->zones;
-	min = 1UL << order;
-	for (;;) {
+	/* Go through the zonelist again, taking __GFP_HIGH into account */
+	min = count << order;
+	for (i = 0; zones[i] != NULL; i++) {
 		unsigned long local_min;
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
+		zone_t *z = zones[i];
 
 		local_min = z->pages_min;
 		if (gfp_mask & __GFP_HIGH)
 			local_min >>= 2;
 		min += local_min;
 		if (z->free_pages > min) {
-			page = rmqueue(z, order);
+			page = rmqueue(z, order, count, pages);
 			if (page)
 				return page;
 		}
@@ -369,13 +387,11 @@
 
 rebalance:
 	if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
-		zone = zonelist->zones;
-		for (;;) {
-			zone_t *z = *(zone++);
-			if (!z)
-				break;
+		/* go through the zonelist yet again, ignoring mins */
+		for (i = 0; zones[i] != NULL; i++) {
+			zone_t *z = zones[i];
 
-			page = rmqueue(z, order);
+			page = rmqueue(z, order, count, pages);
 			if (page)
 				return page;
 		}
@@ -396,16 +412,14 @@
 	if (page)
 		return page;
 
-	zone = zonelist->zones;
-	min = 1UL << order;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
+	/* go through the zonelist yet one more time */
+	min = count << order;
+	for (i = 0; zones[i] != NULL; i++) {
+		zone_t *z = zones[i];
 
 		min += z->pages_min;
 		if (z->free_pages > min) {
-			page = rmqueue(z, order);
+			page = rmqueue(z, order, count, pages);
 			if (page)
 				return page;
 		}


* Re: alloc_pages_bulk
  2002-07-22 18:40 alloc_pages_bulk Martin J. Bligh
@ 2002-07-22 18:56 ` Benjamin LaHaise
  2002-07-22 20:05   ` alloc_pages_bulk Martin J. Bligh
  2002-07-22 20:26 ` alloc_pages_bulk Andrew Morton
  1 sibling, 1 reply; 6+ messages in thread
From: Benjamin LaHaise @ 2002-07-22 18:56 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Andrew Morton, Bill Irwin, linux-mm

On Mon, Jul 22, 2002 at 11:40:48AM -0700, Martin J. Bligh wrote:
> Below is a first cut at a bulk page allocator. This has had no testing
> whatsoever - it hasn't even been compiled. I just want to get some feedback
> on the approach, so if I get slapped, I'm not too far down a path I have to
> back out of. The __alloc_pages cleanup is also tacked on the end because
> I'm too lazy to create separate diff trees - sorry ;-)
> 
> Comments, opinions, abuse?

The inline for alloc_pages is wasteful: regparm on 386 allows us to pass 
3 arguments to a function without going to the stack; by making _alloc_pages 
take an additional argument which is a pointer, the stack manipulations and 
dereference add several instructions of bloat per alloc_pages inline.  Keep 
a separate entry point around for alloc_pages to avoid this.  Also, what 
effect does this have on single page allocations for single processor and 
dual processor systems?
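
(For concreteness, one shape that could take - a sketch only, and the
_alloc_pages_bulk name is made up here:)

	struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order);
	unsigned long _alloc_pages_bulk(unsigned int gfp_mask, unsigned int order,
			unsigned long count, struct page **pages);

	static inline struct page * alloc_pages(unsigned int gfp_mask,
			unsigned int order)
	{
		if (order >= MAX_ORDER)
			return NULL;
		/* two args, so with regparm both stay in registers and no
		 * struct page ** ever touches the stack on this path */
		return _alloc_pages(gfp_mask, order);
	}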

That said, why use an array?  You could just have alloc_pages return a linked 
list of pages.  This would allow you to make the allocation operation faster 
by doing a single snip of the portion of the list that has the required 
number of pages in the fast case.
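
(A rough sketch of that order-0 fast path - the function name is invented,
and the per-page buddy bitmap/MARK_USED bookkeeping that the real rmqueue()
does is glossed over entirely:)

	/* Move `count' pages off the zone's order-0 free list onto `out' in
	 * one splice.  Assumes the caller holds zone->lock and has already
	 * checked that at least `count' single pages are on the list. */
	static void rmqueue_snip(zone_t *zone, unsigned long count,
			struct list_head *out)
	{
		struct list_head *head = &zone->free_area[0].free_list;
		struct list_head *first = head->next, *last = first;
		unsigned long i;

		for (i = 1; i < count; i++)	/* find the count'th entry */
			last = last->next;

		/* detach first..last from the zone's list */
		head->next = last->next;
		last->next->prev = head;

		/* and splice the whole chunk onto `out' */
		out->next = first;
		first->prev = out;
		out->prev = last;
		last->next = out;

		zone->free_pages -= count;
	}

The caller would then walk `out' with list_entry(p, struct page, list) to
get at the individual pages.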

		-ben

* Re: alloc_pages_bulk
  2002-07-22 18:56 ` alloc_pages_bulk Benjamin LaHaise
@ 2002-07-22 20:05   ` Martin J. Bligh
  0 siblings, 0 replies; 6+ messages in thread
From: Martin J. Bligh @ 2002-07-22 20:05 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Andrew Morton, Bill Irwin, linux-mm

> The inline for alloc_pages is wasteful: 

If I understand what you're hitting on here, that was already an inline - that
wasn't my doing. If the order is constant, which it will be most of the time,
it all just disappears into a call to _alloc_pages. If we ripped that out and
turned _alloc_pages into alloc_pages, we'd lose that.

> regparm on 386 allows us to pass 
> 3 arguments to a function without going to the stack; by making _alloc_pages 
> take an additional argument which is a pointer, the stack manipulations and 
> dereference add several instructions of bloat per alloc_pages inline.  Keep 

OK, now that I can understand.

> a separate entry point around for alloc_pages to avoid this.  Also, what 
> effect does this have on single page allocations for single processor and 
> dual processor systems?

Unmeasured, though it will obviously need to be.
 
> That said, why use an array?  You could just have alloc_pages return a linked 
> list of pages.  This would allow you to make the allocation operation faster 
> by doing a single snip of the portion of the list that has the required 
> number of pages in the fast case.

By doing the allocation for that array in the top-level alloc_pages, it's just
a local that goes away before exit. If I allocate it lower down, I have to find
some way to track and free it, which gets messy. The reason I pushed it so high
up was to avoid code duplication (see the NUMA version of _alloc_pages), and I
thought the cost would be trivial. But as you've pointed out, I've crossed the
line onto the stack, which is obviously bad ... time for me to think things
over again.

Glad I mailed it out before getting too carried away - thanks very much for the
feedback ;-) I think I reorganised numa.c:_alloc_pages enough during the process
that the code duplication issue may be totally moot ...

M.


* Re: alloc_pages_bulk
  2002-07-22 18:40 alloc_pages_bulk Martin J. Bligh
  2002-07-22 18:56 ` alloc_pages_bulk Benjamin LaHaise
@ 2002-07-22 20:26 ` Andrew Morton
  2002-07-22 20:35   ` alloc_pages_bulk Martin J. Bligh
  1 sibling, 1 reply; 6+ messages in thread
From: Andrew Morton @ 2002-07-22 20:26 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Bill Irwin, linux-mm

"Martin J. Bligh" wrote:
> 
>                 min += z->pages_min;
>                 if (z->free_pages > min) {
> -                       page = rmqueue(z, order);
> +                       page = rmqueue(z, order, count, pages);

This won't compile because your rmqueue() no longer returns a
page pointer.  Minor point, but the weightier question is:
what to do when you fix it up?  If the attempt to allocate N
pages would cause a watermark to be crossed then should we

a) ignore it, and just return the N pages anyway?

   Has a risk that a zillion CPUs could all hit the same code
   at the same time and would exhaust the page reserves.

   That's not very worrying, really.

b) Stop allocating pages when we hit the watermark and return
   a partial result to the caller

c) Stop allocating pages at the watermark, run off and do
   some page reclaim and start allocating pages again.

   This implies that we should be releasing more than one
   page into current->local_pages.

   Probably we should be doing this anyway - just queue _all_ the
   liberated pages onto local_pages, then satisfy the request from that
   pool, then gang-free the remainder.  That's a straight-line speedup
   and lock-banging optimisation against the existing code.

d) Look at pages_min versus count before allocating any pages.  If
   the allocation of `count' pages would cause ->free_pages to fall
   below min, then go run some reclaim _first_, and then go grab a
   number of pages.  That number is min(count, nr_pages_we_just_reclaimed).
   So the caller may see a partial result.  (Rough sketch below.)

I think d), yes?
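
(For what it's worth, a rough sketch of what d) might look like at the
rmqueue() call site - try_to_reclaim_zone() is a made-up stand-in for
whatever reclaim path would really be used:)

	static unsigned long rmqueue_watermarked(zone_t *z, unsigned int order,
			unsigned int gfp_mask, unsigned long count,
			struct page **pages)
	{
		unsigned long want = count;

		/* would handing out `count' pages drop us below pages_min? */
		if (z->free_pages < (count << order) + z->pages_min) {
			unsigned long reclaimed = try_to_reclaim_zone(z, gfp_mask);

			if (reclaimed < want)
				want = reclaimed;  /* partial result for the caller */
		}
		return want ? rmqueue(z, order, want, pages) : 0;
	}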


* Re: alloc_pages_bulk
  2002-07-22 20:26 ` alloc_pages_bulk Andrew Morton
@ 2002-07-22 20:35   ` Martin J. Bligh
  2002-07-22 20:58     ` alloc_pages_bulk Andrew Morton
  0 siblings, 1 reply; 6+ messages in thread
From: Martin J. Bligh @ 2002-07-22 20:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Bill Irwin, linux-mm

>>                 min += z->pages_min;
>>                 if (z->free_pages > min) {
>> -                       page = rmqueue(z, order);
>> +                       page = rmqueue(z, order, count, pages);
> 
> This won't compile because your rmqueue() no longer returns a
> page pointer.  

Arse. Will fix that in a second.

> d) Look at pages_min versus count before allocating any pages.  If
>    the allocation of `count' pages would cause ->free_pages to fall
>    below min, then go run some reclaim _first_, and then go grab a
>    number of pages.  That number is min(count, nr_pages_we_just_reclaimed).
>    So the caller may see a partial result.
> 
> I think d), yes?

I was under the impression I was already doing that by this bit at the
start of the loop:

-       min = 1UL << order;
+       min = count << order;

Is that sufficient already (obviously my returns from rmqueue are total crap, 
but other than that ... ;-))
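
(To spell out the arithmetic with made-up numbers: for count = 32 order-0
pages and a zone with pages_low = 64, the first pass computes min = 32 + 64
= 96 and only calls rmqueue() once z->free_pages > 96 - i.e. the whole
request has to fit above the low watermark before anything is handed out.)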

M.


* Re: alloc_pages_bulk
  2002-07-22 20:35   ` alloc_pages_bulk Martin J. Bligh
@ 2002-07-22 20:58     ` Andrew Morton
  0 siblings, 0 replies; 6+ messages in thread
From: Andrew Morton @ 2002-07-22 20:58 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Bill Irwin, linux-mm

"Martin J. Bligh" wrote:
> 
> >>                 min += z->pages_min;
> >>                 if (z->free_pages > min) {
> >> -                       page = rmqueue(z, order);
> >> +                       page = rmqueue(z, order, count, pages);
> >
> > This won't compile because your rmqueue() no longer returns a
> > page pointer.
> 
> Arse. Will fix that in a second.

No rush.  Need to write some code which uses the interface
first...

> > d) Look at pages_min versus count before allocating any pages.  If
> >    the allocation of `count' pages would cause ->free_pages to fall
> >    below min, then go run some reclaim _first_, and then go grab a
> >    number of pages.  That number is min(count, nr_pages_we_just_reclaimed).
> >    So the caller may see a partial result.
> >
> > I think d), yes?
> 
> I was under the impression I was already doing that by this bit at the
> start of the loop:
> 
> -       min = 1UL << order;
> +       min = count << order;
> 
> Is that sufficient already (obviously my returns from rmqueue are total crap,
> but other than that ... ;-))

Ah.  That'll work.
