[PATCH]: Clean up of __alloc

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH]: Clean up of __alloc_pages
@ 2005-10-01 19:00 Seth, Rohit
  2005-10-02  3:09 ` Nick Piggin
  2005-10-03 15:34 ` Christoph Lameter
  0 siblings, 2 replies; 45+ messages in thread
From: Seth, Rohit @ 2005-10-01 19:00 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm, linux-kernel

	[PATCH]: Below is the cleaning up of __alloc_pages code.  Few 
		 things different from original version are

	1: remove the initial direct reclaim logic 
	2: order zero pages are now first looked into pcp list upfront
	3: GFP_HIGH pages are allowed to go little below low watermark sooner
	4: Search for free pages unconditionally after direct reclaim

	Signed-off-by: Rohit Seth <rohit.seth@intel.com>


--- linux-2.6.14-rc2-mm1.org/mm/page_alloc.c	2005-09-27 10:03:51.000000000 -0700
+++ linux-2.6.14-rc2-mm1/mm/page_alloc.c	2005-10-01 10:40:06.000000000 -0700
@@ -722,7 +722,8 @@
  * or two.
  */
 static struct page *
-buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags,
+			int replenish)
 {
 	unsigned long flags;
 	struct page *page = NULL;
@@ -733,7 +734,7 @@
 
 		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
-		if (pcp->count <= pcp->low)
+		if ((pcp->count <= pcp->low) && replenish)
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
 		if (pcp->count) {
@@ -743,9 +744,7 @@
 		}
 		local_irq_restore(flags);
 		put_cpu();
-	}
-
-	if (page == NULL) {
+	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -858,6 +857,44 @@
 }
 #endif /* CONFIG_PAGE_OWNER */
 
+/* This function get_page_from_freeliest loops through all the possible zones
+ * to find out if it can allocate a page.  can_try_harder can have following 
+ * values:
+ * -1 => No need to check for the watermarks.
+ *  0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
+ *  1 => Go far below the low watermark.  See zone_watermark_ok (RT TASK)
+ */
+
+static inline struct page *
+get_page_from_freelist(unsigned int __nocast gfp_mask, unsigned int order, 
+			struct zone **zones, int can_try_harder)
+{
+	struct zone *z;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(zones[0]);
+	int i;
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		if (!cpuset_zone_allowed(z, gfp_mask))
+			continue;
+
+		if ((can_try_harder >= 0) && 
+			(!zone_watermark_ok(z, order, z->pages_low,
+				       classzone_idx, can_try_harder, 
+				       gfp_mask & __GFP_HIGH)))
+			continue;
+
+		page = buffered_rmqueue(z, order, gfp_mask, 1);
+		if (page) 
+			break;
+	}
+	return page;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -866,15 +903,13 @@
 		struct zonelist *zonelist)
 {
 	const int wait = gfp_mask & __GFP_WAIT;
-	struct zone **zones, *z;
+	struct zone **zones, *z = NULL;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
-	int did_some_progress;
 
 	might_sleep_if(wait);
 
@@ -892,45 +927,23 @@
 		return NULL;
 	}
 
-	classzone_idx = zone_idx(zones[0]);
-
-restart:
-	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	/* First check if we can easily take a page from any pcp list
+	 * for 0 order pages.  Don't replenish any pcp list at this time.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-			continue;
-
-		/*
-		 * If the zone is to attempt early page reclaim then this loop
-		 * will try to reclaim pages and check the watermark a second
-		 * time before giving up and falling back to the next zone.
-		 */
-zone_reclaim_retry:
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0)) {
-			if (!do_reclaim)
-				continue;
-			else {
-				zone_reclaim(z, gfp_mask, order);
-				/* Only try reclaim once */
-				do_reclaim = 0;
-				goto zone_reclaim_retry;
-			}
+	if (order == 0) {
+		for (i = 0; (z = zones[i]) != NULL; i++) {
+			page = buffered_rmqueue(z, 0, gfp_mask, 0);
+			if (page) 
+				goto got_pg;
 		}
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
 	}
+restart:
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones, 0);
+	if (page)
+		goto got_pg;
 
 	for (i = 0; (z = zones[i]) != NULL; i++)
 		wakeup_kswapd(z, order);
-
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks to go deeper into reserves
@@ -939,19 +952,12 @@
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				       classzone_idx, can_try_harder,
-				       gfp_mask & __GFP_HIGH))
-			continue;
 
-		if (wait && !cpuset_zone_allowed(z, gfp_mask))
-			continue;
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	if (!wait)
+		page = get_page_from_freelist(gfp_mask, order, zones, 
+						can_try_harder);
+	if (page)
+		goto got_pg;
 
 	/* This allocation should allow future memory freeing. */
 
@@ -959,13 +965,9 @@
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			/* go through the zonelist yet again, ignoring mins */
-			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z, gfp_mask))
-					continue;
-				page = buffered_rmqueue(z, order, gfp_mask);
-				if (page)
-					goto got_pg;
-			}
+			page = get_page_from_freelist(gfp_mask, order, zones, -1);
+			if (page)
+				goto got_pg;
 		}
 		goto nopage;
 	}
@@ -982,47 +984,21 @@
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask);
+	i = try_to_free_pages(zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
 	cond_resched();
 
-	if (likely(did_some_progress)) {
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_min,
-					       classzone_idx, can_try_harder,
-					       gfp_mask & __GFP_HIGH))
-				continue;
-
-			if (!cpuset_zone_allowed(z, gfp_mask))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
-	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+	page = get_page_from_freelist(gfp_mask, order, zones,
+					can_try_harder);
+	if (page)
+		goto got_pg;
+	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		/*
-		 * Go through the zonelist yet one more time, keep
-		 * very high watermark here, this is only to catch
-		 * a parallel oom killing, we must fail if we're still
-		 * under heavy pressure.
+		 * Start the OOM for this scenario.
 		 */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_high,
-					       classzone_idx, 0, 0))
-				continue;
-
-			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
-
 		out_of_memory(gfp_mask, order);
 		goto restart;
 	}
@@ -1060,7 +1036,7 @@
 #ifdef CONFIG_PAGE_OWNER
 	set_page_owner(page, order, gfp_mask);
 #endif
-	zone_statistics(zonelist, z);
+	zone_statistics(zonelist, page_zone(page));
 	return page;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-01 19:00 [PATCH]: Clean up of __alloc_pages Seth, Rohit
@ 2005-10-02  3:09 ` Nick Piggin
  2005-10-03 16:50   ` Rohit Seth
  2005-10-03 15:34 ` Christoph Lameter
  1 sibling, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2005-10-02  3:09 UTC (permalink / raw)
  To: Seth, Rohit; +Cc: akpm, linux-mm, linux-kernel

Seth, Rohit wrote:
> 	[PATCH]: Below is the cleaning up of __alloc_pages code.  Few 
> 		 things different from original version are
> 
> 	1: remove the initial direct reclaim logic 
> 	2: order zero pages are now first looked into pcp list upfront
> 	3: GFP_HIGH pages are allowed to go little below low watermark sooner
> 	4: Search for free pages unconditionally after direct reclaim
> 
> 	Signed-off-by: Rohit Seth <rohit.seth@intel.com>
> 

Hi,

Seems pretty good at a quick glance.

Perhaps splitting it into 2 would be a good idea - ie. first
patch does the cleanup, second does the direct pcp list alloc.

Regarding the direct pcp list allocation - I think it is a good
idea, because we're currently already accounting pcp list pages
as being 'allocated' for the purposes of the reclaim watermarks.

Also, the structure is there to avoid touching cachelines whenever
possible so it makes sense to use it early here. Do you have any
performance numbers or allocation statistics (e.g. %pcp hits) to
show?

Also, I would really think about uninlining get_page_from_freelist,
and inlining buffered_rmqueue, so that the constant 'replenish'
argument can be propogated into buffered_rmqueue and should allow
for some nice optimisations. While not bloating the code too much
because your get_page_from_freelist becomes out of line.

Thanks,
Nick

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-02  3:09 ` Nick Piggin
@ 2005-10-03 16:50   ` Rohit Seth
  0 siblings, 0 replies; 45+ messages in thread
From: Rohit Seth @ 2005-10-03 16:50 UTC (permalink / raw)
  To: Nick Piggin; +Cc: akpm, linux-mm, linux-kernel

On Sun, 2005-10-02 at 13:09 +1000, Nick Piggin wrote:

> Perhaps splitting it into 2 would be a good idea - ie. first
> patch does the cleanup, second does the direct pcp list alloc.
> 
> Regarding the direct pcp list allocation - I think it is a good
> idea, because we're currently already accounting pcp list pages
> as being 'allocated' for the purposes of the reclaim watermarks.
> 

Right.  

> Also, the structure is there to avoid touching cachelines whenever
> possible so it makes sense to use it early here. Do you have any
> performance numbers or allocation statistics (e.g. %pcp hits) to
> show?
> 

No, I don't have any data at this point to share.

> Also, I would really think about uninlining get_page_from_freelist,
> and inlining buffered_rmqueue, so that the constant 'replenish'
> argument can be propogated into buffered_rmqueue and should allow
> for some nice optimisations. While not bloating the code too much
> because your get_page_from_freelist becomes out of line.

I will do that.

Thanks for your feedback,
-rohit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-01 19:00 [PATCH]: Clean up of __alloc_pages Seth, Rohit
  2005-10-02  3:09 ` Nick Piggin
@ 2005-10-03 15:34 ` Christoph Lameter
  2005-10-03 16:55   ` Rohit Seth
  1 sibling, 1 reply; 45+ messages in thread
From: Christoph Lameter @ 2005-10-03 15:34 UTC (permalink / raw)
  To: Seth, Rohit; +Cc: akpm, linux-mm, linux-kernel

On Sat, 1 Oct 2005, Seth, Rohit wrote:

> -				goto zone_reclaim_retry;
> -			}
> +	if (order == 0) {
> +		for (i = 0; (z = zones[i]) != NULL; i++) {
> +			page = buffered_rmqueue(z, 0, gfp_mask, 0);
> +			if (page) 
> +				goto got_pg;
>  		}
> -

This is checking all zones for pages on the pcp before going the more 
expensive route?

Seems that this removes the logic intended to prefer local 
allocations over remote pages present in the existing alloc_pages? There 
is the danger that this modification will lead to the allocation of remote 
pages even if local pages are available. Thus reducing performance.

I would suggest to just check the first zone's pcp instead of all zones.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-03 15:34 ` Christoph Lameter
@ 2005-10-03 16:55   ` Rohit Seth
  2005-10-03 16:57     ` Christoph Lameter
  0 siblings, 1 reply; 45+ messages in thread
From: Rohit Seth @ 2005-10-03 16:55 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: akpm, linux-mm, linux-kernel

On Mon, 2005-10-03 at 08:34 -0700, Christoph Lameter wrote:
> On Sat, 1 Oct 2005, Seth, Rohit wrote:
> 
> > -				goto zone_reclaim_retry;
> > -			}
> > +	if (order == 0) {
> > +		for (i = 0; (z = zones[i]) != NULL; i++) {
> > +			page = buffered_rmqueue(z, 0, gfp_mask, 0);
> > +			if (page) 
> > +				goto got_pg;
> >  		}
> > -
> 
> This is checking all zones for pages on the pcp before going the more 
> expensive route?
> 

That is right.

> Seems that this removes the logic intended to prefer local 
> allocations over remote pages present in the existing alloc_pages? There 
> is the danger that this modification will lead to the allocation of remote 
> pages even if local pages are available. Thus reducing performance.
> 

Good catch.  I will up level the cpuset check in buffered_rmqueue rather
then doing it in get_page_from_freelist.  That should retain the current
preferences for local pages.

> I would suggest to just check the first zone's pcp instead of all zones.
> 

Na. This for most cases will be ZONE_DMA pcp list having nothing much
most of the time.  And picking any other zone randomly will be exposed
to faulty behavior.

Thanks,
-rohit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-03 16:55   ` Rohit Seth
@ 2005-10-03 16:57     ` Christoph Lameter
  2005-10-03 17:48       ` Rohit Seth
  0 siblings, 1 reply; 45+ messages in thread
From: Christoph Lameter @ 2005-10-03 16:57 UTC (permalink / raw)
  To: Rohit Seth; +Cc: akpm, linux-mm, linux-kernel

On Mon, 3 Oct 2005, Rohit Seth wrote:

> > Seems that this removes the logic intended to prefer local 
> > allocations over remote pages present in the existing alloc_pages? There 
> > is the danger that this modification will lead to the allocation of remote 
> > pages even if local pages are available. Thus reducing performance.
> Good catch.  I will up level the cpuset check in buffered_rmqueue rather
> then doing it in get_page_from_freelist.  That should retain the current
> preferences for local pages.

This is not only the cpuset check. If there is memory available in an 
earlier zone then it needs to be taken regardless of later pcp's 
the zonelist containing pages. Otherwise we did not take the pages nearest 
to the requested node.

> > I would suggest to just check the first zone's pcp instead of all zones.
> > 
> 
> Na. This for most cases will be ZONE_DMA pcp list having nothing much
> most of the time.  And picking any other zone randomly will be exposed
> to faulty behavior.

Maybe only check the first node?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-03 16:57     ` Christoph Lameter
@ 2005-10-03 17:48       ` Rohit Seth
  2005-10-04 13:27         ` Andi Kleen
  0 siblings, 1 reply; 45+ messages in thread
From: Rohit Seth @ 2005-10-03 17:48 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: akpm, linux-mm, linux-kernel

On Mon, 2005-10-03 at 09:57 -0700, Christoph Lameter wrote:
> On Mon, 3 Oct 2005, Rohit Seth wrote:
> 
> > > Seems that this removes the logic intended to prefer local 
> > > allocations over remote pages present in the existing alloc_pages? There 
> > > is the danger that this modification will lead to the allocation of remote 
> > > pages even if local pages are available. Thus reducing performance.
> > Good catch.  I will up level the cpuset check in buffered_rmqueue rather
> > then doing it in get_page_from_freelist.  That should retain the current
> > preferences for local pages.
> 
> This is not only the cpuset check. If there is memory available in an 
> earlier zone then it needs to be taken regardless of later pcp's 
> the zonelist containing pages. Otherwise we did not take the pages nearest 
> to the requested node.
> 

Ah.  Okay.

> > > I would suggest to just check the first zone's pcp instead of all zones.
> > > 
> > 
> > Na. This for most cases will be ZONE_DMA pcp list having nothing much
> > most of the time.  And picking any other zone randomly will be exposed
> > to faulty behavior.
> 
> Maybe only check the first node?
> 

I think conceptually this ask for a new flag __GFP_NODEONLY that
indicate allocations to come from current node only. 

This definitely though means I will need to separate out the allocation
from pcp patch (as Nick suggested earlier).

-rohit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-03 17:48       ` Rohit Seth
@ 2005-10-04 13:27         ` Andi Kleen
  2005-10-04 16:26           ` Ray Bryant
  0 siblings, 1 reply; 45+ messages in thread
From: Andi Kleen @ 2005-10-04 13:27 UTC (permalink / raw)
  To: Rohit Seth; +Cc: akpm, linux-mm, linux-kernel

Rohit Seth <rohit.seth@intel.com> writes:
> 
> I think conceptually this ask for a new flag __GFP_NODEONLY that
> indicate allocations to come from current node only. 
> 
> This definitely though means I will need to separate out the allocation
> from pcp patch (as Nick suggested earlier).

This reminds me - the current logic is currently a bit suboptimal on
many NUMA systems. Often it would be better to be a bit more
aggressive at freeing memory (maybe do a very low overhead light try to
free pages) in the first node before falling back to other nodes. What
right now happens is that when you have even minor memory pressure
because e.g. you node is filled up with disk cache the local memory
affinity doesn't work too well anymore.

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-04 13:27         ` Andi Kleen
@ 2005-10-04 16:26           ` Ray Bryant
  2005-10-04 16:10             ` Martin J. Bligh
  0 siblings, 1 reply; 45+ messages in thread
From: Ray Bryant @ 2005-10-04 16:26 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Rohit Seth, akpm, linux-mm, linux-kernel

On Tuesday 04 October 2005 08:27, Andi Kleen wrote:
> Rohit Seth <rohit.seth@intel.com> writes:
> > I think conceptually this ask for a new flag __GFP_NODEONLY that
> > indicate allocations to come from current node only.
> >
> > This definitely though means I will need to separate out the allocation
> > from pcp patch (as Nick suggested earlier).
>
> This reminds me - the current logic is currently a bit suboptimal on
> many NUMA systems. Often it would be better to be a bit more
> aggressive at freeing memory (maybe do a very low overhead light try to
> free pages) in the first node before falling back to other nodes. What
> right now happens is that when you have even minor memory pressure
> because e.g. you node is filled up with disk cache the local memory
> affinity doesn't work too well anymore.
>
> -Andi
>
That's exactly what Martin Hick's additions to __alloc_pages() were trying to 
achieve.   However, we've never figured out how to make the "very low 
overhead light try to free pages" thing work with low enough overhead that it 
can be left on all of the time.    As soon as we make this the least bit more 
expensive, then this hurts those workloads (file servers being one example) 
who don't care about local, but who need the fastest possible allocations. 

This problem is often a showstopper on larger NUMA systems, at least for HPC 
type applications, where the inability to guarantee local storage allocation 
when it is requested can make the application run significantly slower.
-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-04 16:26           ` Ray Bryant
@ 2005-10-04 16:10             ` Martin J. Bligh
  2005-10-04 17:02               ` Ray Bryant
  0 siblings, 1 reply; 45+ messages in thread
From: Martin J. Bligh @ 2005-10-04 16:10 UTC (permalink / raw)
  To: Ray Bryant, Andi Kleen; +Cc: Rohit Seth, akpm, linux-mm, linux-kernel


--Ray Bryant <raybry@mpdtxmail.amd.com> wrote (on Tuesday, October 04, 2005 11:26:52 -0500):

> On Tuesday 04 October 2005 08:27, Andi Kleen wrote:
>> Rohit Seth <rohit.seth@intel.com> writes:
>> > I think conceptually this ask for a new flag __GFP_NODEONLY that
>> > indicate allocations to come from current node only.
>> > 
>> > This definitely though means I will need to separate out the allocation
>> > from pcp patch (as Nick suggested earlier).
>> 
>> This reminds me - the current logic is currently a bit suboptimal on
>> many NUMA systems. Often it would be better to be a bit more
>> aggressive at freeing memory (maybe do a very low overhead light try to
>> free pages) in the first node before falling back to other nodes. What
>> right now happens is that when you have even minor memory pressure
>> because e.g. you node is filled up with disk cache the local memory
>> affinity doesn't work too well anymore.
>> 
>> -Andi
>> 
> That's exactly what Martin Hick's additions to __alloc_pages() were trying to 
> achieve.   However, we've never figured out how to make the "very low 
> overhead light try to free pages" thing work with low enough overhead that it 
> can be left on all of the time.    As soon as we make this the least bit more 
> expensive, then this hurts those workloads (file servers being one example) 
> who don't care about local, but who need the fastest possible allocations. 
> 
> This problem is often a showstopper on larger NUMA systems, at least for HPC 
> type applications, where the inability to guarantee local storage allocation 
> when it is requested can make the application run significantly slower.

Can we not do some migration / more targeted pressure balancing in kswapd?
Ie if we had a constant measure of per-node pressure, we could notice an
imbalance, and start migrating the least recently used pages from the node
under most pressure to the node under least ...

M.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-04 16:10             ` Martin J. Bligh
@ 2005-10-04 17:02               ` Ray Bryant
  0 siblings, 0 replies; 45+ messages in thread
From: Ray Bryant @ 2005-10-04 17:02 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Andi Kleen, Rohit Seth, akpm, linux-mm, linux-kernel

On Tuesday 04 October 2005 11:10, Martin J. Bligh wrote:
> --Ray Bryant <raybry@mpdtxmail.amd.com> wrote (on Tuesday, October 04, 2005 
>
 <snip> 

> >
> > That's exactly what Martin Hick's additions to __alloc_pages() were
> > trying to achieve.   However, we've never figured out how to make the
> > "very low overhead light try to free pages" thing work with low enough
> > overhead that it can be left on all of the time.    As soon as we make
> > this the least bit more expensive, then this hurts those workloads (file
> > servers being one example) who don't care about local, but who need the
> > fastest possible allocations.
> >
> > This problem is often a showstopper on larger NUMA systems, at least for
> > HPC type applications, where the inability to guarantee local storage
> > allocation when it is requested can make the application run
> > significantly slower.
>
> Can we not do some migration / more targeted pressure balancing in kswapd?
> Ie if we had a constant measure of per-node pressure, we could notice an
> imbalance, and start migrating the least recently used pages from the node
> under most pressure to the node under least ...
>
> M.
>

Unfortunately, I think that anything that doesn't happen synchronously (with 
the original allocation request) happens to late.    In order to make this 
work we'd have to not only migrate the excess page cache pages off of the 
loaded nodes, but find the supposed-to-be-local pages that got allocated off 
node and migrate them back.   I'm not sure how to do all of that.  :-)

(Another way to think about this is that if we truly balanced page cache 
allocations across nodes via migration, then we could have equally as well 
originally allocated the page cache pages round-robin across the nodes and 
gotten the balancing for free.)

Another idea that I have been kicking around for some time is the notion of a
MPOL_LOCAL memory policy.    MPOL_LOCAL would be kind of like a dynamic 
MPOL_BIND in the sense that the policy would allow allocation only on a 
single node, but that signal node would be dynamically set to the current 
node (rather than statically set ala MPOL_BIND).    The result of such  an 
allocation policy would be that if a local allocation can't be made, then the 
current task will end up in synchronous page reclaim until some memory 
becomes free.  That would force clean page cache pages to be discarded.
(This idea is similar, in spirit, to Rohit's suggestion for __GFP_NODEONLY...)

The intent of this is for an otherwise mempolicy unaware application to be run 
under numactl and MPOL_LOCAL if it requires local storage allocations.
File server applications (or other applications that demand the fastest 
storage allocation without regard to local) would chose not to be run in such 
a fashion.

The down side of this is that a task that requests more memory than is 
available on a node (and is running under MPOL_LOCAL) could invoke the OOM 
handler.    We'd have to special case this under the OOM handler, I would 
imagine, although a similar problem may exist with MPOL_BIND at the present 
time, anyway.

> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Ray Bryant
AMD Performance Labs                   Austin, Tx
512-602-0038 (o)                 512-507-7807 (c)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* [PATCH]: Clean up of __alloc_pages
@ 2005-10-29  1:33 Rohit, Seth
  2005-10-29  2:33 ` Nick Piggin
                   ` (2 more replies)
  0 siblings, 3 replies; 45+ messages in thread
From: Rohit, Seth @ 2005-10-29  1:33 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm, linux-kernel

the only changes in this clean up are:

	1- remove the initial direct reclaim logic
	2- GFP_HIGH pages are allowed to go little below low watermark sooner
	3- Search for free pages unconditionally after direct reclaim

I've not added the logic of looking into PCPs first in this rev of patch.  I will send a
seperate patch for adding that support (needing extra logic for NUMA).

	Signed-off-by: Rohit Seth <rohit.seth@intel.com>


diff -Naru linux-2.6.14.org/mm/page_alloc.c linux-2.6.14/mm/page_alloc.c
--- linux-2.6.14.org/mm/page_alloc.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14/mm/page_alloc.c	2005-10-28 10:11:39.000000000 -0700
@@ -685,8 +685,8 @@
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static inline struct page *
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags, int replenish)
 {
 	unsigned long flags;
 	struct page *page = NULL;
@@ -697,7 +697,7 @@
 
 		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
-		if (pcp->count <= pcp->low)
+		if ((pcp->count <= pcp->low) && replenish)
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
 		if (pcp->count) {
@@ -707,9 +707,7 @@
 		}
 		local_irq_restore(flags);
 		put_cpu();
-	}
-
-	if (page == NULL) {
+	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -770,6 +768,44 @@
 	return 1;
 }
 
+/* get_page_from_freeliest loops through all the possible zones
+ * to find out if it can allocate a page.  can_try_harder can have following 
+ * values:
+ * -1 => No need to check for the watermarks.
+ *  0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
+ *  1 => Go far below the low watermark.  See zone_watermark_ok (RT TASK)
+ */
+
+static struct page *
+get_page_from_freelist(unsigned int __nocast gfp_mask, unsigned int order, 
+			struct zone **zones, int can_try_harder)
+{
+	struct zone *z;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(zones[0]);
+	int i;
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		if (!cpuset_zone_allowed(z, gfp_mask))
+			continue;
+
+		if ((can_try_harder >= 0) && 
+			(!zone_watermark_ok(z, order, z->pages_low,
+				       classzone_idx, can_try_harder, 
+				       gfp_mask & __GFP_HIGH)))
+			continue;
+
+		page = buffered_rmqueue(z, order, gfp_mask, 1);
+		if (page) 
+			break;
+	}
+	return page;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -778,15 +814,13 @@
 		struct zonelist *zonelist)
 {
 	const int wait = gfp_mask & __GFP_WAIT;
-	struct zone **zones, *z;
+	struct zone **zones, *z = NULL;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
-	int did_some_progress;
 
 	might_sleep_if(wait);
 
@@ -803,42 +837,10 @@
 		/* Should this ever happen?? */
 		return NULL;
 	}
-
-	classzone_idx = zone_idx(zones[0]);
-
 restart:
-	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
-	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-			continue;
-
-		/*
-		 * If the zone is to attempt early page reclaim then this loop
-		 * will try to reclaim pages and check the watermark a second
-		 * time before giving up and falling back to the next zone.
-		 */
-zone_reclaim_retry:
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0)) {
-			if (!do_reclaim)
-				continue;
-			else {
-				zone_reclaim(z, gfp_mask, order);
-				/* Only try reclaim once */
-				do_reclaim = 0;
-				goto zone_reclaim_retry;
-			}
-		}
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones, 0);
+	if (page)
+		goto got_pg;
 
 	for (i = 0; (z = zones[i]) != NULL; i++)
 		wakeup_kswapd(z, order);
@@ -851,19 +853,11 @@
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				       classzone_idx, can_try_harder,
-				       gfp_mask & __GFP_HIGH))
-			continue;
-
-		if (wait && !cpuset_zone_allowed(z, gfp_mask))
-			continue;
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	if (!wait)
+		page = get_page_from_freelist(gfp_mask, order, zones, 
+						can_try_harder);
+	if (page)
+		goto got_pg;
 
 	/* This allocation should allow future memory freeing. */
 
@@ -871,13 +865,9 @@
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			/* go through the zonelist yet again, ignoring mins */
-			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z, gfp_mask))
-					continue;
-				page = buffered_rmqueue(z, order, gfp_mask);
-				if (page)
-					goto got_pg;
-			}
+			page = get_page_from_freelist(gfp_mask, order, zones, -1);
+			if (page)
+				goto got_pg;
 		}
 		goto nopage;
 	}
@@ -894,47 +884,20 @@
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask);
+	i = try_to_free_pages(zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
 	cond_resched();
 
-	if (likely(did_some_progress)) {
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_min,
-					       classzone_idx, can_try_harder,
-					       gfp_mask & __GFP_HIGH))
-				continue;
-
-			if (!cpuset_zone_allowed(z, gfp_mask))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
-	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+	page = get_page_from_freelist(gfp_mask, order, zones, can_try_harder);
+	if (page)
+		goto got_pg;
+	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		/*
-		 * Go through the zonelist yet one more time, keep
-		 * very high watermark here, this is only to catch
-		 * a parallel oom killing, we must fail if we're still
-		 * under heavy pressure.
+		 * Start the OOM here.
 		 */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_high,
-					       classzone_idx, 0, 0))
-				continue;
-
-			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
-
 		out_of_memory(gfp_mask, order);
 		goto restart;
 	}
@@ -968,7 +931,7 @@
 	}
 	return NULL;
 got_pg:
-	zone_statistics(zonelist, z);
+	zone_statistics(zonelist, page_zone(page));
 	return page;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-29  1:33 Rohit, Seth
@ 2005-10-29  2:33 ` Nick Piggin
  2005-10-31 20:55   ` Rohit Seth
  2005-10-30  0:16 ` Paul Jackson
  2005-10-30  1:47 ` Paul Jackson
  2 siblings, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2005-10-29  2:33 UTC (permalink / raw)
  To: Rohit, Seth; +Cc: akpm, linux-mm, linux-kernel

Rohit, Seth wrote:
> the only changes in this clean up are:
> 

Looking good. I imagine it must be good for icache.
Man, the page allocator somehow turned unreadable since I last
looked at it! We will want this patch.

> 	1- remove the initial direct reclaim logic
> 	2- GFP_HIGH pages are allowed to go little below low watermark sooner

I don't think #2 is any good. The reason we don't check GFP_HIGH on
the first time round is because we simply want to kick kswapd at its
normal watermark - ie. it doesn't matter what kind of allocation this
is, kswapd should start at the same time no matter what.

If you don't do this, then a GFP_HIGH allocator can allocate right
down to its limit before it kicks kswapd, then it either will fail or
will have to do direct reclaim.

I would be inclined to simply add a int gfp_high argument to
get_page_from_freelist, which would also somewhat match zone_watermark_ok.

> 	3- Search for free pages unconditionally after direct reclaim
> 
> I've not added the logic of looking into PCPs first in this rev of patch.  I will send a
> seperate patch for adding that support (needing extra logic for NUMA).
> 
> 	Signed-off-by: Rohit Seth <rohit.seth@intel.com>
> 

One other comment below:

> +
> +static struct page *
> +get_page_from_freelist(unsigned int __nocast gfp_mask, unsigned int order, 
> +			struct zone **zones, int can_try_harder)
> +{
> +	struct zone *z;
> +	struct page *page = NULL;
> +	int classzone_idx = zone_idx(zones[0]);
> +	int i;
> +
> +	/*
> +	 * Go through the zonelist once, looking for a zone with enough free.
> +	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
> +	 */
> +	for (i = 0; (z = zones[i]) != NULL; i++) {
> +		if (!cpuset_zone_allowed(z, gfp_mask))
> +			continue;
> +
> +		if ((can_try_harder >= 0) && 
> +			(!zone_watermark_ok(z, order, z->pages_low,
> +				       classzone_idx, can_try_harder, 
> +				       gfp_mask & __GFP_HIGH)))
> +			continue;
> +
> +		page = buffered_rmqueue(z, order, gfp_mask, 1);
> +		if (page) 
> +			break;
> +	}
> +	return page;
> +}

[snip]

> @@ -968,7 +931,7 @@
>  	}
>  	return NULL;
>  got_pg:
> -	zone_statistics(zonelist, z);
> +	zone_statistics(zonelist, page_zone(page));
>  	return page;

How about moving the zone_statistics up into the 'if (page)'
test of get_page_from_freelist? This way we don't have to
evaluate page_zone().

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-29  2:33 ` Nick Piggin
@ 2005-10-31 20:55   ` Rohit Seth
  2005-11-01  1:14     ` Nick Piggin
  0 siblings, 1 reply; 45+ messages in thread
From: Rohit Seth @ 2005-10-31 20:55 UTC (permalink / raw)
  To: Nick Piggin; +Cc: akpm, linux-mm, linux-kernel

On Sat, 2005-10-29 at 12:33 +1000, Nick Piggin wrote:

> Rohit, Seth wrote:
> > the only changes in this clean up are:
> > 
> 
> Looking good. I imagine it must be good for icache.
> Man, the page allocator somehow turned unreadable since I last
> looked at it! We will want this patch.
> 

Thanks for your comments.

> > 	1- remove the initial direct reclaim logic
> > 	2- GFP_HIGH pages are allowed to go little below low watermark sooner
> 
> I don't think #2 is any good. The reason we don't check GFP_HIGH on
> the first time round is because we simply want to kick kswapd at its
> normal watermark - ie. it doesn't matter what kind of allocation this
> is, kswapd should start at the same time no matter what.
> 
> If you don't do this, then a GFP_HIGH allocator can allocate right
> down to its limit before it kicks kswapd, then it either will fail or
> will have to do direct reclaim.
> 

You are right if there are only GFP_HIGH requests coming in then the
allocation will go down to (min - min/2) before kicking in kswapd.
Though if the requester is not ready to wait, there is another good shot
at allocation succeed before we get into direct reclaim (and this is
happening based on can_try_harder flag).

> >
> >  got_pg:
> > -	zone_statistics(zonelist, z);
> > +	zone_statistics(zonelist, page_zone(page));
> >  	return page;
> 
> How about moving the zone_statistics up into the 'if (page)'
> test of get_page_from_freelist? This way we don't have to
> evaluate page_zone().
> 

Let us keep this as is for now.  Will revisit once after the
pcp_prefer_allocation patches get in place. 

Thanks,
-rohit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-31 20:55   ` Rohit Seth
@ 2005-11-01  1:14     ` Nick Piggin
  2005-11-04 18:15       ` Rohit Seth
  0 siblings, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2005-11-01  1:14 UTC (permalink / raw)
  To: Rohit Seth; +Cc: akpm, linux-mm, linux-kernel

Rohit Seth wrote:
> On Sat, 2005-10-29 at 12:33 +1000, Nick Piggin wrote:

>>If you don't do this, then a GFP_HIGH allocator can allocate right
>>down to its limit before it kicks kswapd, then it either will fail or
>>will have to do direct reclaim.
>>
> 
> 
> You are right if there are only GFP_HIGH requests coming in then the
> allocation will go down to (min - min/2) before kicking in kswapd.
> Though if the requester is not ready to wait, there is another good shot
> at allocation succeed before we get into direct reclaim (and this is
> happening based on can_try_harder flag).
> 

Still, it is a change in behaviour that I would rather not introduce
with a cleanup patch (and is something we don't want to introduce anyway).

So if you could fix that up it would be good.

>>How about moving the zone_statistics up into the 'if (page)'
>>test of get_page_from_freelist? This way we don't have to
>>evaluate page_zone().
>>
> 
> 
> Let us keep this as is for now.  Will revisit once after the
> pcp_prefer_allocation patches get in place. 
> 

Well page_zone is yet another cacheline that doesn't need to be touched,
and that is introduced by this patch. But the line is likely to be hot,
and get_page_from_freelist does not have the required 'zonelist' which
I didn't notice before.

So OK, revisit this later.

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-01  1:14     ` Nick Piggin
@ 2005-11-04 18:15       ` Rohit Seth
  2005-11-05  0:00         ` Nick Piggin
  0 siblings, 1 reply; 45+ messages in thread
From: Rohit Seth @ 2005-11-04 18:15 UTC (permalink / raw)
  To: Nick Piggin; +Cc: akpm, linux-mm, linux-kernel

On Tue, 2005-11-01 at 12:14 +1100, Nick Piggin wrote:
> Rohit Seth wrote:
> > On Sat, 2005-10-29 at 12:33 +1000, Nick Piggin wrote:
> 
> >>If you don't do this, then a GFP_HIGH allocator can allocate right
> >>down to its limit before it kicks kswapd, then it either will fail or
> >>will have to do direct reclaim.
> >>
> > 
> > 
> > You are right if there are only GFP_HIGH requests coming in then the
> > allocation will go down to (min - min/2) before kicking in kswapd.
> > Though if the requester is not ready to wait, there is another good shot
> > at allocation succeed before we get into direct reclaim (and this is
> > happening based on can_try_harder flag).
> > 
> 
> Still, it is a change in behaviour that I would rather not introduce
> with a cleanup patch (and is something we don't want to introduce anyway).
> 
> So if you could fix that up it would be good.
> 

Nick, sorry for not responding earlier.  

I agree that it is slight change in behavior from original.  I doubt
though it will impact any one in any negative way (may be for some
higher order allocations if at all). On a little positive side, less
frequent calls to kswapd for some cases and clear up the code a little
bit.

But I really don't want to get stuck here. The pcp traversal and
flushing is where I want to go next.  

Thanks,
-rohit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-04 18:15       ` Rohit Seth
@ 2005-11-05  0:00         ` Nick Piggin
  0 siblings, 0 replies; 45+ messages in thread
From: Nick Piggin @ 2005-11-05  0:00 UTC (permalink / raw)
  To: Rohit Seth; +Cc: akpm, linux-mm, linux-kernel

Rohit Seth wrote:

> 
> Nick, sorry for not responding earlier.  
> 

That's OK.

> I agree that it is slight change in behavior from original.  I doubt
> though it will impact any one in any negative way (may be for some
> higher order allocations if at all). On a little positive side, less
> frequent calls to kswapd for some cases and clear up the code a little
> bit.
> 

I really don't want a change of behaviour going in with this,
especially not one which I would want to revert anyway. But
don't get hung up with it - when you post your latest patch
I will make a patch for the changes I would like to see for it
and synch things up.

> But I really don't want to get stuck here. The pcp traversal and
> flushing is where I want to go next.  
> 

Sure, hope it goes well!

Nick

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-29  1:33 Rohit, Seth
  2005-10-29  2:33 ` Nick Piggin
@ 2005-10-30  0:16 ` Paul Jackson
  2005-10-31 19:09   ` Rohit Seth
  2005-11-05 17:09   ` Andi Kleen
  2005-10-30  1:47 ` Paul Jackson
  2 siblings, 2 replies; 45+ messages in thread
From: Paul Jackson @ 2005-10-30  0:16 UTC (permalink / raw)
  To: Rohit, Seth; +Cc: akpm, linux-mm, linux-kernel

Seth wroteL
> @@ -851,19 +853,11 @@
>  	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
>  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
>  	 */
> -	for (i = 0; (z = zones[i]) != NULL; i++) {
> -		if (!zone_watermark_ok(z, order, z->pages_min,
> -				       classzone_idx, can_try_harder,
> -				       gfp_mask & __GFP_HIGH))
> -			continue;
> -
> -		if (wait && !cpuset_zone_allowed(z, gfp_mask))
> -			continue;
> -
> -		page = buffered_rmqueue(z, order, gfp_mask);
> -		if (page)
> -			goto got_pg;
> -	}
> +	if (!wait)
> +		page = get_page_from_freelist(gfp_mask, order, zones, 
> +						can_try_harder);

Thanks for the clean-up work.  Good stuff.

I think you've changed the affect that the cpuset check has on the
above pass.

As you know, the above is the last chance we have for GFP_ATOMIC (can't
wait) allocations before getting into the oom_kill code.  The code had
been written to ignore cpuset constraints for GFP_ATOMIC (that is,
"!wait") allocations.  The intent is to allow taking GFP_ATOMIC memory
from any damn node we can find it on, rather than start killing.

Your change will call into get_page_from_freelist() in such cases,
where the cpuset check is still done.

I would be tempted instead to:
 1) pass 'can_try_harder' value of -1, instead of the the local value
    of 1 (which it certainly is, since we are in !wait code).
 2) condition the cpuset check in get_page_from_freelist() on
    can_try_harder being not equal to -1.

The item (2) -does- change the existing cpuset conditions as well,
allowing cpuset boundaries to be violated for the cases that would
"allow future memory freeing" (such as GFP_MEMALLOC or TIF_MEMDIE),
whereas until now, we did not allow violating cpuset conditions
for this.  But that is arguably a good change.

The following patch, on top of yours, shows what I have in mind here:

--- 2.6.14-rc5-mm1.orig/mm/page_alloc.c	2005-10-29 14:45:07.000000000 -0700
+++ 2.6.14-rc5-mm1/mm/page_alloc.c	2005-10-29 16:35:55.000000000 -0700
@@ -777,7 +777,7 @@ get_page_from_freelist(unsigned int __no
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!cpuset_zone_allowed(z, gfp_mask))
+		if (can_try_harder != -1 && !cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		if ((can_try_harder >= 0) && 
@@ -940,8 +940,7 @@ restart:
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	if (!wait)
-		page = get_page_from_freelist(gfp_mask, order, zones, 
-						can_try_harder);
+		page = get_page_from_freelist(gfp_mask, order, zones, -1);
 	if (page)
 		goto got_pg;
 

However ...
 1) The above also would change __GFP_HIGH and rt allocations to also
    ignore mins entirely, instead of just going deeper into reserves,
    on this pass.  That is likely not good.
 2) I can't get my head wrapped around Nick's reply to this patch.

So my above patch is no doubt flawed in one or more ways.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  0:16 ` Paul Jackson
@ 2005-10-31 19:09   ` Rohit Seth
  2005-11-05 17:09   ` Andi Kleen
  1 sibling, 0 replies; 45+ messages in thread
From: Rohit Seth @ 2005-10-31 19:09 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, linux-mm, linux-kernel

On Sat, 2005-10-29 at 17:16 -0700, Paul Jackson wrote:
> Seth wroteL
> > @@ -851,19 +853,11 @@
> >  	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
> >  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
> >  	 */
> > -	for (i = 0; (z = zones[i]) != NULL; i++) {
> > -		if (!zone_watermark_ok(z, order, z->pages_min,
> > -				       classzone_idx, can_try_harder,
> > -				       gfp_mask & __GFP_HIGH))
> > -			continue;
> > -
> > -		if (wait && !cpuset_zone_allowed(z, gfp_mask))
> > -			continue;
> > -
> > -		page = buffered_rmqueue(z, order, gfp_mask);
> > -		if (page)
> > -			goto got_pg;
> > -	}
> > +	if (!wait)
> > +		page = get_page_from_freelist(gfp_mask, order, zones, 
> > +						can_try_harder);
> 
> Thanks for the clean-up work.  Good stuff.
> 
> I think you've changed the affect that the cpuset check has on the
> above pass.
> 
> As you know, the above is the last chance we have for GFP_ATOMIC (can't
> wait) allocations before getting into the oom_kill code.  The code had
> been written to ignore cpuset constraints for GFP_ATOMIC (that is,
> "!wait") allocations.  The intent is to allow taking GFP_ATOMIC memory
> from any damn node we can find it on, rather than start killing.
> 
> Your change will call into get_page_from_freelist() in such cases,
> where the cpuset check is still done.


Shooo....I will fix it.

-rohit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  0:16 ` Paul Jackson
  2005-10-31 19:09   ` Rohit Seth
@ 2005-11-05 17:09   ` Andi Kleen
  2005-11-06  4:18     ` Paul Jackson
  1 sibling, 1 reply; 45+ messages in thread
From: Andi Kleen @ 2005-11-05 17:09 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, linux-mm, linux-kernel

Paul Jackson <pj@sgi.com> writes:


Regarding cpumemset and alloc_pages. I recently rechecked
the cpumemset hooks in there and I must say they turned out
to be quite worse

In hindsight it would have been better to use the "generate
zonelists for all possible nodes" approach you originally had
and which I rejected (sorry) 

That would make the code much cleaner and faster.
Maybe it's not too late to switch for that?

If not then the fast path definitely needs to be tuned a bit.

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-05 17:09   ` Andi Kleen
@ 2005-11-06  4:18     ` Paul Jackson
  2005-11-06 17:35       ` Andi Kleen
  0 siblings, 1 reply; 45+ messages in thread
From: Paul Jackson @ 2005-11-06  4:18 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-mm, linux-kernel

> Regarding cpumemset and alloc_pages. I recently rechecked
> the cpumemset hooks in there and I must say they turned out
> to be quite worse
> 
> In hindsight it would have been better to use the "generate
> zonelists for all possible nodes" approach you originally had
> and which I rejected (sorry) 
> 
> That would make the code much cleaner and faster.
> Maybe it's not too late to switch for that?
> 
> If not then the fast path definitely needs to be tuned a bit

(What Andi calls cpumemsets are what others call cpusets;
 I faked him out with a threatened name change that never
 happened. -pj)

Sure, this could be changed to the other scheme if we wanted.  Only
internals of the cpuset, mempolicy, and page_alloc code are affected.

I see only small differences in performance, between the two choices,
in most cases.

The current code in the kernel does the following:
  1) The cpuset_update_current_mems_allowed() calls in the
     various alloc_page*() paths in mm/mempolicy.c:
	* take the task_lock spinlock on the current task
	* compare the tasks mems_generation to that in its cpuset
  2) The first cpuset_zone_allowed() call or two, near the top
     of mm/page_alloc.c:__alloc_pages():
	* check in_interrupt()
	* check if the zone's node is set in task->mems_allowed

This task_lock spinlock, or some performance equivalent, is, I think,
unavoidable.

An essential difference between mempolicy and cpusets is that cpusets
supports outside manipulation of a tasks memory placement.  Sooner or
later, the task has to synchronize with these outside changes, and a
task_lock(current) in the path to __alloc_pages() is the lowest cost
way I could find to do this.

The one functional difference I know of affects mempolicy MPOL_BIND,
not cpusets.  This subset zonelist proposal provided higher quality
zonelists to MPOL_BIND, properly sorted to start the search on the
faulting node and working outward.

===

The alternative approach that Andi refers to, along with some analysis
of its tradeoffs, is visible at:

  http://lkml.org/lkml/2004/8/2/256
  Date: Mon, 2 Aug 2004 16:35:06 -0700 (PDT)
  Subject: [PATCH] subset zonelists and big numa friendly mempolicy MPOL_MBIND

Instead of passing a zonelist to __alloc_pages() with all the
systems nodes, and skipping over those zones on nodes not allowed,
this alternative would build custom zonelists for each cpuset and
each MPOL_BIND mempolicy, containing just the allowed nodes.  A set
of zonelists is needed, one for each node in the cpuset, each one
sorted differently so that we search for memory on the nodes closest
to the faulting node first.  This set of zonelists has to be packed
in, or otherwise they could eat alot of memory on large systems,
since they are N-squared in size, for the number of nodes in the set.

This should make Christoph Lameter happer too.  He has been complaining
that the zonelists used by MPOL_BIND are not properly sorted to
look on the nodes nearest the current one first.  The fancy subset
zonelists of my proposal provide properly sorted zonelists for
MPOL_BIND policies.

Work that would need to be done for this alternative:

 * recode __GFP_HARDWALL to replace the initial short zonelist
   with a longer zonelist if need be, and restart __alloc_pages.
 * convert some code in this old patch from bitmaps to nodemasks
 * add the cpuset hooks to these subset zonelists (the above patch
   just has the infrastructure and MPOL_BIND hooks)

Comparative benefits between the current implementation and this
alternative:

  Neutral:
    Systems with just one or a few memory nodes on the system won't
    notice much.

  Neutral:
    When there's enough free memory on the current node to meet the
    request, it won't make much difference.  A free page on the
    current node is found quickly enough, either way.

  Alternative wins:
    The limit case of searching across many nodes on a large system
    that is short of memory should save a few CPU cycles with this
    alternative.

  Alternative wins:
    The instruction cache footprint of __alloc_pages, when one is
    short of memory, should be a little smaller with this alternative.

  Current way wins:
    The kernel text size would likely grow a bit, as the code to pack
    zonelists is a couple pages.

  Alternative wins:
    It provides properly sorted zonelists for MPOL_BIND.

  Current way wins:
    It's less work - we've already got it.

  Alternative wins:
    Code esthetics - I haven't coded it, but I suspect that the
    alternative has a smaller, prettier impact on __alloc_pages().
    From the above Aug 2004 patch, one can see that the alternative
    definitely reduces the code size of mm/mempolicy.c, by removing
    some specialized MPOL_BIND code.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-06  4:18     ` Paul Jackson
@ 2005-11-06 17:35       ` Andi Kleen
  2005-11-06 20:49         ` Paul Jackson
  0 siblings, 1 reply; 45+ messages in thread
From: Andi Kleen @ 2005-11-06 17:35 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, linux-mm, linux-kernel

On Sunday 06 November 2005 05:18, Paul Jackson wrote:

> The current code in the kernel does the following:
>   1) The cpuset_update_current_mems_allowed() calls in the
>      various alloc_page*() paths in mm/mempolicy.c:
> 	* take the task_lock spinlock on the current task

That needs to go imho. At least for the common "cpusets compiled in, but not 
used" case. We already have too many locks. Even with cpusets - why
can't you test that generation lockless?

> 	* compare the tasks mems_generation to that in its cpuset

>   2) The first cpuset_zone_allowed() call or two, near the top
>      of mm/page_alloc.c:__alloc_pages():
> 	* check in_interrupt()
> 	* check if the zone's node is set in task->mems_allowed

It's also too slow for the common "compiled in but not used" case.

I did a simple patch for that - at least skip all the loops when there
is no cpuset - but it got lost in a disk crash.

> This task_lock spinlock, or some performance equivalent, is, I think,
> unavoidable.

why?

>
> An essential difference between mempolicy and cpusets is that cpusets
> supports outside manipulation of a tasks memory placement.  

Yes, that is their big problem (there is a reason I'm always complaining
about attempts to change mempolicy externally) 

But actually some mempolicy can be already changed outside the task - 
using VMA policy.

> Sooner or 
> later, the task has to synchronize with these outside changes, and a
> task_lock(current) in the path to __alloc_pages() is the lowest cost
> way I could find to do this.

Why can't it just test that generation number lockless after testing
if there is a cpuset at all?

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-06 17:35       ` Andi Kleen
@ 2005-11-06 20:49         ` Paul Jackson
  2005-11-07  2:57           ` Nick Piggin
  0 siblings, 1 reply; 45+ messages in thread
From: Paul Jackson @ 2005-11-06 20:49 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-mm, linux-kernel

Andi wrote:
> > The current code in the kernel does the following:
> >   1) The cpuset_update_current_mems_allowed() calls in the
> >      various alloc_page*() paths in mm/mempolicy.c:
> > 	* take the task_lock spinlock on the current task
> 
> That needs to go imho.

The comment for refresh_mems(), where this is happening, explains
why this lock is needed:

 * The task_lock() is required to dereference current->cpuset safely.
 * Without it, we could pick up the pointer value of current->cpuset
 * in one instruction, and then attach_task could give us a different
 * cpuset, and then the cpuset we had could be removed and freed,
 * and then on our next instruction, we could dereference a no longer
 * valid cpuset pointer to get its mems_generation field.

Hmmm ... on second thought ... damn ... you're right.

I can just flat out remove that task_lock - without penalty.

It's *OK* if I dereference a no longer valid cpuset pointer to get
its (used to be) mems_generation field.  Either that field will have
already changed, or it won't.

    If it has changed to some other value (doesn't matter what value)
    I will realize (quite correctly) that my cpuset has changed out
    from under me, and go into the slow path code to lock things down
    and update things as need be.

    If it has not changed yet (far the more likely case) then I will
    have just missed by the skin of my teeth catching this cpuset
    change this time around.  Which is just fine.  I will catch it next
    time this task allocates memory, for sure, as I will be using the
    new cpuset pointer value the next time, for sure.

Patch coming soon to remove that task_lock/task_unlock.

Thanks.

> At least for the common "cpusets compiled in, but not 
> used" case.

Hmmm ... that comment got me to thinking too.  I could have a global
kernel flag "cpusets_have_been_used", initialized to zero (0), set to
one (1) anytime someone creates or modifies a cpuset.  Then most of my
hooks, in places like fork and exit, could collapse to really trivial
inline lockless code if cpusets have not been used yet since the system
booted.

Would you be interested in seeing such a patch?

This should even apply to the more interesting case of
cpuset_zone_allowed(), which is called for each iteration of each
zone loop in __alloc_pages().  That would be a nice win for those
systems making no active use of cpusets.

===

What about the other part of your initial question - replacing the
cpuset_zone_allowed() hooks in __alloc_pages() with code to build
custom zonelists?

I did my best in my previous reply to spell out the pluses and minuses
of that change and what work would be involved.

What's your recommendation on whether to do that or not?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-06 20:49         ` Paul Jackson
@ 2005-11-07  2:57           ` Nick Piggin
  2005-11-07  3:42             ` Andi Kleen
  2005-11-07  3:44             ` Paul Jackson
  0 siblings, 2 replies; 45+ messages in thread
From: Nick Piggin @ 2005-11-07  2:57 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Andi Kleen, akpm, linux-mm, linux-kernel

Paul Jackson wrote:
> Andi wrote:
> 
>>>The current code in the kernel does the following:
>>>  1) The cpuset_update_current_mems_allowed() calls in the
>>>     various alloc_page*() paths in mm/mempolicy.c:
>>>	* take the task_lock spinlock on the current task
>>
>>That needs to go imho.
> 
> 
> The comment for refresh_mems(), where this is happening, explains
> why this lock is needed:
> 
>  * The task_lock() is required to dereference current->cpuset safely.
>  * Without it, we could pick up the pointer value of current->cpuset
>  * in one instruction, and then attach_task could give us a different
>  * cpuset, and then the cpuset we had could be removed and freed,
>  * and then on our next instruction, we could dereference a no longer
>  * valid cpuset pointer to get its mems_generation field.
> 
> Hmmm ... on second thought ... damn ... you're right.
> 
> I can just flat out remove that task_lock - without penalty.
> 
> It's *OK* if I dereference a no longer valid cpuset pointer to get
> its (used to be) mems_generation field.  Either that field will have
> already changed, or it won't.
> 

I don't think so because if the cpuset can be freed, then its page
might be unmapped from the kernel address space if use-after-free
debugging is turned on. And this is a use after free :)

Also, it may be reused for something else far into the future without
having its value changed - is this OK?

Anyway, I think the first problem is a showstopper. I'd look into
Hugh's SLAB_DESTROY_BY_RCU for this, which sounds like a good fit
if you need to go down this path (although I only had a quick skim
over the cpusets code).

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-07  2:57           ` Nick Piggin
@ 2005-11-07  3:42             ` Andi Kleen
  2005-11-07  4:37               ` Paul Jackson
  2005-11-07  3:44             ` Paul Jackson
  1 sibling, 1 reply; 45+ messages in thread
From: Andi Kleen @ 2005-11-07  3:42 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Paul Jackson, akpm, linux-mm, linux-kernel

On Monday 07 November 2005 03:57, Nick Piggin wrote:

>
> I don't think so because if the cpuset can be freed, then its page
> might be unmapped from the kernel address space if use-after-free
> debugging is turned on. And this is a use after free :)

RCU could be used to avoid that. Just only free it in a RCU callback.

-Andi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-07  3:42             ` Andi Kleen
@ 2005-11-07  4:37               ` Paul Jackson
  2005-11-07  6:08                 ` Nick Piggin
  0 siblings, 1 reply; 45+ messages in thread
From: Paul Jackson @ 2005-11-07  4:37 UTC (permalink / raw)
  To: Andi Kleen; +Cc: nickpiggin, akpm, linux-mm, linux-kernel

Nick wrote:
> Anyway, I think the first problem is a showstopper. I'd look into
> Hugh's SLAB_DESTROY_BY_RCU for this ...

Andi wrote:
> RCU could be used to avoid that. Just only free it in a RCU callback.


... looking at mm/slab.h and rcupdate.h for the first time ... 

Would this mean that I had to put the cpuset structures on their own
slab cache, marked SLAB_DESTROY_BY_RCU?

And is the pair of operators:
  task_lock(current), task_unlock(current)
really that much worse than the pair of operatots
  rcu_read_lock, rcu_read_unlock
which apparently reduce to:
  preempt_disable, preempt_enable

Would this work something like the following?  Say task A, on processor
AP, is trying to dereference its cpuset pointer, while task B, on
processor BP, is trying hard to destroy that cpuset. Then if task A
wraps its reference in <rcu_read_lock, rcu_read_unlock>, this will keep
the RCU freeing of that memory from completing, until interrupts on AP
are re-enabled.

For that matter, if I just put cpuset structs in their own slab
cache, would that be sufficient.

  Nick - Does use-after-free debugging even catch use of objects
	 returned to their slab cache?

What about the other suggestions, Andi:
 1) subset zonelists (which you asked to reconsider)
 2) a kernel flag "cpusets_have_been_used" flag to short circuit
    cpuset logic on systems not using cpusets.



-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-07  4:37               ` Paul Jackson
@ 2005-11-07  6:08                 ` Nick Piggin
  2005-11-07  9:46                   ` Paul Jackson
  0 siblings, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2005-11-07  6:08 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Andi Kleen, akpm, linux-mm, linux-kernel

Paul Jackson wrote:
> Nick wrote:
> 
>>Anyway, I think the first problem is a showstopper. I'd look into
>>Hugh's SLAB_DESTROY_BY_RCU for this ...
> 
> 
> Andi wrote:
> 
>>RCU could be used to avoid that. Just only free it in a RCU callback.
> 
> 
> 
> ... looking at mm/slab.h and rcupdate.h for the first time ... 
> 

Yeah, take a look at rmap.c as well, and some of the comments in
changelogs if you need a better feel for it.

Basically SLAB_DESTROY_BY_RCU will allow the entries to be freed
back to the slab for reuse, but will not allow the slab caches to
be freed back to the page allocator inside rcu readside.

So your cpusets may be reused, but only as new cpusets. This should
be no problem at all for you.

> Would this mean that I had to put the cpuset structures on their own
> slab cache, marked SLAB_DESTROY_BY_RCU?
> 
> And is the pair of operators:
>   task_lock(current), task_unlock(current)
> really that much worse than the pair of operatots
>   rcu_read_lock, rcu_read_unlock
> which apparently reduce to:
>   preempt_disable, preempt_enable
> 

You may also have to be careful about memory ordering when setting
a pointer which may be concurrently dereferenced by another CPU so
that stale data doesn't get picked up.

The set side needs an rcu_assign_pointer, and the dereference side
needs rcu_dereference. Unless you either don't care about races, or
already have the correct barriers in place. But it is better to be
safe.

> Would this work something like the following?  Say task A, on processor
> AP, is trying to dereference its cpuset pointer, while task B, on
> processor BP, is trying hard to destroy that cpuset. Then if task A
> wraps its reference in <rcu_read_lock, rcu_read_unlock>, this will keep
> the RCU freeing of that memory from completing, until interrupts on AP
> are re-enabled.
> 

Sounds like it should work.

> For that matter, if I just put cpuset structs in their own slab
> cache, would that be sufficient.
> 

No, because the slab caches can get freed back into the general
page allocator at any time.

>   Nick - Does use-after-free debugging even catch use of objects
> 	 returned to their slab cache?
> 

Yes (slab debugging catches write-after-free at least, I believe),
however there are exceptions made for RCU freed slab caches. That
is: it is acceptable to access a freed RCU slab object, especially
if you only read it (writes need to be more careful, but they're
possible in some situations).

> What about the other suggestions, Andi:
>  1) subset zonelists (which you asked to reconsider)
>  2) a kernel flag "cpusets_have_been_used" flag to short circuit
>     cpuset logic on systems not using cpusets.
> 

Not too sure at present. I think #1 might be a good idea though
it would be a bigger change. #2 again might be a good hack for
the time being, although it would be nice to try to get the same
performance from the normal cpuset fastpath.

My RCU suggestion was mainly an idea to get around your immediate
problem with a lockless fastpath, rather than advocating it over
any of the alternatives.

Thanks,
Nick

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-07  6:08                 ` Nick Piggin
@ 2005-11-07  9:46                   ` Paul Jackson
  2005-11-07 10:17                     ` Nick Piggin
  0 siblings, 1 reply; 45+ messages in thread
From: Paul Jackson @ 2005-11-07  9:46 UTC (permalink / raw)
  To: Nick Piggin; +Cc: ak, akpm, linux-mm, linux-kernel

Nick wrote:
> Yeah, take a look at rmap.c as well, and some of the comments in
> changelogs if you need a better feel for it.

Ok - thanks.


> So your cpusets may be reused, but only as new cpusets. This should
> be no problem at all for you.

Correct - should be no problem.


> > And is the pair of operators:
> >   task_lock(current), task_unlock(current)
> > really that much worse than the pair of operators
> >   ...
> >   preempt_disable, preempt_enable

That part still surprises me a little.  Is there enough difference in
the performance between:

  1) task_lock, which is a spinlock on current->alloc_lock and
  2) rcu_read_lock, which is .preempt_count++; barrier()

to justify a separate slab cache for cpusets and a little more code?

For all I know (not much) the task_lock might actually be cheaper ;).


> You may also have to be careful about memory ordering when setting
> a pointer which may be concurrently dereferenced by another CPU so
> that stale data doesn't get picked up.
> 
> The set side needs an rcu_assign_pointer, and the dereference side
> needs rcu_dereference. Unless you either don't care about races,

I don't think I care ...  I'm just sampling task->cpuset->mems_generation,
looking for it to change.  Sooner or later, after it changes, I will get
an accurate read of it, realized it changed, and immediately down a
cpuset semaphore and reread all values of interest.

The semaphore down means doing an atomic_dec_return(), which imposes
a memory barrier, right?


> My RCU suggestion was mainly an idea to get around your immediate
> problem with a lockless fastpath, rather than advocating it over
> any of the alternatives.

Understood.  Thanks for your comments on the alternatives - they
seem reasonable.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-07  9:46                   ` Paul Jackson
@ 2005-11-07 10:17                     ` Nick Piggin
  2005-11-07 14:41                       ` Paul Jackson
  0 siblings, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2005-11-07 10:17 UTC (permalink / raw)
  To: Paul Jackson; +Cc: ak, akpm, linux-mm, linux-kernel

Paul Jackson wrote:
> Nick wrote:

>>>And is the pair of operators:
>>>  task_lock(current), task_unlock(current)
>>>really that much worse than the pair of operators
>>>  ...
>>>  preempt_disable, preempt_enable
> 
> 
> That part still surprises me a little.  Is there enough difference in
> the performance between:
> 
>   1) task_lock, which is a spinlock on current->alloc_lock and
>   2) rcu_read_lock, which is .preempt_count++; barrier()
> 
> to justify a separate slab cache for cpusets and a little more code?
> 
> For all I know (not much) the task_lock might actually be cheaper ;).
> 

But on a preempt kernel the spinlock must disable preempt as well!

Not to mention that a spinlock is an atomic op (though that is getting
cheaper these days) + 2 memory barriers (getting more expensive).

> The semaphore down means doing an atomic_dec_return(), which imposes
> a memory barrier, right?
> 

Yep.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-07 10:17                     ` Nick Piggin
@ 2005-11-07 14:41                       ` Paul Jackson
  0 siblings, 0 replies; 45+ messages in thread
From: Paul Jackson @ 2005-11-07 14:41 UTC (permalink / raw)
  To: Nick Piggin; +Cc: ak, akpm, linux-mm, linux-kernel

Ok ... so a spinlock (task_lock) costs one barrier and one atomic more
than an rcu_read_lock (on a preempt kernel where both spinlock and
rcu_read_lock cost a preempt), or something like that.

Thanks, Nick.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-11-07  2:57           ` Nick Piggin
  2005-11-07  3:42             ` Andi Kleen
@ 2005-11-07  3:44             ` Paul Jackson
  1 sibling, 0 replies; 45+ messages in thread
From: Paul Jackson @ 2005-11-07  3:44 UTC (permalink / raw)
  To: Nick Piggin; +Cc: ak, akpm, linux-mm, linux-kernel

Nick wrote:
> I don't think so because if the cpuset can be freed, then its page
> might be unmapped from the kernel address space if use-after-free
> debugging is turned on. And this is a use after free :)

Yup - that is a showstopper.  If dereferencing a stale pointer, even if
one doesn't really care what is read, is a no-no, then this is a no-no.

Thanks, Nick, for catching this.

This puts more value on the other idea I had -  a global kernel flag
"cpusets_have_been_used", that could be used to short circuit all the
cpuset hooks on systems that never mucked with cpusets.

For any lurkers wondering why I am chasing stale pointers when I don't
care what I read, it's like this.  Essentially, the task doing this
read is looking for an asychronous incoming level triggered signal
(going from the two mems_generations being equal to them being unequal),
that in this case is coming in at about the same time we are sampling
for it.  Whether we realize this time that the signal came in, or
don't realize it until the next time we sample, doesn't really matter
to us.  One way or the other, we'll see it, for sure the next sample if
not this one.  So the details of what happened on this read (so long as
no one got annoyed that we tried to chase a stale pointer) don't really
matter.  Unfortunately, Nick reminds us that someone will get annoyed.
Oh well.

> Also, it may be reused for something else far into the future without
> having its value changed - is this OK?

That part would be ok.  If I failed to realize that the underlying
cpuset had changed this time through __alloc_pages(), I would see it
next time, when I picked up a fresh and useful copy of my task->cpuset
pointer, having long forgotten my stale copy.  My stale cpuset pointer
only had a lifetime of a couple machine instructions.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-29  1:33 Rohit, Seth
  2005-10-29  2:33 ` Nick Piggin
  2005-10-30  0:16 ` Paul Jackson
@ 2005-10-30  1:47 ` Paul Jackson
  2005-10-30  2:01   ` Nick Piggin
  2005-10-31 21:20   ` Rohit Seth
  2 siblings, 2 replies; 45+ messages in thread
From: Paul Jackson @ 2005-10-30  1:47 UTC (permalink / raw)
  To: Rohit, Seth; +Cc: akpm, linux-mm, linux-kernel

A couple more items:
 1) Lets try for a consistent use of type "gfp_t" for gfp_mask.
 2) The can_try_harder flag values were driving me nuts.
 3) The "inline" you added to buffered_rmqueue() blew up my compile.
 4) The return from try_to_free_pages() was put in "i" for no evident reason.
 5) I have no clue what the replenish flag you added to buffered_rmqueue does.

You patch has:
> can_try_harder can have following 
>  * values:
>  * -1 => No need to check for the watermarks.
>  *  0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
>  *  1 => Go far below the low watermark.  See zone_watermark_ok (RT TASK)

Later on, you have an inequality test on this value:
	if ((can_try_harder >= 0)
and a non-zero test:
	if (can_try_harder)

That's three magic values, not even in increasing order of "how hard
one should try", tested a couple of different ways that requires
absorbing the complete details of the three values and their ordering
before one can read the code.

I'd like to use a enum type, to put names to these 3 values, and I'd
like to put them in order of increasing desperation.

The following patch tweaks the gfp_mask type, makes can_try_harder
an enum, removes the inline from buffered_rmqueue(), doesn't stash
the return value of try_to_free_pages(), and removes buffered_rmqueue's
replenish flag.  It applies on top of your patch.  If you like it, I'd
be happiest if you picked it up and incorporated it into your work.

I compiled it once, never tested or booted.

Signed-off-by: Paul Jackson <pj@sgi.com>

---

 include/linux/mmzone.h |    9 ++++++++-
 mm/page_alloc.c        |   39 +++++++++++++++++++--------------------
 mm/vmscan.c            |    9 ++++++---
 3 files changed, 33 insertions(+), 24 deletions(-)

--- 2.6.14-rc5-mm1-cpuset-patches.orig/include/linux/mmzone.h	2005-10-25 09:37:45.154588634 -0700
+++ 2.6.14-rc5-mm1-cpuset-patches/include/linux/mmzone.h	2005-10-29 17:58:28.433716118 -0700
@@ -330,8 +330,15 @@ void get_zone_counts(unsigned long *acti
 			unsigned long *free);
 void build_all_zonelists(void);
 void wakeup_kswapd(struct zone *zone, int order);
+
+typedef enum {
+  dont_try_harder,	/* Don't go much below low watermark (GFP_HIGH) */
+  try_harder,		/* Go far below low watermark (RT TASK) */
+  try_really_hard	/* Ignore all watermarks */
+} can_try_harder_t;
+
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		int alloc_type, int can_try_harder, int gfp_high);
+	int alloc_type, can_try_harder_t can_try_harder, int gfp_high);
 
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
--- 2.6.14-rc5-mm1-cpuset-patches.orig/mm/page_alloc.c	2005-10-29 14:44:55.371576062 -0700
+++ 2.6.14-rc5-mm1-cpuset-patches/mm/page_alloc.c	2005-10-29 18:40:41.358953388 -0700
@@ -713,8 +713,8 @@ static inline void prep_zero_page(struct
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-struct inline page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags, int replenish)
+struct page *
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
 	struct page *page = NULL;
@@ -725,7 +725,7 @@ buffered_rmqueue(struct zone *zone, int 
 
 		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
-		if ((pcp->count <= pcp->low) && replenish)
+		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
 		if (pcp->count) {
@@ -756,16 +756,12 @@ buffered_rmqueue(struct zone *zone, int 
 }
 
 /* get_page_from_freeliest loops through all the possible zones
- * to find out if it can allocate a page.  can_try_harder can have following 
- * values:
- * -1 => No need to check for the watermarks.
- *  0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
- *  1 => Go far below the low watermark.  See zone_watermark_ok (RT TASK)
+ * to find out if it can allocate a page.
  */
 
 static struct page *
-get_page_from_freelist(unsigned int __nocast gfp_mask, unsigned int order, 
-			struct zone **zones, int can_try_harder)
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+			struct zone **zones, can_try_harder_t can_try_harder)
 {
 	struct zone *z;
 	struct page *page = NULL;
@@ -780,13 +776,13 @@ get_page_from_freelist(unsigned int __no
 		if (!cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
-		if ((can_try_harder >= 0) && 
+		if ((can_try_harder < try_really_hard) &&
 			(!zone_watermark_ok(z, order, z->pages_low,
 				       classzone_idx, can_try_harder, 
 				       gfp_mask & __GFP_HIGH)))
 			continue;
 
-		page = buffered_rmqueue(z, order, gfp_mask, 1);
+		page = buffered_rmqueue(z, order, gfp_mask);
 		if (page) 
 			break;
 	}
@@ -798,7 +794,7 @@ get_page_from_freelist(unsigned int __no
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int can_try_harder, int gfp_high)
+	int classzone_idx, can_try_harder_t can_try_harder, int gfp_high)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
@@ -806,7 +802,7 @@ int zone_watermark_ok(struct zone *z, in
 
 	if (gfp_high)
 		min -= min / 2;
-	if (can_try_harder)
+	if (can_try_harder >= try_harder)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -878,7 +874,7 @@ static inline void __stack_trace(struct 
 }
 
 static inline void set_page_owner(struct page *page,
-			unsigned int order, unsigned int gfp_mask)
+			unsigned int order, gfp_t gfp_mask)
 {
 	unsigned long address, bp;
 #ifdef CONFIG_X86_64
@@ -906,7 +902,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
 	struct task_struct *p = current;
 	int i;
 	int do_retry;
-	int can_try_harder;
+	can_try_harder_t can_try_harder = dont_try_harder;
 
 	might_sleep_if(wait);
 
@@ -915,7 +911,8 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
 	 * cannot run direct reclaim, or is the caller has realtime scheduling
 	 * policy
 	 */
-	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+		can_try_harder = try_harder;
 
 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
@@ -924,7 +921,8 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
 		return NULL;
 	}
 restart:
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones, 0);
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones,
+					dont_try_harder);
 	if (page)
 		goto got_pg;
 
@@ -951,7 +949,8 @@ restart:
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			/* go through the zonelist yet again, ignoring mins */
-			page = get_page_from_freelist(gfp_mask, order, zones, -1);
+			page = get_page_from_freelist(gfp_mask, order, zones,
+						try_really_hard);
 			if (page)
 				goto got_pg;
 		}
@@ -970,7 +969,7 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	i = try_to_free_pages(zones, gfp_mask);
+	try_to_free_pages(zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
--- 2.6.14-rc5-mm1-cpuset-patches.orig/mm/vmscan.c	2005-10-25 09:38:47.225573361 -0700
+++ 2.6.14-rc5-mm1-cpuset-patches/mm/vmscan.c	2005-10-29 17:21:39.268886029 -0700
@@ -1082,7 +1082,8 @@ loop_again:
 					continue;
 
 				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, 0, 0, 0)) {
+						zone->pages_high, 0, 0,
+						dont_try_harder)) {
 					end_zone = i;
 					goto scan;
 				}
@@ -1119,7 +1120,8 @@ scan:
 
 			if (nr_pages == 0) {	/* Not software suspend */
 				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, end_zone, 0, 0))
+						zone->pages_high, end_zone,
+						0, dont_try_harder))
 					all_zones_ok = 0;
 			}
 			zone->temp_priority = priority;
@@ -1267,7 +1269,8 @@ void wakeup_kswapd(struct zone *zone, in
 		return;
 
 	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0))
+	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0,
+					dont_try_harder))
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;


-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  1:47 ` Paul Jackson
@ 2005-10-30  2:01   ` Nick Piggin
  2005-10-30  2:19     ` Paul Jackson
  2005-10-30  2:26     ` Paul Jackson
  2005-10-31 21:20   ` Rohit Seth
  1 sibling, 2 replies; 45+ messages in thread
From: Nick Piggin @ 2005-10-30  2:01 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Rohit, Seth, akpm, linux-mm, linux-kernel

Paul Jackson wrote:
> A couple more items:
>  1) Lets try for a consistent use of type "gfp_t" for gfp_mask.
>  2) The can_try_harder flag values were driving me nuts.

Please instead use a second argument 'gfp_high', which will nicely
match zone_watermark_ok, and use that consistently when converting
__alloc_pages code to use get_page_from_freelist. Ie. keep current
behaviour.

That would solve my issues with the patch.

>  3) The "inline" you added to buffered_rmqueue() blew up my compile.

How? Why? This should be solved because a future possible feature
(early allocation from pcp lists) will want inlining in order to
propogate the constant 'replenish' argument.

>  4) The return from try_to_free_pages() was put in "i" for no evident reason.
>  5) I have no clue what the replenish flag you added to buffered_rmqueue does.
> 

Slight patch mis-split I guess. For the cleanup patch, you're right,
this should be removed.

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  2:01   ` Nick Piggin
@ 2005-10-30  2:19     ` Paul Jackson
  2005-10-30  2:32       ` Nick Piggin
  2005-10-30  2:26     ` Paul Jackson
  1 sibling, 1 reply; 45+ messages in thread
From: Paul Jackson @ 2005-10-30  2:19 UTC (permalink / raw)
  To: Nick Piggin; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

Nick, replying to pj:
> >  3) The "inline" you added to buffered_rmqueue() blew up my compile.
> 
> How? Why? This should be solved because a future possible feature
> (early allocation from pcp lists) will want inlining in order to
> propogate the constant 'replenish' argument.

If I make the following change to a copy of mm/page_alloc.c:

=================================
--- 2.6.14-rc5-mm1-cpuset-patches.orig/mm/page_alloc.c  2005-10-29 19:04:13.745641793 -0700
+++ 2.6.14-rc5-mm1-cpuset-patches/mm/page_alloc.c       2005-10-29 19:04:03.085367810 -0700
@@ -713,7 +713,7 @@ static inline void prep_zero_page(struct
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-struct page *
+struct inline page *
 buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
=================================

Then it goes from compiling ok, to failing with 61 lines of
error output, beginning with:

mm/page_alloc.c:716: error: parse error before "inline"
mm/page_alloc.c:721: error: `gfp_flags' undeclared here (not in a function)
mm/page_alloc.c:723: error: parse error before "if"
mm/page_alloc.c:726: warning: type defaults to `int' in declaration of `pcp'
mm/page_alloc.c:726: error: `zone' undeclared here (not in a function)
mm/page_alloc.c:726: error: braced-group within expression allowed only inside a function
mm/page_alloc.c:726: warning: data definition has no type or storage class
mm/page_alloc.c:726: warning: type defaults to `int' in declaration of `debug_smp_processor_id'
mm/page_alloc.c:726: warning: function declaration isn't a prototype
mm/page_alloc.c:726: error: conflicting types for `debug_smp_processor_id'
include/linux/smp.h:131: error: previous declaration of `debug_smp_processor_id'          

This is gcc 3.3.3, compiling sn2_defconfig on and for an SN2.

Perhaps "inline struct page *" would work better than "struct inline page *" ?
... yes ... that fixes my compiler complaints.

Also ... buffered_rmqueue() is a rather large function to be inlining.
And if it is inlined, then are you expecting to also have an out of
line copy, for use by the call to it from mm/swap_prefetch.c
prefetch_get_page()?

Adding the 'inline' keyword increases my kernel text size by
1448 bytes, for the extra copy of this code used inline from
the call to it from mm/page_alloc.c:get_page_from_freelist().
Is that really worth it?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  2:19     ` Paul Jackson
@ 2005-10-30  2:32       ` Nick Piggin
  2005-10-30  3:06         ` Paul Jackson
  0 siblings, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2005-10-30  2:32 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

Paul Jackson wrote:
> Nick, replying to pj:
> 
>>> 3) The "inline" you added to buffered_rmqueue() blew up my compile.
>>
>>How? Why? This should be solved because a future possible feature
>>(early allocation from pcp lists) will want inlining in order to
>>propogate the constant 'replenish' argument.
> 
> 
> 
> Perhaps "inline struct page *" would work better than "struct inline page *" ?
> ... yes ... that fixes my compiler complaints.
> 

Ah, yep.

> Also ... buffered_rmqueue() is a rather large function to be inlining.

It is, however there would only be 2 calls, and one I think would
also have a constant 0 for "order".

Though yeah, it may be better split another way. For this patch,
it shouldn't matter because it is static and will only have one
callsite so should be inlined anyway.

> And if it is inlined, then are you expecting to also have an out of
> line copy, for use by the call to it from mm/swap_prefetch.c
> prefetch_get_page()?
> 

No, that shouldn't be there though.

> Adding the 'inline' keyword increases my kernel text size by
> 1448 bytes, for the extra copy of this code used inline from
> the call to it from mm/page_alloc.c:get_page_from_freelist().
> Is that really worth it?
> 

Hmm, where is the other callsite? (sorry I don't have a copy
of -mm handy so I'm just looking at 2.6).

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  2:32       ` Nick Piggin
@ 2005-10-30  3:06         ` Paul Jackson
  2005-10-30  3:53           ` Nick Piggin
  0 siblings, 1 reply; 45+ messages in thread
From: Paul Jackson @ 2005-10-30  3:06 UTC (permalink / raw)
  To: Nick Piggin; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

Nick, replying to pj:
> > And if it is inlined, then are you expecting to also have an out of
> > line copy, for use by the call to it from mm/swap_prefetch.c
> > prefetch_get_page()?
> > 
> 
> No, that shouldn't be there though.
> 
> > Adding the 'inline' keyword increases my kernel text size by
> > 1448 bytes, for the extra copy of this code used inline from
> > the call to it from mm/page_alloc.c:get_page_from_freelist().
> > Is that really worth it?
> > 
> 
> Hmm, where is the other callsite? 

The other callsite is mm/swap_prefetch.c:prefetch_get_page(), from Con
Kolivas's mm-implement-swap-prefetching.patch patch in *-mm, dated
about six days ago.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  3:06         ` Paul Jackson
@ 2005-10-30  3:53           ` Nick Piggin
  0 siblings, 0 replies; 45+ messages in thread
From: Nick Piggin @ 2005-10-30  3:53 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

Paul Jackson wrote:
> Nick, replying to pj:
> 

>>Hmm, where is the other callsite? 
> 
> 
> The other callsite is mm/swap_prefetch.c:prefetch_get_page(), from Con
> Kolivas's mm-implement-swap-prefetching.patch patch in *-mm, dated
> about six days ago.
> 

OK, I haven't looked at those patches really. I think some of that
stuff should go into page_alloc.c and I'd prefer to keep
buffered_rmqueue static.

But no matter for the cleanup patch at hand: let's leave the inline
off, and the compiler will do the right thing if it is static and
there is just a single call site (and I think newer gccs will do
function versioning if there are constant arguments).

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  2:01   ` Nick Piggin
  2005-10-30  2:19     ` Paul Jackson
@ 2005-10-30  2:26     ` Paul Jackson
  2005-10-30  2:36       ` Nick Piggin
  1 sibling, 1 reply; 45+ messages in thread
From: Paul Jackson @ 2005-10-30  2:26 UTC (permalink / raw)
  To: Nick Piggin; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

> >  2) The can_try_harder flag values were driving me nuts.
> 
> Please instead use a second argument 'gfp_high', which will nicely
> match zone_watermark_ok, and use that consistently when converting
> __alloc_pages code to use get_page_from_freelist. Ie. keep current
> behaviour.

Well ... I still don't understand what you're suggesting, so I
guess I will have to wait for an actual patch incorporating it.

Are you also objecting to converting "can_try_harder" to an
enum, and getting the values in order of desperation?  If so,
I don't why you object.

And there is still the issue that I don't think cpuset constraints
should be applied in the last attempt before oom_killing for
GFP_ATOMIC requests.

I will await the next version of the patch, and see if it meets
my concerns.  I am missing a couple too many clues to add more
at this point.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  2:26     ` Paul Jackson
@ 2005-10-30  2:36       ` Nick Piggin
  2005-10-30  3:09         ` Paul Jackson
  0 siblings, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2005-10-30  2:36 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

Paul Jackson wrote:
>>> 2) The can_try_harder flag values were driving me nuts.
>>
>>Please instead use a second argument 'gfp_high', which will nicely
>>match zone_watermark_ok, and use that consistently when converting
>>__alloc_pages code to use get_page_from_freelist. Ie. keep current
>>behaviour.
> 
> 
> Well ... I still don't understand what you're suggesting, so I
> guess I will have to wait for an actual patch incorporating it.
> 

See how can_try_harder and gfp_high is used currently. They
are simple boolean values and are easily derived from parameters
passed into __alloc_pages.

> Are you also objecting to converting "can_try_harder" to an
> enum, and getting the values in order of desperation?  If so,
> I don't why you object.
> 

Because then to get current behaviour you would have to add
branches to get the correct enum value.

> And there is still the issue that I don't think cpuset constraints
> should be applied in the last attempt before oom_killing for
> GFP_ATOMIC requests.
> 

Sure.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  2:36       ` Nick Piggin
@ 2005-10-30  3:09         ` Paul Jackson
  2005-10-30  3:55           ` Nick Piggin
  0 siblings, 1 reply; 45+ messages in thread
From: Paul Jackson @ 2005-10-30  3:09 UTC (permalink / raw)
  To: Nick Piggin; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

Nick wrote:
> See how can_try_harder and gfp_high is used currently. 

Ah - by "current" you meant in Linus's or Andrew's tree,
not as in Seth's current patch.  Since they are booleans,
rather than tri-values, using an enum is overkill.  Ok.

Now I'm one less clue short of understanding.  Thanks.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  3:09         ` Paul Jackson
@ 2005-10-30  3:55           ` Nick Piggin
  2005-10-30  4:11             ` Paul Jackson
  0 siblings, 1 reply; 45+ messages in thread
From: Nick Piggin @ 2005-10-30  3:55 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

Paul Jackson wrote:
> Nick wrote:
> 
>>See how can_try_harder and gfp_high is used currently. 
> 
> 
> Ah - by "current" you meant in Linus's or Andrew's tree,
> not as in Seth's current patch.  Since they are booleans,
> rather than tri-values, using an enum is overkill.  Ok.
> 

Yup.

> Now I'm one less clue short of understanding.  Thanks.
> 

I'll be more constructive next time round, and provide an
actual patch to address any of my remaining concerns after
this latest round of feedback.

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  3:55           ` Nick Piggin
@ 2005-10-30  4:11             ` Paul Jackson
  0 siblings, 0 replies; 45+ messages in thread
From: Paul Jackson @ 2005-10-30  4:11 UTC (permalink / raw)
  To: Nick Piggin; +Cc: rohit.seth, akpm, linux-mm, linux-kernel

> I'll be more constructive next time round ...

That's no problem.  If Seth can figure this out and
produce a good next round of patch, that's sufficient.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-30  1:47 ` Paul Jackson
  2005-10-30  2:01   ` Nick Piggin
@ 2005-10-31 21:20   ` Rohit Seth
  2005-10-31 21:28     ` Paul Jackson
  1 sibling, 1 reply; 45+ messages in thread
From: Rohit Seth @ 2005-10-31 21:20 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, linux-mm, linux-kernel

On Sat, 2005-10-29 at 18:47 -0700, Paul Jackson wrote:
> A couple more items:
>  1) Lets try for a consistent use of type "gfp_t" for gfp_mask.
>  2) The can_try_harder flag values were driving me nuts.

Not sure why?  Let me see if some new values could better articulate the
meaning.  Currently if value is < 0 then don't check the watermarks.
When we do check for watermarks, then the value of 1 indicates that it
could go below minimum value.

>  3) The "inline" you added to buffered_rmqueue() blew up my compile.

I will remove the inline based on your and Nick emails.  Though my patch
had inline before the struct....

-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static inline struct page *
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags, int
replenish)

...so that shouldn't have caused any problem.

>  4) The return from try_to_free_pages() was put in "i" for no evident reason.

Will be fixed.

>  5) I have no clue what the replenish flag you added to buffered_rmqueue does.
> 

A bit futuristic.  Will need it when pcp allocations gets checked first
(as Nick also mentioned).  Will remove it for now.

> You patch has:
> > can_try_harder can have following 
> >  * values:
> >  * -1 => No need to check for the watermarks.
> >  *  0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
> >  *  1 => Go far below the low watermark.  See zone_watermark_ok (RT TASK)
> 
> Later on, you have an inequality test on this value:
> 	if ((can_try_harder >= 0)
> and a non-zero test:
> 	if (can_try_harder)

The last line is from zone_watermark_ok.  The first check is in
get_page_from_freelist.  There is no (can_try_harder) check for this
flag in that function.

> 
> That's three magic values, not even in increasing order of "how hard
> one should try", tested a couple of different ways that requires
> absorbing the complete details of the three values and their ordering
> before one can read the code.
> 



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH]: Clean up of __alloc_pages
  2005-10-31 21:20   ` Rohit Seth
@ 2005-10-31 21:28     ` Paul Jackson
  0 siblings, 0 replies; 45+ messages in thread
From: Paul Jackson @ 2005-10-31 21:28 UTC (permalink / raw)
  To: Rohit Seth; +Cc: akpm, linux-mm, linux-kernel, Nick Piggin

Rohit wrote:
> Not sure why?  Let me see if some new values could better articulate the
> meaning. 

See also Nick's comments, before going too far.  He was advocating
just using binary flags and adding a gfp_high flag, or something
like that.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

* RE: [PATCH]: Clean up of __alloc_pages
@ 2005-11-05  1:57 Seth, Rohit
  0 siblings, 0 replies; 45+ messages in thread
From: Seth, Rohit @ 2005-11-05  1:57 UTC (permalink / raw)
  To: Nick Piggin; +Cc: akpm, linux-mm, linux-kernel

From: Nick Piggin [mailto:nickpiggin@yahoo.com.au] 


>I really don't want a change of behaviour going in with this,
>especially not one which I would want to revert anyway. But
>don't get hung up with it - when you post your latest patch
>I will make a patch for the changes I would like to see for it
>and synch things up.


Well, those reverts are what I'm trying to avoid ;-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 45+ messages in thread

end of thread, other threads:[~2005-11-07 14:41 UTC | newest]

Thread overview: 45+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-10-01 19:00 [PATCH]: Clean up of __alloc_pages Seth, Rohit
2005-10-02  3:09 ` Nick Piggin
2005-10-03 16:50   ` Rohit Seth
2005-10-03 15:34 ` Christoph Lameter
2005-10-03 16:55   ` Rohit Seth
2005-10-03 16:57     ` Christoph Lameter
2005-10-03 17:48       ` Rohit Seth
2005-10-04 13:27         ` Andi Kleen
2005-10-04 16:26           ` Ray Bryant
2005-10-04 16:10             ` Martin J. Bligh
2005-10-04 17:02               ` Ray Bryant
2005-10-29  1:33 Rohit, Seth
2005-10-29  2:33 ` Nick Piggin
2005-10-31 20:55   ` Rohit Seth
2005-11-01  1:14     ` Nick Piggin
2005-11-04 18:15       ` Rohit Seth
2005-11-05  0:00         ` Nick Piggin
2005-10-30  0:16 ` Paul Jackson
2005-10-31 19:09   ` Rohit Seth
2005-11-05 17:09   ` Andi Kleen
2005-11-06  4:18     ` Paul Jackson
2005-11-06 17:35       ` Andi Kleen
2005-11-06 20:49         ` Paul Jackson
2005-11-07  2:57           ` Nick Piggin
2005-11-07  3:42             ` Andi Kleen
2005-11-07  4:37               ` Paul Jackson
2005-11-07  6:08                 ` Nick Piggin
2005-11-07  9:46                   ` Paul Jackson
2005-11-07 10:17                     ` Nick Piggin
2005-11-07 14:41                       ` Paul Jackson
2005-11-07  3:44             ` Paul Jackson
2005-10-30  1:47 ` Paul Jackson
2005-10-30  2:01   ` Nick Piggin
2005-10-30  2:19     ` Paul Jackson
2005-10-30  2:32       ` Nick Piggin
2005-10-30  3:06         ` Paul Jackson
2005-10-30  3:53           ` Nick Piggin
2005-10-30  2:26     ` Paul Jackson
2005-10-30  2:36       ` Nick Piggin
2005-10-30  3:09         ` Paul Jackson
2005-10-30  3:55           ` Nick Piggin
2005-10-30  4:11             ` Paul Jackson
2005-10-31 21:20   ` Rohit Seth
2005-10-31 21:28     ` Paul Jackson
2005-11-05  1:57 Seth, Rohit

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox