linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH]: Cleanup of __alloc_pages
@ 2005-11-08  1:43 Rohit, Seth
  2005-11-08  1:53 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Rohit, Seth @ 2005-11-08  1:43 UTC (permalink / raw)
  To: akpm, torvalds; +Cc: linux-mm, linux-kernel

	Signed-off-by: Rohit Seth <rohit.seth@intel.com>


--- linux-2.6.14.org/mm/page_alloc.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14/mm/page_alloc.c	2005-11-07 09:37:45.000000000 -0800
@@ -707,9 +707,7 @@
 		}
 		local_irq_restore(flags);
 		put_cpu();
-	}
-
-	if (page == NULL) {
+	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -770,6 +768,45 @@
 	return 1;
 }
 
+/* get_page_from_freeliest loops through all the possible zones
+ * to find out if it can allocate a page.  can_try_harder can have following
+ * values:
+ * -1 => No need to check for the watermarks.
+ *  0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
+ *  1 => Go far below the low watermark.  See zone_watermark_ok (RT TASK)
+ *
+ * cpuset check is not performed when the skip_cpuset_chk flag is set.
+ */
+
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zone **zones, 
+			int can_try_harder, int skip_cpuset_chk)
+{
+	struct zone *z;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(zones[0]);
+	int i;
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		if (!skip_cpuset_chk && (!cpuset_zone_allowed(z, gfp_mask)))
+			continue;
+		if ((can_try_harder >= 0) &&
+			(!zone_watermark_ok(z, order, z->pages_low,
+				       classzone_idx, can_try_harder,
+				       gfp_mask & __GFP_HIGH)))
+			continue;
+
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			break;
+	}
+	return page;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -778,15 +815,13 @@
 		struct zonelist *zonelist)
 {
 	const int wait = gfp_mask & __GFP_WAIT;
-	struct zone **zones, *z;
+	struct zone **zones, *z = NULL;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int classzone_idx;
 	int do_retry;
 	int can_try_harder;
-	int did_some_progress;
 
 	might_sleep_if(wait);
 
@@ -803,42 +838,10 @@
 		/* Should this ever happen?? */
 		return NULL;
 	}
-
-	classzone_idx = zone_idx(zones[0]);
-
 restart:
-	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
-	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-			continue;
-
-		/*
-		 * If the zone is to attempt early page reclaim then this loop
-		 * will try to reclaim pages and check the watermark a second
-		 * time before giving up and falling back to the next zone.
-		 */
-zone_reclaim_retry:
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0)) {
-			if (!do_reclaim)
-				continue;
-			else {
-				zone_reclaim(z, gfp_mask, order);
-				/* Only try reclaim once */
-				do_reclaim = 0;
-				goto zone_reclaim_retry;
-			}
-		}
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones, 0, 0);
+	if (page)
+		goto got_pg;
 
 	for (i = 0; (z = zones[i]) != NULL; i++)
 		wakeup_kswapd(z, order);
@@ -851,19 +854,11 @@
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				       classzone_idx, can_try_harder,
-				       gfp_mask & __GFP_HIGH))
-			continue;
-
-		if (wait && !cpuset_zone_allowed(z, gfp_mask))
-			continue;
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	if (!wait)
+		page = get_page_from_freelist(gfp_mask, order, zones,
+						can_try_harder, 1);
+	if (page)
+		goto got_pg;
 
 	/* This allocation should allow future memory freeing. */
 
@@ -871,13 +866,9 @@
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			/* go through the zonelist yet again, ignoring mins */
-			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z, gfp_mask))
-					continue;
-				page = buffered_rmqueue(z, order, gfp_mask);
-				if (page)
-					goto got_pg;
-			}
+			page = get_page_from_freelist(gfp_mask, order, zones, -1, 1);
+			if (page)
+				goto got_pg;
 		}
 		goto nopage;
 	}
@@ -894,47 +885,20 @@
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask);
+	try_to_free_pages(zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
 
 	cond_resched();
 
-	if (likely(did_some_progress)) {
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_min,
-					       classzone_idx, can_try_harder,
-					       gfp_mask & __GFP_HIGH))
-				continue;
-
-			if (!cpuset_zone_allowed(z, gfp_mask))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
-	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+	page = get_page_from_freelist(gfp_mask, order, zones, can_try_harder, 0);
+	if (page)
+		goto got_pg;
+	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		/*
-		 * Go through the zonelist yet one more time, keep
-		 * very high watermark here, this is only to catch
-		 * a parallel oom killing, we must fail if we're still
-		 * under heavy pressure.
+		 * Start the OOM here.
 		 */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_high,
-					       classzone_idx, 0, 0))
-				continue;
-
-			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
-
 		out_of_memory(gfp_mask, order);
 		goto restart;
 	}
@@ -968,7 +932,7 @@
 	}
 	return NULL;
 got_pg:
-	zone_statistics(zonelist, z);
+	zone_statistics(zonelist, page_zone(page));
 	return page;
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  1:43 [PATCH]: Cleanup of __alloc_pages Rohit, Seth
@ 2005-11-08  1:53 ` Andrew Morton
  2005-11-08  2:16   ` Rohit Seth
  2005-11-08  3:07 ` Paul Jackson
  2005-11-09  0:17 ` Paul Jackson
  2 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2005-11-08  1:53 UTC (permalink / raw)
  To: Rohit, Seth; +Cc: torvalds, linux-mm, linux-kernel

"Rohit, Seth" <rohit.seth@intel.com> wrote:
>
> [PATCH]: Clean up of __alloc_pages. Couple of difference from original behavior:
> 	1- remove the initial reclaim logic
> 	2- GFP_HIGH pages are allowed to go little below watermark sooner.
> 	3- Search for free pages unconditionally after direct reclaim.

Would it be possible to break these into three separate patches?  The
cleanup part should be #1.

> +/* get_page_from_freeliest loops through all the possible zones
> + * to find out if it can allocate a page.  can_try_harder can have following
> + * values:
> + * -1 => No need to check for the watermarks.
> + *  0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
> + *  1 => Go far below the low watermark.  See zone_watermark_ok (RT TASK)
> + *
> + * cpuset check is not performed when the skip_cpuset_chk flag is set.
> + */
> +
> +static struct page *
> +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zone **zones, 
> +			int can_try_harder, int skip_cpuset_chk)
> +{
> +	struct zone *z;
> +	struct page *page = NULL;
> +	int classzone_idx = zone_idx(zones[0]);
> +	int i;
> +
> +	/*
> +	 * Go through the zonelist once, looking for a zone with enough free.
> +	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
> +	 */
> +	for (i = 0; (z = zones[i]) != NULL; i++) {
> +		if (!skip_cpuset_chk && (!cpuset_zone_allowed(z, gfp_mask)))

It'd be nice to not have the `skip_cpuset_chk' flag there.  a) it gives
Linus conniptions and b) it's a little extra overhead for !CONFIG_CPUSETS
kernels.

> -	zone_statistics(zonelist, z);
> +	zone_statistics(zonelist, page_zone(page));

Evaluating page_zone() is not completely trivial.  Can we avoid the above?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  1:53 ` Andrew Morton
@ 2005-11-08  2:16   ` Rohit Seth
  2005-11-08  2:44     ` Andrew Morton
  2005-11-08  3:47     ` Nick Piggin
  0 siblings, 2 replies; 19+ messages in thread
From: Rohit Seth @ 2005-11-08  2:16 UTC (permalink / raw)
  To: Andrew Morton; +Cc: torvalds, linux-mm, linux-kernel

On Mon, 2005-11-07 at 17:53 -0800, Andrew Morton wrote:
> "Rohit, Seth" <rohit.seth@intel.com> wrote:
> >
> > [PATCH]: Clean up of __alloc_pages. Couple of difference from original behavior:
> > 	1- remove the initial reclaim logic
> > 	2- GFP_HIGH pages are allowed to go little below watermark sooner.
> > 	3- Search for free pages unconditionally after direct reclaim.
> 
> Would it be possible to break these into three separate patches?  The
> cleanup part should be #1.
> 

Doing the above three things as part of this clean up patch makes the
code look extra clean... Is there any specific issue coming out of 2 & 3
above.

> > +		if (!skip_cpuset_chk && (!cpuset_zone_allowed(z, gfp_mask)))
> 
> It'd be nice to not have the `skip_cpuset_chk' flag there.  a) it gives
> Linus conniptions and b) it's a little extra overhead for !CONFIG_CPUSETS
> kernels.
> 

I think it will be easier to do this change as a follow on patch as that
will change the header file, function definition and such.  Can we defer
this to separate follow on patch.

> > -	zone_statistics(zonelist, z);
> > +	zone_statistics(zonelist, page_zone(page));
> 
> Evaluating page_zone() is not completely trivial.  Can we avoid the above?

Okay.  Last time Nick also mentioned this but agreed to keep it here.  I
will uplevel so that I don't go through the page_zone.

-rohit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  2:16   ` Rohit Seth
@ 2005-11-08  2:44     ` Andrew Morton
  2005-11-08  3:47     ` Nick Piggin
  1 sibling, 0 replies; 19+ messages in thread
From: Andrew Morton @ 2005-11-08  2:44 UTC (permalink / raw)
  To: Rohit Seth; +Cc: torvalds, linux-mm, linux-kernel

Rohit Seth <rohit.seth@intel.com> wrote:
>
> On Mon, 2005-11-07 at 17:53 -0800, Andrew Morton wrote:
> > "Rohit, Seth" <rohit.seth@intel.com> wrote:
> > >
> > > [PATCH]: Clean up of __alloc_pages. Couple of difference from original behavior:
> > > 	1- remove the initial reclaim logic
> > > 	2- GFP_HIGH pages are allowed to go little below watermark sooner.
> > > 	3- Search for free pages unconditionally after direct reclaim.
> > 
> > Would it be possible to break these into three separate patches?  The
> > cleanup part should be #1.
> > 
> 
> Doing the above three things as part of this clean up patch makes the
> code look extra clean...

With separate patches the changes can be better understood, and they can be
selectively dropped, and people looking for regressions with `git bisect'
will be able to pinpoint the source more accurately.

> Is there any specific issue coming out of 2 & 3
> above.

I haven't looked yet - all the changes are mixed together ;)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  1:43 [PATCH]: Cleanup of __alloc_pages Rohit, Seth
  2005-11-08  1:53 ` Andrew Morton
@ 2005-11-08  3:07 ` Paul Jackson
  2005-11-08  5:31   ` Nick Piggin
  2005-11-09  0:17 ` Paul Jackson
  2 siblings, 1 reply; 19+ messages in thread
From: Paul Jackson @ 2005-11-08  3:07 UTC (permalink / raw)
  To: Rohit, Seth; +Cc: akpm, torvalds, linux-mm, linux-kernel

Seth wrote:
> +/* get_page_from_freeliest loops through all the possible zones
> + * to find out if it can allocate a page.  can_try_harder can have following
> + * values:
> + * -1 => No need to check for the watermarks.
> + *  0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
> + *  1 => Go far below the low watermark.  See zone_watermark_ok (RT TASK)

Argh.

These magic numbers, where in terms of how hard to try, 0 is less than
1 is less than -1, but where the order -does- matter for parsing such
tests as "if ((can_try_harder >= 0)" and where one has to read the
entire code to guess that, continue to give me conniptions.

I thought Nick had an alternative proposal, involving just boolean
flags.  Why didn't you ever consider that?


> + * cpuset check is not performed when the skip_cpuset_chk flag is set.
> + */
> +
> +static struct page *
> +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zone **zones, 
> +			int can_try_harder, int skip_cpuset_chk)

Well - thanks for thinking of me ;).  Though, as I suggested in my
reply last time, including a pseudo patch, I thought that the existing
flags such as can_try_harder had enough information to determine when
to do the cpuset check, without yet another flag for that.  Having now
two magic 1's and 0's at the end of the calling argument lists is even
less readable.


Seth wrote in a later message, responding to Andrew:
> I think it will be easier to do this change as a follow on patch as that
> will change the header file, function definition and such.  Can we defer
> this to separate follow on patch.

I have no clue what patch you have in mind here.  Guess I'd have to see it.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  2:16   ` Rohit Seth
  2005-11-08  2:44     ` Andrew Morton
@ 2005-11-08  3:47     ` Nick Piggin
  2005-11-08  5:44       ` Paul Jackson
  1 sibling, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2005-11-08  3:47 UTC (permalink / raw)
  To: Rohit Seth; +Cc: Andrew Morton, torvalds, linux-mm, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2075 bytes --]

Rohit Seth wrote:
> On Mon, 2005-11-07 at 17:53 -0800, Andrew Morton wrote:
> 
>>"Rohit, Seth" <rohit.seth@intel.com> wrote:
>>
>>>[PATCH]: Clean up of __alloc_pages. Couple of difference from original behavior:
>>>	1- remove the initial reclaim logic
>>>	2- GFP_HIGH pages are allowed to go little below watermark sooner.
>>>	3- Search for free pages unconditionally after direct reclaim.
>>
>>Would it be possible to break these into three separate patches?  The
>>cleanup part should be #1.
>>
> 
> 
> Doing the above three things as part of this clean up patch makes the
> code look extra clean... Is there any specific issue coming out of 2 & 3
> above.
> 

#2 as I explained I don't like changing this. What this does is basically
make __GFP_HIGH allocations sometimes randomly enter direct reclaim rather
than have a nice buffer of asynch reclaim like we do now for that and all
other types of allocations.

It only reduces calls to kswapd by about as much as it will be increasing
calls to direct reclaim.

Anyway, as promised, I've attached (sorry) a patch to correct the
problems I see in it and revert previous behaviour for #2 and #3.

I'm not immediately sure of any problems with changing #3, however OOM
behaviour is pretty fragile and I wouldn't like to change it as part
of this patch.

> 
>>>+		if (!skip_cpuset_chk && (!cpuset_zone_allowed(z, gfp_mask)))
>>
>>It'd be nice to not have the `skip_cpuset_chk' flag there.  a) it gives
>>Linus conniptions and b) it's a little extra overhead for !CONFIG_CPUSETS
>>kernels.
>>
> 

The compiler will constant fold this out if it is halfway smart.

> 
> I think it will be easier to do this change as a follow on patch as that
> will change the header file, function definition and such.  Can we defer
> this to separate follow on patch.
> 
> 
>>>-	zone_statistics(zonelist, z);
>>>+	zone_statistics(zonelist, page_zone(page));
>>
>>Evaluating page_zone() is not completely trivial.  Can we avoid the above?
> 

Done.

Let me know what you think (compile tested only, at this stage).

-- 
SUSE Labs, Novell Inc.


[-- Attachment #2: __alloc_pages-cleanup.patch --]
[-- Type: text/plain, Size: 10041 bytes --]

From: Rohit Seth <rohit.seth@intel.com>

Clean up of __alloc_pages.

Signed-off-by: Rohit Seth <rohit.seth@intel.com>

Restoration of previous behaviour, plus further cleanups
by introducing an 'alloc_flags', removing the last of
should_reclaim_zone.

Signed-off-by: Nick Piggin <npiggin@suse.de>


Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2005-11-08 14:39:56.000000000 +1100
+++ linux-2.6/mm/page_alloc.c	2005-11-08 14:42:11.000000000 +1100
@@ -707,9 +707,7 @@ buffered_rmqueue(struct zone *zone, int 
 		}
 		local_irq_restore(flags);
 		put_cpu();
-	}
-
-	if (page == NULL) {
+	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -729,20 +727,25 @@ buffered_rmqueue(struct zone *zone, int 
 	return page;
 }
 
+#define ALLOC_WATERMARKS	0x01 /* check watermarks */
+#define ALLOC_HARDER		0x02 /* try to alloc harder */
+#define ALLOC_HIGH		0x04 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x08 /* check for correct cpuset */
+
 /*
  * Return 1 if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int can_try_harder, int gfp_high)
+		      int classzone_idx, int alloc_flags)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
 	int o;
 
-	if (gfp_high)
+	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
-	if (can_try_harder)
+	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -760,14 +763,41 @@ int zone_watermark_ok(struct zone *z, in
 	return 1;
 }
 
-static inline int
-should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
-{
-	if (!z->reclaim_pages)
-		return 0;
-	if (gfp_mask & __GFP_NORECLAIM)
-		return 0;
-	return 1;
+/*
+ * get_page_from_freeliest goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, int alloc_flags)
+{
+	struct zone **z = zonelist->zones;
+	struct page *page = NULL;
+	int classzone_idx = zone_idx(*z);
+
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
+	do {
+		if ((alloc_flags & ALLOC_CPUSET) &&
+				!cpuset_zone_allowed(*z, gfp_mask))
+			continue;
+
+		if (alloc_flags & ALLOC_WATERMARKS) {
+			if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+				    classzone_idx, alloc_flags))
+				continue;
+		}
+
+		page = buffered_rmqueue(*z, order, gfp_mask);
+		if (page) {
+			zone_statistics(zonelist, *z);
+			break;
+		}
+		z++;
+	} while (*z != NULL);
+	return page;
 }
 
 /*
@@ -778,70 +808,49 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
 		struct zonelist *zonelist)
 {
 	const int wait = gfp_mask & __GFP_WAIT;
-	struct zone **zones, *z;
+	struct zone **z;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
-	int i;
-	int classzone_idx;
 	int do_retry;
-	int can_try_harder;
+	int alloc_flags;
 	int did_some_progress;
 
 	might_sleep_if(wait);
 
-	/*
-	 * The caller may dip into page reserves a bit more if the caller
-	 * cannot run direct reclaim, or is the caller has realtime scheduling
-	 * policy
-	 */
-	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
-
-	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
-	if (unlikely(zones[0] == NULL)) {
+	if (unlikely(*z == NULL)) {
 		/* Should this ever happen?? */
 		return NULL;
 	}
+restart:
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_WATERMARKS|ALLOC_CPUSET);
+	if (page)
+		goto got_pg;
 
-	classzone_idx = zone_idx(zones[0]);
+	do {
+		wakeup_kswapd(*z, order);
+		z++;
+	} while (*z != NULL);
 
-restart:
 	/*
-	 * Go through the zonelist once, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 * OK, we're below the kswapd watermark and have kicked background
+	 * reclaim. Now things get more complex, so st up alloc_flags according
+	 * to how we want to proceed.
+	 *
+	 * The caller may dip into page reserves a bit more if the caller
+	 * cannot run direct reclaim, or is the caller has realtime scheduling
+	 * policy.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
-		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-			continue;
-
-		/*
-		 * If the zone is to attempt early page reclaim then this loop
-		 * will try to reclaim pages and check the watermark a second
-		 * time before giving up and falling back to the next zone.
-		 */
-zone_reclaim_retry:
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0)) {
-			if (!do_reclaim)
-				continue;
-			else {
-				zone_reclaim(z, gfp_mask, order);
-				/* Only try reclaim once */
-				do_reclaim = 0;
-				goto zone_reclaim_retry;
-			}
-		}
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
-
-	for (i = 0; (z = zones[i]) != NULL; i++)
-		wakeup_kswapd(z, order);
+	alloc_flags = 0;
+	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+		alloc_flags |= ALLOC_HARDER;
+	if (gfp_mask & __GFP_HIGH)
+		alloc_flags |= ALLOC_HIGH;
+	if (wait)
+		alloc_flags |= ALLOC_CPUSET;
 
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -851,19 +860,9 @@ zone_reclaim_retry:
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for (i = 0; (z = zones[i]) != NULL; i++) {
-		if (!zone_watermark_ok(z, order, z->pages_min,
-				       classzone_idx, can_try_harder,
-				       gfp_mask & __GFP_HIGH))
-			continue;
-
-		if (wait && !cpuset_zone_allowed(z, gfp_mask))
-			continue;
-
-		page = buffered_rmqueue(z, order, gfp_mask);
-		if (page)
-			goto got_pg;
-	}
+	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+	if (page)
+		goto got_pg;
 
 	/* This allocation should allow future memory freeing. */
 
@@ -871,13 +870,10 @@ zone_reclaim_retry:
 			&& !in_interrupt()) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			/* go through the zonelist yet again, ignoring mins */
-			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z, gfp_mask))
-					continue;
-				page = buffered_rmqueue(z, order, gfp_mask);
-				if (page)
-					goto got_pg;
-			}
+			page = get_page_from_freelist(gfp_mask, order,
+						zonelist, ALLOC_CPUSET);
+			if (page)
+				goto got_pg;
 		}
 		goto nopage;
 	}
@@ -894,7 +890,7 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -902,19 +898,10 @@ rebalance:
 	cond_resched();
 
 	if (likely(did_some_progress)) {
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_min,
-					       classzone_idx, can_try_harder,
-					       gfp_mask & __GFP_HIGH))
-				continue;
-
-			if (!cpuset_zone_allowed(z, gfp_mask))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask, order,
+						zonelist, alloc_flags);
+		if (page)
+			goto got_pg;
 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		/*
 		 * Go through the zonelist yet one more time, keep
@@ -922,18 +909,10 @@ rebalance:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!zone_watermark_ok(z, order, z->pages_high,
-					       classzone_idx, 0, 0))
-				continue;
-
-			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
-				continue;
-
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page)
-				goto got_pg;
-		}
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+				zonelist, ALLOC_WATERMARKS|ALLOC_CPUSET);
+		if (page)
+			goto got_pg;
 
 		out_of_memory(gfp_mask, order);
 		goto restart;
@@ -966,9 +945,7 @@ nopage:
 		dump_stack();
 		show_mem();
 	}
-	return NULL;
 got_pg:
-	zone_statistics(zonelist, z);
 	return page;
 }
 
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h	2005-11-08 14:39:56.000000000 +1100
+++ linux-2.6/include/linux/mmzone.h	2005-11-08 14:41:26.000000000 +1100
@@ -302,7 +302,7 @@ void get_zone_counts(unsigned long *acti
 void build_all_zonelists(void);
 void wakeup_kswapd(struct zone *zone, int order);
 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		int alloc_type, int can_try_harder, int gfp_high);
+		int classzone_idx, int alloc_flags);
 
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2005-11-08 14:39:56.000000000 +1100
+++ linux-2.6/mm/vmscan.c	2005-11-08 14:41:26.000000000 +1100
@@ -1069,7 +1069,7 @@ loop_again:
 
 			if (nr_pages == 0) {	/* Not software suspend */
 				if (zone_watermark_ok(zone, order,
-					zone->pages_high, first_low_zone, 0, 0))
+					zone->pages_high, first_low_zone, 0))
 					continue;
 
 				all_zones_ok = 0;
@@ -1222,7 +1222,7 @@ void wakeup_kswapd(struct zone *zone, in
 		return;
 
 	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0))
+	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  3:07 ` Paul Jackson
@ 2005-11-08  5:31   ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2005-11-08  5:31 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Rohit, Seth, akpm, torvalds, linux-mm, linux-kernel

Paul Jackson wrote:

> I thought Nick had an alternative proposal, involving just boolean

Hi Paul,

I sent the cleanup patch plus some modifications to lkml in a
subthread but forgot to CC you on it.

Let me know if you have any comments or suggestions on it.

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  3:47     ` Nick Piggin
@ 2005-11-08  5:44       ` Paul Jackson
  2005-11-08  6:00         ` Nick Piggin
  0 siblings, 1 reply; 19+ messages in thread
From: Paul Jackson @ 2005-11-08  5:44 UTC (permalink / raw)
  To: Nick Piggin; +Cc: rohit.seth, akpm, torvalds, linux-mm, linux-kernel

Nick wrote:
> The compiler will constant fold this out if it is halfway smart.

How could that happen - when get_page_from_freelist() is called twice,
once with skip_cpuset_chk == 0 and once with skip_cpuset_chk == 1?


> +#define ALLOC_WATERMARKS	0x01 /* check watermarks */
> +#define ALLOC_HARDER		0x02 /* try to alloc harder */
> +#define ALLOC_HIGH		0x04 /* __GFP_HIGH set */
> +#define ALLOC_CPUSET		0x08 /* check for correct cpuset */

Names - bless you.

If these names were in a header, then calls to zone_watermark_ok()
from mm/vmscan.c could use them too?


> +	 * reclaim. Now things get more complex, so st up alloc_flags according

Typo: s/st/set/


At first glance, I think you've expressed the cpuset flags correctly.
Well, correctly maintained their current meaning.  Read on, and you
will see that I think that is not right.

I'm just reading the raw patch, so likely I missed something here.
But it seems to me that zone_watermark_ok() is called from __alloc_pages()
only if the ALLOC_WATERMARKS flag is set, and it seems that the two
alloc_flags values ALLOC_HARDER and ALLOC_HIGH are only of use if
zone_watermark() is called.  So what use is it setting ALLOC_HARDER
or ALLOC_HIGH if ALLOC_WATERMARKS is not set?  If the get_page_from_freelist()
check:
	if (alloc_flags & ALLOC_WATERMARKS)
was instead:
	if (alloc_flags & ALLOC_WATERMARKS|ALLOC_HARDER|ALLOC_HIGH)
then this would make more sense to me.  Or changing ALLOC_WATERMARKS
to ALLOC_EASY, and make it behave similarly to the HARDER & HIGH flags.
Or maybe if the initialization of alloc_flags:
> +	alloc_flags = 0;
was instead:
  +	alloc_flags = ALLOC_WATERMARKS;

As a possible future change, I wonder if there is a way to avoid having
the ALLOC_CPUSET flag separate, and instead collapse the logic (even if
it means modifying the conditions a little when cpusets are honored) to
make the cpuset_zone_allowed() check based on some combination of these
other WATERMARKS/HARDER/HIGH values.  For example, it might make sense
to check cpuset_zone_allowed() under the same conditions that it made
sense to call zone_watermark_ok() from get_page_from_freelist(), or
perhaps to check cpusets unless we are HIGH or not checking
watermarks.  To the best of my knowledge, subtle variations between
when we check some level of watermarks and when we check cpusets are
not worth it - not worth the extra conditional jump in the machine
code, and not worth the extra bit of logic and flags in the source code.

The cpuset check in the 'ignoring mins' code shortly after this for the
PF_MEMALLOC or TIF_MEMDIE cases seems bogus.  This is the case where we
should be most willing to use memory, regardless of where we find it.
That cpuset check should be removed.

My current inclination - check cpusets in the WATERMARKS or HARDER
or (HIGH && wait) cases, but ignore cpusets in the (HIGH && !wait) or
'ignoring mins' cases.  Can "HIGH && wait" even happen ??  Are
allocations either GFP_ATOMIC (aka GFP_HIGH) or (exclusive or)
GFP_WAIT, never both?  Perhaps GFP_HIGH should be permanently
deleted (another cleanup) in favor of the more popular and expressive
GFP_ATOMIC, and __GFP_WAIT retired, in favor of !GFP_ATOMIC.

However, I appreciate your preference to separate cleanup from semantic
change.  Perhaps this means leaving the ALLOC_CPUSET flag in your
cleanup patch, then one of us following on top of that with a patch to
simplify and fix the cpuset invocation semantics and a second cleanup
patch to remove ALLOC_CPUSET as a separate flag.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  5:44       ` Paul Jackson
@ 2005-11-08  6:00         ` Nick Piggin
  2005-11-08  6:22           ` Paul Jackson
  2005-11-08 18:17           ` Rohit Seth
  0 siblings, 2 replies; 19+ messages in thread
From: Nick Piggin @ 2005-11-08  6:00 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rohit.seth, akpm, torvalds, linux-mm, linux-kernel

Paul Jackson wrote:
> Nick wrote:
> 
>>The compiler will constant fold this out if it is halfway smart.
> 
> 
> How could that happen - when get_page_from_freelist() is called twice,
> once with skip_cpuset_chk == 0 and once with skip_cpuset_chk == 1?
> 

Because it is on the other side of an &&, which evaulates to a
constant zero when !CONFIG_CPUSETS.

> 
> 
>>+#define ALLOC_WATERMARKS	0x01 /* check watermarks */
>>+#define ALLOC_HARDER		0x02 /* try to alloc harder */
>>+#define ALLOC_HIGH		0x04 /* __GFP_HIGH set */
>>+#define ALLOC_CPUSET		0x08 /* check for correct cpuset */
> 
> 
> Names - bless you.
> 
> If these names were in a header, then calls to zone_watermark_ok()
> from mm/vmscan.c could use them too?
> 
> 
> 
>>+	 * reclaim. Now things get more complex, so st up alloc_flags according
> 
> 
> Typo: s/st/set/
> 
> 
> At first glance, I think you've expressed the cpuset flags correctly.
> Well, correctly maintained their current meaning.  Read on, and you
> will see that I think that is not right.
> 
> I'm just reading the raw patch, so likely I missed something here.
> But it seems to me that zone_watermark_ok() is called from __alloc_pages()
> only if the ALLOC_WATERMARKS flag is set, and it seems that the two
> alloc_flags values ALLOC_HARDER and ALLOC_HIGH are only of use if
> zone_watermark() is called.  So what use is it setting ALLOC_HARDER
> or ALLOC_HIGH if ALLOC_WATERMARKS is not set?  If the get_page_from_freelist()
> check:
> 	if (alloc_flags & ALLOC_WATERMARKS)
> was instead:
> 	if (alloc_flags & ALLOC_WATERMARKS|ALLOC_HARDER|ALLOC_HIGH)
> then this would make more sense to me.  Or changing ALLOC_WATERMARKS
> to ALLOC_EASY, and make it behave similarly to the HARDER & HIGH flags.
> Or maybe if the initialization of alloc_flags:
> 
>>+	alloc_flags = 0;
> 
> was instead:
>   +	alloc_flags = ALLOC_WATERMARKS;
> 

Yep that's a bug. Thanks. Maybe instead we should have a specific
flag for ALLOC_NO_WATERMARKS because that is the unusual case. The
use of the flag there would be a good annotation too.



> The cpuset check in the 'ignoring mins' code shortly after this for the
> PF_MEMALLOC or TIF_MEMDIE cases seems bogus.  This is the case where we
> should be most willing to use memory, regardless of where we find it.
> That cpuset check should be removed.
> 

OK that would be fine, but let's do that (and your suggested possible
consolidation of ALLOC_CPUSET) in another patch?

> My current inclination - check cpusets in the WATERMARKS or HARDER
> or (HIGH && wait) cases, but ignore cpusets in the (HIGH && !wait) or
> 'ignoring mins' cases.  Can "HIGH && wait" even happen ??  Are

Yes there is nothing preventing it.

> allocations either GFP_ATOMIC (aka GFP_HIGH) or (exclusive or)
> GFP_WAIT, never both?  Perhaps GFP_HIGH should be permanently
> deleted (another cleanup) in favor of the more popular and expressive
> GFP_ATOMIC, and __GFP_WAIT retired, in favor of !GFP_ATOMIC.
> 

Having __GFP_HIGH as its own flag gives some more flexibility. I
don't think it has a downside?

> However, I appreciate your preference to separate cleanup from semantic
> change.  Perhaps this means leaving the ALLOC_CPUSET flag in your
> cleanup patch, then one of us following on top of that with a patch to
> simplify and fix the cpuset invocation semantics and a second cleanup
> patch to remove ALLOC_CPUSET as a separate flag.
> 

That would be good. I'll send off a fresh patch with the
ALLOC_WATERMARKS fixed after Rohit gets around to looking over
it.

-- 
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  6:00         ` Nick Piggin
@ 2005-11-08  6:22           ` Paul Jackson
  2005-11-08 18:17           ` Rohit Seth
  1 sibling, 0 replies; 19+ messages in thread
From: Paul Jackson @ 2005-11-08  6:22 UTC (permalink / raw)
  To: Nick Piggin; +Cc: rohit.seth, akpm, torvalds, linux-mm, linux-kernel

Nick wrote:
> Because it is on the other side of an &&, which evaulates to a
> constant zero when !CONFIG_CPUSETS.

Ah so.

> Having __GFP_HIGH as its own flag gives some more flexibility. I
> don't think it has a downside?

With respect to GFP_ATOMIC, __GFP_HIGH has no flexibility, as they are
#defined to be the same thing.

With respect to __GFP_WAIT, if we only ever use it exactly when
we don't use __GFP_HIGH aka GFP_ATOMIC, then there is a definite
downside.  My old brain doesn't fold constants nearly as reliably or
rapidly as a compiler.  Every apparent degree of freedom that is unused
wastes a few of my remaining precious neurons understanding it.
It directly leads to such bugs as the one I noted in my last reply,
when I realized that checking cpusets in the 'ignoring mins' case
was bogus.

__GFP_HIGH has a second cost - it is easily confused with __GFP_HIGHMEM.

> That would be good. I'll send off a fresh patch with the
> ALLOC_WATERMARKS fixed after Rohit gets around to looking over
> it.

Good.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  6:00         ` Nick Piggin
  2005-11-08  6:22           ` Paul Jackson
@ 2005-11-08 18:17           ` Rohit Seth
  2005-11-08 19:54             ` Paul Jackson
  2005-11-09  2:52             ` Nick Piggin
  1 sibling, 2 replies; 19+ messages in thread
From: Rohit Seth @ 2005-11-08 18:17 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Paul Jackson, akpm, torvalds, linux-mm, linux-kernel

On Tue, 2005-11-08 at 17:00 +1100, Nick Piggin wrote:

> 
> > However, I appreciate your preference to separate cleanup from semantic
> > change.  Perhaps this means leaving the ALLOC_CPUSET flag in your
> > cleanup patch, then one of us following on top of that with a patch to
> > simplify and fix the cpuset invocation semantics and a second cleanup
> > patch to remove ALLOC_CPUSET as a separate flag.
> > 
> 
> That would be good. I'll send off a fresh patch with the
> ALLOC_WATERMARKS fixed after Rohit gets around to looking over
> it.
> 

Nick, your changes have really come out good.  Thanks.  I think it is
definitely a good starting point as it maintains all of existing
behavior.

I guess now I can argue about why we should keep the watermark low for
GFP_HIGH ;-)

Paul, sorry for troubling you with those magic numbers again in the
original patch...

-rohit

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08 18:17           ` Rohit Seth
@ 2005-11-08 19:54             ` Paul Jackson
  2005-11-09  2:52             ` Nick Piggin
  1 sibling, 0 replies; 19+ messages in thread
From: Paul Jackson @ 2005-11-08 19:54 UTC (permalink / raw)
  To: Rohit Seth; +Cc: nickpiggin, akpm, torvalds, linux-mm, linux-kernel

Rohit wrote:
> Paul, sorry for troubling you with those magic numbers again in the
> original patch...

That's no problem.  I enjoyed the opportunity to protest it again ;).

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08  1:43 [PATCH]: Cleanup of __alloc_pages Rohit, Seth
  2005-11-08  1:53 ` Andrew Morton
  2005-11-08  3:07 ` Paul Jackson
@ 2005-11-09  0:17 ` Paul Jackson
  2 siblings, 0 replies; 19+ messages in thread
From: Paul Jackson @ 2005-11-09  0:17 UTC (permalink / raw)
  To: Rohit, Seth; +Cc: akpm, torvalds, linux-mm, linux-kernel

If you're going to remove the early reclaim logic, then
lets also nuke the related apparatus: should_reclaim_zone()
and __GFP_NORECLAIM (which is used in a couple of pagemap.h
macros as well)?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-08 18:17           ` Rohit Seth
  2005-11-08 19:54             ` Paul Jackson
@ 2005-11-09  2:52             ` Nick Piggin
  2005-11-13  5:09               ` Paul Jackson
  1 sibling, 1 reply; 19+ messages in thread
From: Nick Piggin @ 2005-11-09  2:52 UTC (permalink / raw)
  To: Rohit Seth; +Cc: Paul Jackson, akpm, torvalds, linux-mm, linux-kernel

Rohit Seth wrote:

>On Tue, 2005-11-08 at 17:00 +1100, Nick Piggin wrote:
>
>
>>That would be good. I'll send off a fresh patch with the
>>ALLOC_WATERMARKS fixed after Rohit gets around to looking over
>>it.
>>
>>
>
>Nick, your changes have really come out good.  Thanks.  I think it is
>definitely a good starting point as it maintains all of existing
>behavior.
>
>

Great, glad you agree. I'll send the revised copy upstream.

>I guess now I can argue about why we should keep the watermark low for
>GFP_HIGH ;-)
>
>

Yep, I would be happy to discuss this with you and linux-mm :)


Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-09  2:52             ` Nick Piggin
@ 2005-11-13  5:09               ` Paul Jackson
  2005-11-13  5:14                 ` Paul Jackson
  0 siblings, 1 reply; 19+ messages in thread
From: Paul Jackson @ 2005-11-13  5:09 UTC (permalink / raw)
  To: Nick Piggin; +Cc: rohit.seth, akpm, torvalds, linux-mm, linux-kernel

The __GFP_HIGH, GFP_ATOMIC, __GFP_WAIT flags are still driving me bonkers.

It seems to me that:
 1) __GFP_WAIT is supposed to mean can wait, and __alloc_pages()
    keys off that bit to set its "wait" variable.  Good so far.
 2) __GFP_HIGH is supposed to mean can access emergency pools
    (use lower watermarks), and __alloc_pages() does that.  Also
    good so far.
 3) But gfp.h defines GFP_ATOMIC to be an alias for __GFP_HIGH,
    and many callers through out the kernel use GFP_ATOMIC to mean
    "can't sleep" or "can't wait" or some such.  These folks are
    not getting the service they expect - they are asking for the
    most aggressive form of allocation (short perhaps of the
    special case for allocations that will net free more memory
    than they require, such as exiting), and they get the half way
    improvement instead, with the possibility of sleeping (!).

The confusion even extends to the comments in __alloc_pages(),
such as in the lines:

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

The "!wait" condition is --not-- GFP_ATOMIC, which is what
one might think was meant by "Atomic allocations", and likely
what the many users of GFP_ATOMIC were expecting - a nopage
response in such cases.

Perhaps GFP_ATOMIC should be its own __GFP_ATOMIC bit, with a BUG_ON
if both __GFP_ATOMIC and __GFP_WAIT are set at the same time,
leaving __GFP_HIGH for the few uses where people were just asking
to go a bit lower in the reserves.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-13  5:09               ` Paul Jackson
@ 2005-11-13  5:14                 ` Paul Jackson
  2005-11-13  7:00                   ` Nathan Scott
  2005-11-13  7:12                   ` Andrew Morton
  0 siblings, 2 replies; 19+ messages in thread
From: Paul Jackson @ 2005-11-13  5:14 UTC (permalink / raw)
  To: Paul Jackson
  Cc: nickpiggin, rohit.seth, akpm, torvalds, linux-mm, linux-kernel

An even stranger line:

fs/xfs/linux-2.6/xfs_buf.c has:
    aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);

Given the gfp.h line:
    #define GFP_ATOMIC  (__GFP_VALID | __GFP_HIGH)

that xfs_buf line makes no sense.  There is almost no chance
that the author of that xfs_buf.c line was aware they were
spelling the empty gfp flag __GFP_VALID.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-13  5:14                 ` Paul Jackson
@ 2005-11-13  7:00                   ` Nathan Scott
  2005-11-13  7:12                   ` Andrew Morton
  1 sibling, 0 replies; 19+ messages in thread
From: Nathan Scott @ 2005-11-13  7:00 UTC (permalink / raw)
  To: Paul Jackson
  Cc: nickpiggin, rohit.seth, akpm, torvalds, linux-mm, linux-kernel

On Sat, Nov 12, 2005 at 09:14:29PM -0800, Paul Jackson wrote:
> An even stranger line:
> 
> fs/xfs/linux-2.6/xfs_buf.c has:
>     aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
> 
> Given the gfp.h line:
>     #define GFP_ATOMIC  (__GFP_VALID | __GFP_HIGH)
> 
> that xfs_buf line makes no sense.  There is almost no chance
> that the author of that xfs_buf.c line was aware they were
> spelling the empty gfp flag __GFP_VALID.

Actually, there is a very good chance of that - it was
Andrew's recommendation a few months back... ;)

cheers.

-- 
Nathan

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-13  5:14                 ` Paul Jackson
  2005-11-13  7:00                   ` Nathan Scott
@ 2005-11-13  7:12                   ` Andrew Morton
  2005-11-13  7:47                     ` Paul Jackson
  1 sibling, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2005-11-13  7:12 UTC (permalink / raw)
  To: Paul Jackson; +Cc: nickpiggin, rohit.seth, torvalds, linux-mm, linux-kernel

Paul Jackson <pj@sgi.com> wrote:
>
>  fs/xfs/linux-2.6/xfs_buf.c has:
>      aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);

That's a reasonable thing to do, actually.

GFP_ATOMIC means "don't sleep" (!__GFP_WAIT) and "use emergency pools"
(__GFP_HIGH).

XFS is saying "don't sleep" and "don't use the emergency pools".

Yes, the fact that GFP_ATOMIC also implies "use the emergency pool" is
unfortunate, and perhaps the two should always have been separated out, at
least to make the programmer think about whether the code really needs
access to the emergency pools.   Usually it does.

But I haven't seen much sign that it's causing any problems.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH]: Cleanup of __alloc_pages
  2005-11-13  7:12                   ` Andrew Morton
@ 2005-11-13  7:47                     ` Paul Jackson
  0 siblings, 0 replies; 19+ messages in thread
From: Paul Jackson @ 2005-11-13  7:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nickpiggin, rohit.seth, torvalds, linux-mm, linux-kernel

> Yes, the fact that GFP_ATOMIC also implies "use the emergency pool" is
> unfortunate, and perhaps the two should always have been separated out, at
> least to make the programmer think about whether the code really needs
> access to the emergency pools.   Usually it does.

Ah - now it makes more sense.

The key invisible fact in the gfp.h line:

  #define GFP_ATOMIC      (__GFP_VALID | __GFP_HIGH)

is that __GFP_WAIT is *not* set (making it mean don't sleep).
All the other commonly used GFP_* flags do have __GFP_WAIT.

I have no issue with ATOMIC also meaning "use emergency pool".
That's an appropriate simplication, that fits the usage well.

I just had a mental block on the invisible unset __GFP_WAIT bit.

Would you look kindly on a patch that did:


--- 2.6.14-mm2.orig/include/linux/gfp.h	2005-11-12 23:36:57.258103418 -0800
+++ 2.6.14-mm2/include/linux/gfp.h	2005-11-12 23:42:35.287219455 -0800
@@ -58,6 +58,7 @@ struct vm_area_struct;
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
 			__GFP_NOMEMALLOC|__GFP_HARDWALL)
 
+/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
 #define GFP_ATOMIC	(__GFP_VALID | __GFP_HIGH)
 #define GFP_NOIO	(__GFP_VALID | __GFP_WAIT)
 #define GFP_NOFS	(__GFP_VALID | __GFP_WAIT | __GFP_IO)


-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2005-11-13  7:47 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-11-08  1:43 [PATCH]: Cleanup of __alloc_pages Rohit, Seth
2005-11-08  1:53 ` Andrew Morton
2005-11-08  2:16   ` Rohit Seth
2005-11-08  2:44     ` Andrew Morton
2005-11-08  3:47     ` Nick Piggin
2005-11-08  5:44       ` Paul Jackson
2005-11-08  6:00         ` Nick Piggin
2005-11-08  6:22           ` Paul Jackson
2005-11-08 18:17           ` Rohit Seth
2005-11-08 19:54             ` Paul Jackson
2005-11-09  2:52             ` Nick Piggin
2005-11-13  5:09               ` Paul Jackson
2005-11-13  5:14                 ` Paul Jackson
2005-11-13  7:00                   ` Nathan Scott
2005-11-13  7:12                   ` Andrew Morton
2005-11-13  7:47                     ` Paul Jackson
2005-11-08  3:07 ` Paul Jackson
2005-11-08  5:31   ` Nick Piggin
2005-11-09  0:17 ` Paul Jackson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox