[patch 1/2] mm: detect bad zones

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [patch 1/2] mm: detect bad zones
@ 2006-05-21  8:22 Nick Piggin
  2006-05-21  8:22 ` [patch 2/2] mm: handle unaligned zones Nick Piggin
  0 siblings, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2006-05-21  8:22 UTC (permalink / raw)
  To: Andrew Morton, Andy Whitcroft, Mel Gorman, stable,
	Linux Memory Management

[-- Attachment #1: Type: text/plain, Size: 412 bytes --]

Hi,

I think the previous few patches / patchsets to handle the unaligned zone
thing aren't exactly what we want (at least, for 2.6.16.stable and 2.6.17).

Firstly, we need to check for buddies outside the zone span, not just those
which are in a different zone.

Secondly, I think aligned zones should be an opt-in thing. Performance hit
is not huge, but potential stability hit is.

-- 
SUSE Labs, Novell Inc.

[-- Attachment #2: mm-detect-bad-zones.patch --]
[-- Type: text/plain, Size: 1982 bytes --]

panic when zones fail correct alignment and other checks.
The alternative could be random and/or undetected corruption later.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2006-05-19 13:15:51.000000000 +1000
+++ linux-2.6/mm/page_alloc.c	2006-05-21 12:22:44.000000000 +1000
@@ -2041,6 +2041,47 @@ static __meminit void zone_pcp_init(stru
 			zone->name, zone->present_pages, batch);
 }
 
+static __meminit void zone_debug_checks(struct zone *zone)
+{
+	unsigned long pfn;
+	unsigned long start = zone->zone_start_pfn;
+	unsigned long end = start + zone->spanned_pages;
+	const unsigned long mask = ((1<<MAX_ORDER)-1);
+	
+	if (start & mask)
+		panic("zone start pfn (%lx) not MAX_ORDER aligned\n", start);
+
+	if (end & mask)
+		panic("zone end pfn (%lx) not MAX_ORDER aligned\n", end);
+
+	for (pfn = start; pfn < end; pfn++) {
+		struct page *page;
+		int order;
+
+#ifndef CONFIG_HOLES_IN_ZONE
+		if (!pfn_valid(pfn))
+			panic("zone pfn (%lx) not valid\n", pfn);
+#endif
+
+		page = pfn_to_page(pfn);
+		if (page_zone(page) != zone)
+			panic("zone page (pfn %lx) in wrong zone\n", pfn);
+
+		for (order = 0; order < MAX_ORDER-1; order++) {
+			struct page *buddy;
+			buddy = __page_find_buddy(page, pfn & mask, order);
+
+#ifndef CONFIG_HOLES_IN_ZONE
+			if (!pfn_valid(page_to_pfn(buddy)))
+				panic("pfn (%lx) buddy (order %d) not valid\n", pfn, order);
+#endif
+
+			if (page_zone(buddy) != zone)
+				panic("pfn (%lx) buddy (order %d) in wrong zone\n", pfn, order);
+		}
+	}
+}
+
 static __meminit void init_currently_empty_zone(struct zone *zone,
 		unsigned long zone_start_pfn, unsigned long size)
 {
@@ -2054,6 +2095,8 @@ static __meminit void init_currently_emp
 	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
 
 	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+	zone_debug_checks(zone);
 }
 
 /*

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 2/2] mm: handle unaligned zones
  2006-05-21  8:22 [patch 1/2] mm: detect bad zones Nick Piggin
@ 2006-05-21  8:22 ` Nick Piggin
  2006-05-21  9:19   ` Andrew Morton
  2006-05-22  8:18   ` Andy Whitcroft
  0 siblings, 2 replies; 23+ messages in thread
From: Nick Piggin @ 2006-05-21  8:22 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Andy Whitcroft, Mel Gorman, stable,
	Linux Memory Management

[-- Attachment #1: Type: text/plain, Size: 32 bytes --]

2/2

-- 
SUSE Labs, Novell Inc.

[-- Attachment #2: mm-unaligned-zones.patch --]
[-- Type: text/plain, Size: 7829 bytes --]

Allow unaligned zones, and make this an opt-in CONFIG_ option because
some architectures appear to be relying on unaligned zones being handled
correctly.

- Also, the bad_range checks are removed, they are checked at meminit time
  since the last patch.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2006-05-21 17:53:36.000000000 +1000
+++ linux-2.6/mm/page_alloc.c	2006-05-21 18:20:13.000000000 +1000
@@ -85,55 +85,6 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
-#ifdef CONFIG_DEBUG_VM
-static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
-{
-	int ret = 0;
-	unsigned seq;
-	unsigned long pfn = page_to_pfn(page);
-
-	do {
-		seq = zone_span_seqbegin(zone);
-		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
-			ret = 1;
-		else if (pfn < zone->zone_start_pfn)
-			ret = 1;
-	} while (zone_span_seqretry(zone, seq));
-
-	return ret;
-}
-
-static int page_is_consistent(struct zone *zone, struct page *page)
-{
-#ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
-		return 0;
-#endif
-	if (zone != page_zone(page))
-		return 0;
-
-	return 1;
-}
-/*
- * Temporary debugging check for pages not lying within a given zone.
- */
-static int bad_range(struct zone *zone, struct page *page)
-{
-	if (page_outside_zone_boundaries(zone, page))
-		return 1;
-	if (!page_is_consistent(zone, page))
-		return 1;
-
-	return 0;
-}
-
-#else
-static inline int bad_range(struct zone *zone, struct page *page)
-{
-	return 0;
-}
-#endif
-
 static void bad_page(struct page *page)
 {
 	printk(KERN_EMERG "Bad page state in process '%s'\n"
@@ -281,9 +232,86 @@ __find_combined_index(unsigned long page
 }
 
 /*
- * This function checks whether a page is free && is the buddy
- * we can do coalesce a page and its buddy if
- * (a) the buddy is not in a hole &&
+ * If the mem_map may have holes (invalid pfns) in it, which are not on
+ * MAX_ORDER<<1 aligned boundaries, CONFIG_HOLES_IN_ZONE must be set by the
+ * architecture, because the buddy allocator will otherwise attempt to access
+ * their underlying struct page when finding a buddy to merge.
+ */
+static inline int page_in_zone_hole(struct page *page)
+{
+#ifdef CONFIG_HOLES_IN_ZONE
+	/*
+	 *
+	 */
+	if (!pfn_valid(page_to_pfn(page)))
+		return 1;
+#endif
+	return 0;
+}
+
+/*
+ * If the the zone's mem_map is not 1<<MAX_ORDER aligned, CONFIG_ALIGNED_ZONE
+ * must *not* be set by the architecture, because the buddy allocator will run
+ * into "buddies" which are outside mem_map.
+ *
+ * It is not enough for the node's mem_map to be aligned, because unaligned
+ * zone boundaries can cause a buddies to be in different zones.
+ */
+static inline int buddy_outside_zone_span(struct page *page, struct page *buddy)
+{
+	int ret = 0;
+
+#ifndef CONFIG_ALIGNED_ZONE
+	unsigned int seq;
+	unsigned long pfn;
+	struct zone *zone;
+
+	pfn = page_to_pfn(page);
+	zone = page_zone(page);
+
+	do {
+
+		seq = zone_span_seqbegin(zone);
+		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
+			ret = 1;
+		else if (pfn < zone->zone_start_pfn)
+			ret = 1;
+	} while (zone_span_seqretry(zone, seq));
+	if (ret)
+		goto out;
+
+	/*
+	 * page_zone_idx accesses page->flags, so this test must go after
+	 * the above, which ensures that buddy is within the zone.
+	 */
+	if (page_zone_idx(page) != page_zone_idx(buddy))
+		ret = 1;
+
+out:
+#endif
+
+	return ret;
+}
+
+/*
+ * In some memory configurations, buddy pages may be found which are
+ * outside the zone pages. Check for those here.
+ */
+static int buddy_outside_zone(struct page *page, struct page *buddy)
+{
+	if (page_in_zone_hole(buddy))
+		return 1;
+
+	if (buddy_outside_zone_span(page, buddy))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * This function checks whether a buddy is free and is the buddy of page.
+ * We can coalesce a page and its buddy if
+ * (a) the buddy is not "outside" the zone &&
  * (b) the buddy is in the buddy system &&
  * (c) a page and its buddy have the same order.
  *
@@ -292,15 +320,13 @@ __find_combined_index(unsigned long page
  *
  * For recording page's order, we use page_private(page).
  */
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, struct page *buddy, int order)
 {
-#ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
+	if (buddy_outside_zone(page, buddy))
 		return 0;
-#endif
 
-	if (PageBuddy(page) && page_order(page) == order) {
-		BUG_ON(page_count(page) != 0);
+	if (PageBuddy(buddy) && page_order(buddy) == order) {
+		BUG_ON(page_count(buddy) != 0);
 		return 1;
 	}
 	return 0;
@@ -342,7 +368,6 @@ static inline void __free_one_page(struc
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
 	BUG_ON(page_idx & (order_size - 1));
-	BUG_ON(bad_range(zone, page));
 
 	zone->free_pages += order_size;
 	while (order < MAX_ORDER-1) {
@@ -351,7 +376,7 @@ static inline void __free_one_page(struc
 		struct page *buddy;
 
 		buddy = __page_find_buddy(page, page_idx, order);
-		if (!page_is_buddy(buddy, order))
+		if (!page_is_buddy(page, buddy, order))
 			break;		/* Move the buddy up one level. */
 
 		list_del(&buddy->lru);
@@ -506,7 +531,6 @@ static inline void expand(struct zone *z
 		area--;
 		high--;
 		size >>= 1;
-		BUG_ON(bad_range(zone, &page[size]));
 		list_add(&page[size].lru, &area->free_list);
 		area->nr_free++;
 		set_page_order(&page[size], high);
@@ -824,7 +848,6 @@ again:
 	local_irq_restore(flags);
 	put_cpu();
 
-	BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
 		goto again;
 	return page;
@@ -2048,11 +2071,13 @@ static __meminit void zone_debug_checks(
 	unsigned long end = start + zone->spanned_pages;
 	const unsigned long mask = ((1<<MAX_ORDER)-1);
 	
+#ifdef CONFIG_ALIGNED_ZONE
 	if (start & mask)
 		panic("zone start pfn (%lx) not MAX_ORDER aligned\n", start);
 
 	if (end & mask)
 		panic("zone end pfn (%lx) not MAX_ORDER aligned\n", end);
+#endif
 
 	for (pfn = start; pfn < end; pfn++) {
 		struct page *page;
@@ -2068,16 +2093,23 @@ static __meminit void zone_debug_checks(
 			panic("zone page (pfn %lx) in wrong zone\n", pfn);
 
 		for (order = 0; order < MAX_ORDER-1; order++) {
+			unsigned long buddy_pfn;
 			struct page *buddy;
 			buddy = __page_find_buddy(page, pfn & mask, order);
+			buddy_pfn = page_to_pfn(buddy);
 
 #ifndef CONFIG_HOLES_IN_ZONE
-			if (!pfn_valid(page_to_pfn(buddy)))
+			if (!pfn_valid(buddy_pfn))
 				panic("pfn (%lx) buddy (order %d) not valid\n", pfn, order);
 #endif
 
-			if (page_zone(buddy) != zone)
-				panic("pfn (%lx) buddy (order %d) in wrong zone\n", pfn, order);
+#ifdef CONFIG_ALIGNED_ZONE
+			if (buddy_pfn < start || buddy_pfn >= end)
+				panic("pfn (%lx) buddy (%lx) (order %d) outside zone\n", pfn, buddy_pfn, order);
+
+			if (zone != page_zone(buddy))
+				panic("pfn (%lx) buddy (%lx) (order %d) in different zone\n", pfn, buddy_pfn, order);
+#endif
 		}
 	}
 }
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-19 12:48:01.000000000 +1000
+++ linux-2.6/include/linux/mm.h	2006-05-21 18:10:23.000000000 +1000
@@ -466,10 +466,14 @@ static inline unsigned long page_zonenum
 struct zone;
 extern struct zone *zone_table[];
 
+static inline unsigned long page_zone_idx(struct page *page)
+{
+	return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
+}
+
 static inline struct zone *page_zone(struct page *page)
 {
-	return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
-			ZONETABLE_MASK];
+	return zone_table[page_zone_idx(page)];
 }
 
 static inline unsigned long page_to_nid(struct page *page)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21  8:22 ` [patch 2/2] mm: handle unaligned zones Nick Piggin
@ 2006-05-21  9:19   ` Andrew Morton
  2006-05-21 10:31     ` Nick Piggin
  2006-05-22  8:18   ` Andy Whitcroft
  1 sibling, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2006-05-21  9:19 UTC (permalink / raw)
  To: Nick Piggin; +Cc: apw, mel, stable, linux-mm

Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> Allow unaligned zones, and make this an opt-in CONFIG_ option because
> some architectures appear to be relying on unaligned zones being handled
> correctly.
> 
> - Also, the bad_range checks are removed, they are checked at meminit time
>   since the last patch.
> 
> ...
> 
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c	2006-05-21 17:53:36.000000000 +1000
> +++ linux-2.6/mm/page_alloc.c	2006-05-21 18:20:13.000000000 +1000
>
> ...
>
> +{
> +#ifdef CONFIG_HOLES_IN_ZONE

(Why is this a config option?  If we can optionally handle it, why not
always just handle it?

> +/*
> + * If the the zone's mem_map is not 1<<MAX_ORDER aligned, CONFIG_ALIGNED_ZONE
> + * must *not* be set by the architecture, because the buddy allocator will run
> + * into "buddies" which are outside mem_map.
> + *
> + * It is not enough for the node's mem_map to be aligned, because unaligned
> + * zone boundaries can cause a buddies to be in different zones.
> + */
> +static inline int buddy_outside_zone_span(struct page *page, struct page *buddy)
> +{
> +	int ret = 0;
> +
> +#ifndef CONFIG_ALIGNED_ZONE
> +	unsigned int seq;
> +	unsigned long pfn;
> +	struct zone *zone;
> +
> +	pfn = page_to_pfn(page);
> +	zone = page_zone(page);
> +
> +	do {

You'll want a `ret = 0' here.

> +		seq = zone_span_seqbegin(zone);
> +		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
> +			ret = 1;
> +		else if (pfn < zone->zone_start_pfn)
> +			ret = 1;
> +	} while (zone_span_seqretry(zone, seq));
> +	if (ret)
> +		goto out;
> +
> +	/*
> +	 * page_zone_idx accesses page->flags, so this test must go after
> +	 * the above, which ensures that buddy is within the zone.
> +	 */
> +	if (page_zone_idx(page) != page_zone_idx(buddy))
> +		ret = 1;
> +
> +out:
> +#endif
> +
> +	return ret;
> +}
> +
> +/*
> + * In some memory configurations, buddy pages may be found which are
> + * outside the zone pages. Check for those here.
> + */
> +static int buddy_outside_zone(struct page *page, struct page *buddy)
> +{
> +	if (page_in_zone_hole(buddy))
> +		return 1;
> +
> +	if (buddy_outside_zone_span(page, buddy))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +/*
> + * This function checks whether a buddy is free and is the buddy of page.
> + * We can coalesce a page and its buddy if
> + * (a) the buddy is not "outside" the zone &&
>   * (b) the buddy is in the buddy system &&
>   * (c) a page and its buddy have the same order.
>   *
> @@ -292,15 +320,13 @@ __find_combined_index(unsigned long page
>   *
>   * For recording page's order, we use page_private(page).
>   */
> -static inline int page_is_buddy(struct page *page, int order)
> +static inline int page_is_buddy(struct page *page, struct page *buddy, int order)
>  {
> -#ifdef CONFIG_HOLES_IN_ZONE
> -	if (!pfn_valid(page_to_pfn(page)))
> +	if (buddy_outside_zone(page, buddy))
>  		return 0;

This is a heck of a lot of code to be throwing into the page-freeing
hotpath.  Surely there's a way of moving all this work to
initialisation/hotadd time?


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21  9:19   ` Andrew Morton
@ 2006-05-21 10:31     ` Nick Piggin
  2006-05-21 10:59       ` Andrew Morton
  2006-05-21 11:53       ` Nick Piggin
  0 siblings, 2 replies; 23+ messages in thread
From: Nick Piggin @ 2006-05-21 10:31 UTC (permalink / raw)
  To: Andrew Morton; +Cc: apw, mel, stable, linux-mm

Andrew Morton wrote:
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
>>Allow unaligned zones, and make this an opt-in CONFIG_ option because
>>some architectures appear to be relying on unaligned zones being handled
>>correctly.
>>
>>- Also, the bad_range checks are removed, they are checked at meminit time
>>  since the last patch.
>>
>>...
>>
>>Index: linux-2.6/mm/page_alloc.c
>>===================================================================
>>--- linux-2.6.orig/mm/page_alloc.c	2006-05-21 17:53:36.000000000 +1000
>>+++ linux-2.6/mm/page_alloc.c	2006-05-21 18:20:13.000000000 +1000
>>
>>...
>>
>>+{
>>+#ifdef CONFIG_HOLES_IN_ZONE
> 
> 
> (Why is this a config option?  If we can optionally handle it, why not
> always just handle it?

Holes in zone? or unaligned zones? Holes in zone, I guess because
it is seen as somewhat of a special case, and can be removed if ia64
moves to sparsemem.

> 
> 
>>+/*
>>+ * If the the zone's mem_map is not 1<<MAX_ORDER aligned, CONFIG_ALIGNED_ZONE
>>+ * must *not* be set by the architecture, because the buddy allocator will run
>>+ * into "buddies" which are outside mem_map.
>>+ *
>>+ * It is not enough for the node's mem_map to be aligned, because unaligned
>>+ * zone boundaries can cause a buddies to be in different zones.
>>+ */
>>+static inline int buddy_outside_zone_span(struct page *page, struct page *buddy)
>>+{
>>+	int ret = 0;
>>+
>>+#ifndef CONFIG_ALIGNED_ZONE
>>+	unsigned int seq;
>>+	unsigned long pfn;
>>+	struct zone *zone;
>>+
>>+	pfn = page_to_pfn(page);
>>+	zone = page_zone(page);
>>+
>>+	do {
> 
> 
> You'll want a `ret = 0' here.

Thanks.

> 
> 
>>+		seq = zone_span_seqbegin(zone);
>>+		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
>>+			ret = 1;
>>+		else if (pfn < zone->zone_start_pfn)
>>+			ret = 1;
>>+	} while (zone_span_seqretry(zone, seq));
>>+	if (ret)
>>+		goto out;
>>+
>>+	/*
>>+	 * page_zone_idx accesses page->flags, so this test must go after
>>+	 * the above, which ensures that buddy is within the zone.
>>+	 */
>>+	if (page_zone_idx(page) != page_zone_idx(buddy))
>>+		ret = 1;
>>+
>>+out:
>>+#endif
>>+
>>+	return ret;
>>+}
>>+
>>+/*
>>+ * In some memory configurations, buddy pages may be found which are
>>+ * outside the zone pages. Check for those here.
>>+ */
>>+static int buddy_outside_zone(struct page *page, struct page *buddy)
>>+{
>>+	if (page_in_zone_hole(buddy))
>>+		return 1;
>>+
>>+	if (buddy_outside_zone_span(page, buddy))
>>+		return 1;
>>+
>>+	return 0;
>>+}
>>+
>>+/*
>>+ * This function checks whether a buddy is free and is the buddy of page.
>>+ * We can coalesce a page and its buddy if
>>+ * (a) the buddy is not "outside" the zone &&
>>  * (b) the buddy is in the buddy system &&
>>  * (c) a page and its buddy have the same order.
>>  *
>>@@ -292,15 +320,13 @@ __find_combined_index(unsigned long page
>>  *
>>  * For recording page's order, we use page_private(page).
>>  */
>>-static inline int page_is_buddy(struct page *page, int order)
>>+static inline int page_is_buddy(struct page *page, struct page *buddy, int order)
>> {
>>-#ifdef CONFIG_HOLES_IN_ZONE
>>-	if (!pfn_valid(page_to_pfn(page)))
>>+	if (buddy_outside_zone(page, buddy))
>> 		return 0;
> 
> 
> This is a heck of a lot of code to be throwing into the page-freeing
> hotpath.  Surely there's a way of moving all this work to
> initialisation/hotadd time?

Can't think of any good way to do it. We could add yet another page
flag, which would relegate unaligned portions of zones to only order-0
pages (and never try to merge them up the buddy allocator).

Of course that's another page flag.

It is possible we can avoid the zone seqlock checks simply by always
testing whether the pfn is valid (this way the test would be more
unified with the holes in zone case).

The tests would still be pretty heavyweight though.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21 10:31     ` Nick Piggin
@ 2006-05-21 10:59       ` Andrew Morton
  2006-05-21 11:44         ` Nick Piggin
  2006-05-21 11:53       ` Nick Piggin
  1 sibling, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2006-05-21 10:59 UTC (permalink / raw)
  To: Nick Piggin; +Cc: apw, mel, stable, linux-mm

Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>
> >>+ * This function checks whether a buddy is free and is the buddy of page.
>  >>+ * We can coalesce a page and its buddy if
>  >>+ * (a) the buddy is not "outside" the zone &&
>  >>  * (b) the buddy is in the buddy system &&
>  >>  * (c) a page and its buddy have the same order.
>  >>  *
>  >>@@ -292,15 +320,13 @@ __find_combined_index(unsigned long page
>  >>  *
>  >>  * For recording page's order, we use page_private(page).
>  >>  */
>  >>-static inline int page_is_buddy(struct page *page, int order)
>  >>+static inline int page_is_buddy(struct page *page, struct page *buddy, int order)
>  >> {
>  >>-#ifdef CONFIG_HOLES_IN_ZONE
>  >>-	if (!pfn_valid(page_to_pfn(page)))
>  >>+	if (buddy_outside_zone(page, buddy))
>  >> 		return 0;
>  > 
>  > 
>  > This is a heck of a lot of code to be throwing into the page-freeing
>  > hotpath.  Surely there's a way of moving all this work to
>  > initialisation/hotadd time?
> 
>  Can't think of any good way to do it. We could add yet another page
>  flag, which would relegate unaligned portions of zones to only order-0
>  pages (and never try to merge them up the buddy allocator).
> 
>  Of course that's another page flag.
> 
>  It is possible we can avoid the zone seqlock checks simply by always
>  testing whether the pfn is valid (this way the test would be more
>  unified with the holes in zone case).
> 
>  The tests would still be pretty heavyweight though.

How about just throwing the pages away?  It sounds like a pretty rare
problem.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21 10:59       ` Andrew Morton
@ 2006-05-21 11:44         ` Nick Piggin
  2006-05-21 11:52           ` Nick Piggin
  2006-05-22  9:06           ` Mel Gorman
  0 siblings, 2 replies; 23+ messages in thread
From: Nick Piggin @ 2006-05-21 11:44 UTC (permalink / raw)
  To: Andrew Morton; +Cc: apw, mel, stable, linux-mm

Andrew Morton wrote:

> How about just throwing the pages away?  It sounds like a pretty rare
> problem.

Well that's what many architectures will end up doing, yes. But on
small or embedded platforms, 4MB - 1 is a whole lot of memory to be
throwing away.

Also, I'm not sure it is something we can be doing in generic code,
because some architectures apparently have very strange zone setups
(eg. zones from several pages interleaved within a single zone's
->spanned_pages). So it doesn't sound like a simple matter of trying
to override the zones' intervals.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21 11:44         ` Nick Piggin
@ 2006-05-21 11:52           ` Nick Piggin
  2006-05-22  9:24             ` Mel Gorman
  2006-05-22  9:06           ` Mel Gorman
  1 sibling, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2006-05-21 11:52 UTC (permalink / raw)
  To: linux-mm; +Cc: Andrew Morton, apw, mel, stable

Nick Piggin wrote:
> Andrew Morton wrote:
> 
>> How about just throwing the pages away?  It sounds like a pretty rare
>> problem.
> 
> 
> Well that's what many architectures will end up doing, yes. But on
> small or embedded platforms, 4MB - 1 is a whole lot of memory to be
> throwing away.
> 
> Also, I'm not sure it is something we can be doing in generic code,
> because some architectures apparently have very strange zone setups
> (eg. zones from several pages interleaved within a single zone's
> ->spanned_pages). So it doesn't sound like a simple matter of trying
> to override the zones' intervals.

Oh I see, yeah I guess you could throw away the pages forming the
present fraction of the MAX_ORDER buddy...

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

----- End forwarded message -----

-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <dont@kvack.org>.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21 11:52           ` Nick Piggin
@ 2006-05-22  9:24             ` Mel Gorman
  2006-05-22  9:28               ` Mel Gorman
  0 siblings, 1 reply; 23+ messages in thread
From: Mel Gorman @ 2006-05-22  9:24 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, apw, stable, linux-mm

On Sun, 21 May 2006, Nick Piggin wrote:

> Nick Piggin wrote:
>> Andrew Morton wrote:
>> 
>>> How about just throwing the pages away?  It sounds like a pretty rare
>>> problem.
>> 
>> 
>> Well that's what many architectures will end up doing, yes. But on
>> small or embedded platforms, 4MB - 1 is a whole lot of memory to be
>> throwing away.
>> 
>> Also, I'm not sure it is something we can be doing in generic code,
>> because some architectures apparently have very strange zone setups
>> (eg. zones from several pages interleaved within a single zone's
>> ->spanned_pages). So it doesn't sound like a simple matter of trying
>> to override the zones' intervals.
>
> Oh I see, yeah I guess you could throw away the pages forming the
> present fraction of the MAX_ORDER buddy...
>

As Andy points out in another thread, the need to check unaligned zones is 
heavily relaxed (if not redundant) once the node_mem_map is aligned by 
patch "[PATCH 1/2] Align the node_mem_map endpoints to a MAX_ORDER 
boundary".

Once the node_mem_map is aligned, we know that we'll be checking a valid 
struct page. If the zones are not aligned, the unused struct pages forming 
the absent fraction of the MAX_ORDER buddy will be marked reserved since 
memmap_init_zone(). This will be caught by free_pages_check() and the 
buddies will not be merged.

I don't think there is any need to do these complex zone boundary checks 
once the node_mem_map is aligned for CONFIG_FLAT_NODE_MEM_MAP and 
SPARSEMEM already gets this right.

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-22  9:24             ` Mel Gorman
@ 2006-05-22  9:28               ` Mel Gorman
  0 siblings, 0 replies; 23+ messages in thread
From: Mel Gorman @ 2006-05-22  9:28 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, apw, stable, linux-mm

On Mon, 22 May 2006, Mel Gorman wrote:

> On Sun, 21 May 2006, Nick Piggin wrote:
>
>> Nick Piggin wrote:
>>> Andrew Morton wrote:
>>> 
>>>> How about just throwing the pages away?  It sounds like a pretty rare
>>>> problem.
>>> 
>>> 
>>> Well that's what many architectures will end up doing, yes. But on
>>> small or embedded platforms, 4MB - 1 is a whole lot of memory to be
>>> throwing away.
>>> 
>>> Also, I'm not sure it is something we can be doing in generic code,
>>> because some architectures apparently have very strange zone setups
>>> (eg. zones from several pages interleaved within a single zone's
>>> ->spanned_pages). So it doesn't sound like a simple matter of trying
>>> to override the zones' intervals.
>> 
>> Oh I see, yeah I guess you could throw away the pages forming the
>> present fraction of the MAX_ORDER buddy...
>> 
>
> As Andy points out in another thread, the need to check unaligned zones is 
> heavily relaxed (if not redundant) once the node_mem_map is aligned by patch 
> "[PATCH 1/2] Align the node_mem_map endpoints to a MAX_ORDER boundary".
>

Sorry, this is wrong. If the zones are not aligned, we still need to check 
that page_zone() matches in page_is_buddy(). I was looking at -mm1 and 
just noticed that mainline did not have the check;

         if (page_zone_id(page) != page_zone_id(buddy))
                 return 0;

> Once the node_mem_map is aligned, we know that we'll be checking a valid 
> struct page. If the zones are not aligned, the unused struct pages forming 
> the absent fraction of the MAX_ORDER buddy will be marked reserved since 
> memmap_init_zone(). This will be caught by free_pages_check() and the buddies 
> will not be merged.
>
> I don't think there is any need to do these complex zone boundary checks once 
> the node_mem_map is aligned for CONFIG_FLAT_NODE_MEM_MAP and SPARSEMEM 
> already gets this right.
>
> -- 
> Mel Gorman
> Part-time Phd Student                          Linux Technology Center
> University of Limerick                         IBM Dublin Software Lab
>

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21 11:44         ` Nick Piggin
  2006-05-21 11:52           ` Nick Piggin
@ 2006-05-22  9:06           ` Mel Gorman
  2006-05-22  9:51             ` Nick Piggin
  1 sibling, 1 reply; 23+ messages in thread
From: Mel Gorman @ 2006-05-22  9:06 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, apw, stable, linux-mm

On Sun, 21 May 2006, Nick Piggin wrote:

> Andrew Morton wrote:
>
>> How about just throwing the pages away?  It sounds like a pretty rare
>> problem.
>
> Well that's what many architectures will end up doing, yes. But on
> small or embedded platforms, 4MB - 1 is a whole lot of memory to be
> throwing away.
>
> Also, I'm not sure it is something we can be doing in generic code,
> because some architectures apparently have very strange zone setups
> (eg. zones from several pages interleaved within a single zone's
> ->spanned_pages).

I looked through a fair few arches code that sizes zones and I couldn't 
find this or odd calls to set_page_links(). What arch interleaves pages 
between zones like this? I am taking you mean that you can have a 
situation where within one contiguous block of pages you have something 
like;

dddNNNdddNNNddd

Where d is a page in ZONE_DMA and N is a page in ZONE_NORMAL.

The oddest I've seen is where nodes interleave like on PPC64. There you 
can have pages for node 0 followed by pages for node 1 followed by node 0 
again. But the zone start and end pfns stay in the same place.

> So it doesn't sound like a simple matter of trying
> to override the zones' intervals.
>
> -- 
> SUSE Labs, Novell Inc.
> Send instant messages to your online friends http://au.messenger.yahoo.com

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-22  9:06           ` Mel Gorman
@ 2006-05-22  9:51             ` Nick Piggin
  0 siblings, 0 replies; 23+ messages in thread
From: Nick Piggin @ 2006-05-22  9:51 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Andrew Morton, apw, stable, linux-mm

Mel Gorman wrote:
> On Sun, 21 May 2006, Nick Piggin wrote:
> 
>> Andrew Morton wrote:
>>
>>> How about just throwing the pages away?  It sounds like a pretty rare
>>> problem.
>>
>>
>> Well that's what many architectures will end up doing, yes. But on
>> small or embedded platforms, 4MB - 1 is a whole lot of memory to be
>> throwing away.
>>
>> Also, I'm not sure it is something we can be doing in generic code,
>> because some architectures apparently have very strange zone setups
>> (eg. zones from several pages interleaved within a single zone's
>> ->spanned_pages).
> 
> 
> I looked through a fair few arches code that sizes zones and I couldn't 
> find this or odd calls to set_page_links(). What arch interleaves pages 
> between zones like this? I am taking you mean that you can have a 
> situation where within one contiguous block of pages you have something 
> like;
> 
> dddNNNdddNNNddd
> 
> Where d is a page in ZONE_DMA and N is a page in ZONE_NORMAL.
> 
> The oddest I've seen is where nodes interleave like on PPC64. There you 
> can have pages for node 0 followed by pages for node 1 followed by node 
> 0 again. But the zone start and end pfns stay in the same place.

Depending on how you look, ZONE_DMA and ZONE_NORMAL aren't always "zones" :)

I'm talking about struct zones, rather than zones-as-in-memory-classes.
So yes, PPC64 is my example. Andy's zone index check should take care
of those.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21 10:31     ` Nick Piggin
  2006-05-21 10:59       ` Andrew Morton
@ 2006-05-21 11:53       ` Nick Piggin
  1 sibling, 0 replies; 23+ messages in thread
From: Nick Piggin @ 2006-05-21 11:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: apw, mel, stable, linux-mm

[-- Attachment #1: Type: text/plain, Size: 243 bytes --]

Nick Piggin wrote:

> It is possible we can avoid the zone seqlock checks simply by always
> testing whether the pfn is valid (this way the test would be more
> unified with the holes in zone case).

New patch 2/2.

-- 
SUSE Labs, Novell Inc.

[-- Attachment #2: mm-unaligned-zones.patch --]
[-- Type: text/plain, Size: 7550 bytes --]

Allow unaligned zones, and make this an opt-in CONFIG_ option because
some architectures appear to be relying on unaligned zones being handled
correctly.

- Also, the bad_range checks are removed, they are checked at meminit time
  since the last patch.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2006-05-21 17:53:36.000000000 +1000
+++ linux-2.6/mm/page_alloc.c	2006-05-21 20:52:55.000000000 +1000
@@ -85,55 +85,6 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
-#ifdef CONFIG_DEBUG_VM
-static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
-{
-	int ret = 0;
-	unsigned seq;
-	unsigned long pfn = page_to_pfn(page);
-
-	do {
-		seq = zone_span_seqbegin(zone);
-		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
-			ret = 1;
-		else if (pfn < zone->zone_start_pfn)
-			ret = 1;
-	} while (zone_span_seqretry(zone, seq));
-
-	return ret;
-}
-
-static int page_is_consistent(struct zone *zone, struct page *page)
-{
-#ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
-		return 0;
-#endif
-	if (zone != page_zone(page))
-		return 0;
-
-	return 1;
-}
-/*
- * Temporary debugging check for pages not lying within a given zone.
- */
-static int bad_range(struct zone *zone, struct page *page)
-{
-	if (page_outside_zone_boundaries(zone, page))
-		return 1;
-	if (!page_is_consistent(zone, page))
-		return 1;
-
-	return 0;
-}
-
-#else
-static inline int bad_range(struct zone *zone, struct page *page)
-{
-	return 0;
-}
-#endif
-
 static void bad_page(struct page *page)
 {
 	printk(KERN_EMERG "Bad page state in process '%s'\n"
@@ -281,9 +232,42 @@ __find_combined_index(unsigned long page
 }
 
 /*
- * This function checks whether a page is free && is the buddy
- * we can do coalesce a page and its buddy if
- * (a) the buddy is not in a hole &&
+ * If the mem_map may have holes (invalid pfns) in it, which are not on
+ * MAX_ORDER<<1 aligned boundaries, CONFIG_HOLES_IN_ZONE must be set by the
+ * architecture, because the buddy allocator will otherwise attempt to access
+ * their underlying struct page when finding a buddy to merge.
+ *
+ * If the the zone's mem_map is not 1<<MAX_ORDER aligned, CONFIG_ALIGNED_ZONE
+ * must *not* be set by the architecture, because the buddy allocator will run
+ * into "buddies" which are outside mem_map. It is not enough for the node's
+ * mem_map to be aligned, because unaligned zone boundaries can cause a buddies
+ * to be in different zones.
+ */
+static inline int buddy_outside_zone(struct page *page, struct page *buddy)
+{
+#if defined(CONFIG_HOLES_IN_ZONE) || !defined(CONFIG_ALIGNED_ZONE)
+	if (!pfn_valid(page_to_pfn(buddy)))
+		return 1;
+#endif
+
+#if !defined(CONFIG_ALIGNED_ZONE)
+	/*
+	 * page_zone_idx accesses page->flags, so this test must go after
+	 * the above, which ensures that buddy is valid (and can have its
+	 * zone_idx tested).
+	 */
+	if (page_zone_idx(page) != page_zone_idx(buddy))
+		return 1;
+
+#endif
+
+	return 0;
+}
+
+/*
+ * This function checks whether a buddy is free and is the buddy of page.
+ * We can coalesce a page and its buddy if
+ * (a) the buddy is not "outside" the zone &&
  * (b) the buddy is in the buddy system &&
  * (c) a page and its buddy have the same order.
  *
@@ -292,15 +276,17 @@ __find_combined_index(unsigned long page
  *
  * For recording page's order, we use page_private(page).
  */
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, struct page *buddy, int order)
 {
-#ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
+	/*
+	 * In some memory configurations, buddy pages may be found
+	 * which are outside the zone. Check for those here.
+	 */
+	if (buddy_outside_zone(page, buddy))
 		return 0;
-#endif
 
-	if (PageBuddy(page) && page_order(page) == order) {
-		BUG_ON(page_count(page) != 0);
+	if (PageBuddy(buddy) && page_order(buddy) == order) {
+		BUG_ON(page_count(buddy) != 0);
 		return 1;
 	}
 	return 0;
@@ -342,7 +328,6 @@ static inline void __free_one_page(struc
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
 	BUG_ON(page_idx & (order_size - 1));
-	BUG_ON(bad_range(zone, page));
 
 	zone->free_pages += order_size;
 	while (order < MAX_ORDER-1) {
@@ -351,7 +336,7 @@ static inline void __free_one_page(struc
 		struct page *buddy;
 
 		buddy = __page_find_buddy(page, page_idx, order);
-		if (!page_is_buddy(buddy, order))
+		if (!page_is_buddy(page, buddy, order))
 			break;		/* Move the buddy up one level. */
 
 		list_del(&buddy->lru);
@@ -506,7 +491,6 @@ static inline void expand(struct zone *z
 		area--;
 		high--;
 		size >>= 1;
-		BUG_ON(bad_range(zone, &page[size]));
 		list_add(&page[size].lru, &area->free_list);
 		area->nr_free++;
 		set_page_order(&page[size], high);
@@ -824,7 +808,6 @@ again:
 	local_irq_restore(flags);
 	put_cpu();
 
-	BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
 		goto again;
 	return page;
@@ -2048,11 +2031,13 @@ static __meminit void zone_debug_checks(
 	unsigned long end = start + zone->spanned_pages;
 	const unsigned long mask = ((1<<MAX_ORDER)-1);
 	
+#ifdef CONFIG_ALIGNED_ZONE
 	if (start & mask)
 		panic("zone start pfn (%lx) not MAX_ORDER aligned\n", start);
 
 	if (end & mask)
 		panic("zone end pfn (%lx) not MAX_ORDER aligned\n", end);
+#endif
 
 	for (pfn = start; pfn < end; pfn++) {
 		struct page *page;
@@ -2068,16 +2053,29 @@ static __meminit void zone_debug_checks(
 			panic("zone page (pfn %lx) in wrong zone\n", pfn);
 
 		for (order = 0; order < MAX_ORDER-1; order++) {
+			unsigned long buddy_pfn;
 			struct page *buddy;
 			buddy = __page_find_buddy(page, pfn & mask, order);
+			buddy_pfn = page_to_pfn(buddy);
 
-#ifndef CONFIG_HOLES_IN_ZONE
-			if (!pfn_valid(page_to_pfn(buddy)))
+#if !defined(CONFIG_HOLES_IN_ZONE) && defined(CONFIG_ALIGNED_ZONE)
+			if (!pfn_valid(buddy_pfn))
 				panic("pfn (%lx) buddy (order %d) not valid\n", pfn, order);
 #endif
 
-			if (page_zone(buddy) != zone)
-				panic("pfn (%lx) buddy (order %d) in wrong zone\n", pfn, order);
+#ifdef CONFIG_ALIGNED_ZONE
+			if (buddy_pfn < start || buddy_pfn >= end)
+				panic("pfn (%lx) buddy (%lx) (order %d) outside zone\n", pfn, buddy_pfn, order);
+
+			if (zone != page_zone(buddy))
+				panic("pfn (%lx) buddy (%lx) (order %d) in different zone\n", pfn, buddy_pfn, order);
+#else
+
+			if (buddy_pfn < start || buddy_pfn >= end) {
+				if (pfn_valid(buddy_pfn) && zone == page_zone(buddy))
+					panic("pfn (%lx) buddy (%lx) (order %d) is outside the zone but page_zone would cause it to be merged\n", pfn, buddy_pfn, order);
+			}
+#endif
 		}
 	}
 }
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-19 12:48:01.000000000 +1000
+++ linux-2.6/include/linux/mm.h	2006-05-21 18:10:23.000000000 +1000
@@ -466,10 +466,14 @@ static inline unsigned long page_zonenum
 struct zone;
 extern struct zone *zone_table[];
 
+static inline unsigned long page_zone_idx(struct page *page)
+{
+	return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
+}
+
 static inline struct zone *page_zone(struct page *page)
 {
-	return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
-			ZONETABLE_MASK];
+	return zone_table[page_zone_idx(page)];
 }
 
 static inline unsigned long page_to_nid(struct page *page)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-21  8:22 ` [patch 2/2] mm: handle unaligned zones Nick Piggin
  2006-05-21  9:19   ` Andrew Morton
@ 2006-05-22  8:18   ` Andy Whitcroft
  2006-05-22  9:37     ` Nick Piggin
  2006-05-22  9:52     ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
  1 sibling, 2 replies; 23+ messages in thread
From: Andy Whitcroft @ 2006-05-22  8:18 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Andrew Morton, Mel Gorman, stable, Linux Memory Management

Nick Piggin wrote:
> 2/2
> 
> 
> ------------------------------------------------------------------------
> 
> Allow unaligned zones, and make this an opt-in CONFIG_ option because
> some architectures appear to be relying on unaligned zones being handled
> correctly.
> 
> - Also, the bad_range checks are removed, they are checked at meminit time
>   since the last patch.
> 
> Signed-off-by: Nick Piggin <npiggin@suse.de>
> 
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c	2006-05-21 17:53:36.000000000 +1000
> +++ linux-2.6/mm/page_alloc.c	2006-05-21 18:20:13.000000000 +1000
> @@ -85,55 +85,6 @@ int min_free_kbytes = 1024;
>  unsigned long __initdata nr_kernel_pages;
>  unsigned long __initdata nr_all_pages;
>  
> -#ifdef CONFIG_DEBUG_VM
> -static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
> -{
> -	int ret = 0;
> -	unsigned seq;
> -	unsigned long pfn = page_to_pfn(page);
> -
> -	do {
> -		seq = zone_span_seqbegin(zone);
> -		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
> -			ret = 1;
> -		else if (pfn < zone->zone_start_pfn)
> -			ret = 1;
> -	} while (zone_span_seqretry(zone, seq));
> -
> -	return ret;
> -}
> -
> -static int page_is_consistent(struct zone *zone, struct page *page)
> -{
> -#ifdef CONFIG_HOLES_IN_ZONE
> -	if (!pfn_valid(page_to_pfn(page)))
> -		return 0;
> -#endif
> -	if (zone != page_zone(page))
> -		return 0;
> -
> -	return 1;
> -}
> -/*
> - * Temporary debugging check for pages not lying within a given zone.
> - */
> -static int bad_range(struct zone *zone, struct page *page)
> -{
> -	if (page_outside_zone_boundaries(zone, page))
> -		return 1;
> -	if (!page_is_consistent(zone, page))
> -		return 1;
> -
> -	return 0;
> -}
> -
> -#else
> -static inline int bad_range(struct zone *zone, struct page *page)
> -{
> -	return 0;
> -}
> -#endif
> -
>  static void bad_page(struct page *page)
>  {
>  	printk(KERN_EMERG "Bad page state in process '%s'\n"
> @@ -281,9 +232,86 @@ __find_combined_index(unsigned long page
>  }
>  
>  /*
> - * This function checks whether a page is free && is the buddy
> - * we can do coalesce a page and its buddy if
> - * (a) the buddy is not in a hole &&
> + * If the mem_map may have holes (invalid pfns) in it, which are not on
> + * MAX_ORDER<<1 aligned boundaries, CONFIG_HOLES_IN_ZONE must be set by the
> + * architecture, because the buddy allocator will otherwise attempt to access
> + * their underlying struct page when finding a buddy to merge.
> + */
> +static inline int page_in_zone_hole(struct page *page)
> +{
> +#ifdef CONFIG_HOLES_IN_ZONE
> +	/*
> +	 *
> +	 */
> +	if (!pfn_valid(page_to_pfn(page)))
> +		return 1;
> +#endif
> +	return 0;
> +}
> +
> +/*
> + * If the the zone's mem_map is not 1<<MAX_ORDER aligned, CONFIG_ALIGNED_ZONE
> + * must *not* be set by the architecture, because the buddy allocator will run
> + * into "buddies" which are outside mem_map.
> + *
> + * It is not enough for the node's mem_map to be aligned, because unaligned
> + * zone boundaries can cause a buddies to be in different zones.
> + */
> +static inline int buddy_outside_zone_span(struct page *page, struct page *buddy)
> +{
> +	int ret = 0;
> +
> +#ifndef CONFIG_ALIGNED_ZONE
> +	unsigned int seq;
> +	unsigned long pfn;
> +	struct zone *zone;
> +
> +	pfn = page_to_pfn(page);
> +	zone = page_zone(page);
> +
> +	do {
> +
> +		seq = zone_span_seqbegin(zone);
> +		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
> +			ret = 1;
> +		else if (pfn < zone->zone_start_pfn)
> +			ret = 1;
> +	} while (zone_span_seqretry(zone, seq));
> +	if (ret)
> +		goto out;
> +
> +	/*
> +	 * page_zone_idx accesses page->flags, so this test must go after
> +	 * the above, which ensures that buddy is within the zone.
> +	 */
> +	if (page_zone_idx(page) != page_zone_idx(buddy))
> +		ret = 1;

Ok.  I agree that that unaligned zones should be opt-in, it always was
that way before and as the code stands now we are only adding in a
couple of shifts, ands, and a comparison to cachelines which will be
needed in the common case in the next few lines.  I'll drop a patch I've
been using in testing to making the option that way round here following
up to this email.

However, this patch here seems redundant.  The requirement from the
buddy allocator has been an aligned node_mem_map out to MAX_ORDER either
side of the zones in that node.  With the recent patch from Bob Picco it
is now allocated that way always.  So we will always have a page* from
either the adjoining zone or from the node_mem_map padding to examine
when we are looking for a buddy to coelesce with.  It should always be
safe to examine that page*'s flags to see if its free to coelesce.  For
pages outside any zone PG_buddy will never be true, for those in another
zone the page_zone_idx() check is sufficient.

With the page_zone_idx check enabled and the node_mem_map aligned, I
cannot see why we would also need to check the zone pfn numbers too?  If
we did need to check them, then there would be no benefit in checking
the page_zone_idx as that check would always succeed.

I think the smallest, lightest weight set of changes for this problem is
the node_mem_map alignement patch from Bob Picco, plus the changes to
add just the page_zone_idx checks to the allocator.  If the stack that
makes this an opt-out option is too large, a two liner to check just
page_zone_idx always would be a good option for stable.

-apw

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/2] mm: handle unaligned zones
  2006-05-22  8:18   ` Andy Whitcroft
@ 2006-05-22  9:37     ` Nick Piggin
  2006-05-22  9:52     ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
  1 sibling, 0 replies; 23+ messages in thread
From: Nick Piggin @ 2006-05-22  9:37 UTC (permalink / raw)
  To: Andy Whitcroft; +Cc: Andrew Morton, Mel Gorman, stable, Linux Memory Management

Andy Whitcroft wrote:

> Ok.  I agree that that unaligned zones should be opt-in, it always was

Yes.

> However, this patch here seems redundant.  The requirement from the
> buddy allocator has been an aligned node_mem_map out to MAX_ORDER either
> side of the zones in that node.  With the recent patch from Bob Picco it
> is now allocated that way always.  So we will always have a page* from
> either the adjoining zone or from the node_mem_map padding to examine
> when we are looking for a buddy to coelesce with.  It should always be
> safe to examine that page*'s flags to see if its free to coelesce.  For
> pages outside any zone PG_buddy will never be true, for those in another
> zone the page_zone_idx() check is sufficient.

That's true - does this cover all architectures that do not define
CONFIG_HOLES_IN_ZONE ?

> With the page_zone_idx check enabled and the node_mem_map aligned, I
> cannot see why we would also need to check the zone pfn numbers too?  If
> we did need to check them, then there would be no benefit in checking
> the page_zone_idx as that check would always succeed.

Yes. BTW. are the struct pages outside the nodes going to be correctly
aligned? Either way, I think we should also check that everything has
been set up in the way we expect at meminit time (see my debug function).

> 
> I think the smallest, lightest weight set of changes for this problem is
> the node_mem_map alignement patch from Bob Picco, plus the changes to
> add just the page_zone_idx checks to the allocator.  If the stack that

Yes, that sounds fine.

> makes this an opt-out option is too large, a two liner to check just
> page_zone_idx always would be a good option for stable.

I think it is more a question of time for all arch maintainers to verify
rather than size.

If you just mean: you want to negate the meaning of the CONFIG_ option,
and go through and define it in all architectures, I'd be fine with that
too (by opt-in I just mean the check should be turned on until proven
otherwise)

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 0/2] Zone boundary alignment fixes, default configuration
  2006-05-22  8:18   ` Andy Whitcroft
  2006-05-22  9:37     ` Nick Piggin
@ 2006-05-22  9:52     ` Andy Whitcroft
  2006-05-22  9:53       ` [PATCH 1/2] zone allow unaligned zone boundaries add configuration Andy Whitcroft
                         ` (3 more replies)
  1 sibling, 4 replies; 23+ messages in thread
From: Andy Whitcroft @ 2006-05-22  9:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, Mel Gorman, stable, Linux Memory Management

I think a concensus is forming that the checks for merging across
zones were removed from the buddy allocator without anyone noticing.
So I propose that the configuration option UNALIGNED_ZONE_BOUNDARIES
default to on, and those architectures which have been auditied
for alignment may turn it off.

Following this email are two patches:

zone-allow-unaligned-zone-boundaries-add-configuration -- adding
  the configuration option.

x86-add-zone-alignment-qualifier -- marking x86 as enforcing alignment.

Cheers.

-apw

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/2] zone allow unaligned zone boundaries add configuration
  2006-05-22  9:52     ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
@ 2006-05-22  9:53       ` Andy Whitcroft
  2006-05-22  9:53       ` [PATCH 2/2] x86 add zone alignment qualifier Andy Whitcroft
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 23+ messages in thread
From: Andy Whitcroft @ 2006-05-22  9:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, Mel Gorman, stable, Linux Memory Management

zone allow unaligned zone boundaries add configuration

Add a configuration definition for UNALIGNED_ZONE_BOUNDARIES.  Default
to on unless the architecture indicates that it ensures that the boundaries
are correctly aligned.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
---
 Kconfig |   13 +++++++++++++
 1 files changed, 13 insertions(+)
diff -upN reference/mm/Kconfig current/mm/Kconfig
--- reference/mm/Kconfig
+++ current/mm/Kconfig
@@ -145,3 +145,16 @@ config MIGRATION
 	  while the virtual addresses are not changed. This is useful for
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
+
+#
+# Support for buddy zone boundaries within a MAX_ORDER sized area.
+#
+config UNALIGNED_ZONE_BOUNDARIES
+	bool "Unaligned zone boundaries"
+	default n if ARCH_ALIGNED_ZONE_BOUNDARIES
+	default y
+	help
+	  Adds checks to the buddy allocator to ensure we do not
+	  coalesce buddies across zone boundaries.  The default
+	  should be correct for your architecture.  Enable this if
+	  you are having trouble and you are requested to in dmesg.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 2/2] x86 add zone alignment qualifier
  2006-05-22  9:52     ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
  2006-05-22  9:53       ` [PATCH 1/2] zone allow unaligned zone boundaries add configuration Andy Whitcroft
@ 2006-05-22  9:53       ` Andy Whitcroft
  2006-05-25 11:19       ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
  2006-05-31  0:13       ` [stable] " Chris Wright
  3 siblings, 0 replies; 23+ messages in thread
From: Andy Whitcroft @ 2006-05-22  9:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nick Piggin, Mel Gorman, stable, Linux Memory Management

x86 add zone alignment qualifier

x86 takes steps to ensure all of its zones are aligned.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
---
 Kconfig |    3 +++
 1 files changed, 3 insertions(+)
diff -upN reference/arch/i386/Kconfig current/arch/i386/Kconfig
--- reference/arch/i386/Kconfig
+++ current/arch/i386/Kconfig
@@ -577,6 +577,9 @@ config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 	depends on ARCH_SPARSEMEM_ENABLE
 
+config ARCH_ALIGNED_ZONE_BOUNDARIES
+	def_bool y
+
 source "mm/Kconfig"
 
 config HAVE_ARCH_EARLY_PFN_TO_NID

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/2] Zone boundary alignment fixes, default configuration
  2006-05-22  9:52     ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
  2006-05-22  9:53       ` [PATCH 1/2] zone allow unaligned zone boundaries add configuration Andy Whitcroft
  2006-05-22  9:53       ` [PATCH 2/2] x86 add zone alignment qualifier Andy Whitcroft
@ 2006-05-25 11:19       ` Andy Whitcroft
  2006-05-31  0:13       ` [stable] " Chris Wright
  3 siblings, 0 replies; 23+ messages in thread
From: Andy Whitcroft @ 2006-05-25 11:19 UTC (permalink / raw)
  To: Andy Whitcroft
  Cc: Andrew Morton, Nick Piggin, Mel Gorman, stable, Linux Memory Management

[Hmmmm, just received this back from our mailers, thanks a lot!
I thought it was odd to hear total silence.  Anyhow, heres trying
that again.]

There has been much confusion over what is and what is not needed to
ensure we do not merge buddies across zone boundaries.  So I thought
I would try and put down my view of the world and how I think the
fixes out there work together.  I feel that having this all in one
place will help clarify the problem and the proposed solution.

First the assumptions that the buddy allocator is making:

1) that the buddy for any page it is offered can simply be calculated
   from the pfn of that page, and

2) that the page* for the buddy for any page it is offered can
   be examined to see if it is free without referencing the node
   boundaries.

The practical up shot of that is we require the the:

1) mem_map is contigious for any MAX_ORDER span of pages, and

2) mem_map is valid out to MAX_ORDER from any page within a zone.

Let examine a worse case example where we have both nodes which touch
in the middle of a MAX_ORDER range, and have zones with boundaries
the same.  In our hypothetical machine we have MAX_ORDER at 2 so we
have 4 pages in each MAX_ORDER range and we will run two nodes one
from pfn 2->9 and a second from 10->17.  We will have two zones,
the first 4 pages of each node are in a separate zone.  I include
the NODEZONE calculations (the index into the zonetable used to
locate the zone structure from a page) for both FLATMEM/DISCONTIGMEM
and for SPARSEMEM.  I will consider the case where we allocate
independant node_mem_map arrays for each node (maps 1 and 2) and
the ia64 single contigious case (map 3).

      PFN   0     2        5  6        9 10       13 14       17    19
          |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
MAX_ORDER |-----------|-----------|-----------|-----------|-----------|
     NODE |     |-----------0-----------|-----------1-----------|     |
     ZONE |     |-----A-----|-----B-----|-----C-----|-----D-----|     |
          |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
 FLAT/DIS |00|00|00|00|00|00|01|01|01|01|10|10|10|10|11|11|11|11|00|00|
   SPARSE |00|00|00|00|10|10|11|11|21|21|20|20|30|30|31|31|41|41|00|00|
MEM_MAP 1 |ZZZZZ-------------------------ZZZZZ|
MEM_MAP 2                         |ZZZZZ-------------------------ZZZZZ|
MEM_MAP 3 |ZZZZZ-------------------------------------------------ZZZZZ|

Here I am assuming that we have UNALIGNED_ZONE_BOUNDARIES enabled
and therefore have the page_zone_idx(page) != page_zone_idx(buddy)
check.  So lets consider the worst cases freeing a page which wants
to coelesce a buddy which:

1) is below the start of node 0 (page 2),
2) is in another zone (pages 4 and 6),
3) is in another node (pages 8 and 10), and
4) is above the end of node 1 (page 16).

Scenario 1: buddy is below the start of node -- here we are freeing
page 2 we will need to examine buddy page 0 to see if it is free and
whether it is in the same zone.  As the mem_map is zero filled the
page will never be free, PG_buddy is not set.  We cannot coelesce.

Scenario 2: buddy is in another zone -- here we are freeing page
4 (or 6) we will need to examine buddy page 6 (or 4).  If we
assume worst case and the page is free, PG_buddy is set, we then
compare the NODEZONE's.  Here they differ (00 != 01 or 10 != 11).
We cannot coelesce.

Scenario 3a (multiple mem_map case): buddy is in another node -- here
are freeing page 8 (or 10) we will need to examine buddy page 10
(or 8).  As the mem_map is zero filled the page will never appear
free in the node local mem_map.  We cannot coelesce.

Scenario 3b (single mem_map case): buddy is in another node -- here
are freeing page 8 (or 10) we will need to examine buddy page 10
(or 8).  If we assume worst case and the page is free, PG_buddy is
set, we then compare NODEZONES's.  Here they differ (01 != 10 or
21 != 20).  We cannot coelesce.

Scenario 4: buddy is above the end of node -- here we are freeing
page 16 we will need to examine buddy page 18.  As the mem_map
is zero filled the page will never be free, PG_buddy is not set.
We cannot coelesce.

It is important to note that for this to work correctly in the case
where we have missaligned nodes in the single mem_map case we must
compare the page_zone_idx over the page_zone_id to ensure we detect
the node transition in the case where we only have a single zone
in each node.

I hope this clarifies things, please yell if you can see a hole
in this.

-apw

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [stable] [PATCH 0/2] Zone boundary alignment fixes, default configuration
  2006-05-22  9:52     ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
                         ` (2 preceding siblings ...)
  2006-05-25 11:19       ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
@ 2006-05-31  0:13       ` Chris Wright
  2006-05-31 11:41         ` Nick Piggin
  2006-05-31 17:16         ` Andy Whitcroft
  3 siblings, 2 replies; 23+ messages in thread
From: Chris Wright @ 2006-05-31  0:13 UTC (permalink / raw)
  To: Andy Whitcroft
  Cc: Andrew Morton, Mel Gorman, Nick Piggin, stable, Linux Memory Management

* Andy Whitcroft (apw@shadowen.org) wrote:
> I think a concensus is forming that the checks for merging across
> zones were removed from the buddy allocator without anyone noticing.
> So I propose that the configuration option UNALIGNED_ZONE_BOUNDARIES
> default to on, and those architectures which have been auditied
> for alignment may turn it off.

So what's the final outcome here for -stable?  The only
relevant patch upstream appears to be Bob Picco's patch
<http://kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=e984bb43f7450312ba66fe0e67a99efa6be3b246>

thanks,
-chris

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [stable] [PATCH 0/2] Zone boundary alignment fixes, default configuration
  2006-05-31  0:13       ` [stable] " Chris Wright
@ 2006-05-31 11:41         ` Nick Piggin
  2006-05-31 12:08           ` Andy Whitcroft
  2006-05-31 17:16         ` Andy Whitcroft
  1 sibling, 1 reply; 23+ messages in thread
From: Nick Piggin @ 2006-05-31 11:41 UTC (permalink / raw)
  To: Chris Wright
  Cc: Andy Whitcroft, Andrew Morton, Mel Gorman, stable,
	Linux Memory Management

Chris Wright wrote:
> * Andy Whitcroft (apw@shadowen.org) wrote:
> 
>>I think a concensus is forming that the checks for merging across
>>zones were removed from the buddy allocator without anyone noticing.
>>So I propose that the configuration option UNALIGNED_ZONE_BOUNDARIES
>>default to on, and those architectures which have been auditied
>>for alignment may turn it off.
> 
> 
> So what's the final outcome here for -stable?  The only
> relevant patch upstream appears to be Bob Picco's patch

I think you need zone checks? [ ie. page_zone(page) == page_zone(buddy) ]
I had assumed Andy was going to do a patch for that.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [stable] [PATCH 0/2] Zone boundary alignment fixes, default configuration
  2006-05-31 11:41         ` Nick Piggin
@ 2006-05-31 12:08           ` Andy Whitcroft
  2006-05-31 17:42             ` Greg KH
  0 siblings, 1 reply; 23+ messages in thread
From: Andy Whitcroft @ 2006-05-31 12:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Chris Wright, Andrew Morton, Mel Gorman, stable, Linux Memory Management

Nick Piggin wrote:
> Chris Wright wrote:
> 
>> * Andy Whitcroft (apw@shadowen.org) wrote:
>>
>>> I think a concensus is forming that the checks for merging across
>>> zones were removed from the buddy allocator without anyone noticing.
>>> So I propose that the configuration option UNALIGNED_ZONE_BOUNDARIES
>>> default to on, and those architectures which have been auditied
>>> for alignment may turn it off.
>>
>>
>>
>> So what's the final outcome here for -stable?  The only
>> relevant patch upstream appears to be Bob Picco's patch
> 
> 
> I think you need zone checks? [ ie. page_zone(page) == page_zone(buddy) ]
> I had assumed Andy was going to do a patch for that.

The stack for the full optional check in -mm seems like a lot for a
stable patch.  I think for stable we should just add the check for
unconditionally, its very light weight and safe that way.  Am just
putting together a patch for that now.  Will respond to this email
shortly with that patch once its been through a few tests.

-apw

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [stable] [PATCH 0/2] Zone boundary alignment fixes, default configuration
  2006-05-31 12:08           ` Andy Whitcroft
@ 2006-05-31 17:42             ` Greg KH
  0 siblings, 0 replies; 23+ messages in thread
From: Greg KH @ 2006-05-31 17:42 UTC (permalink / raw)
  To: Andy Whitcroft
  Cc: Nick Piggin, Chris Wright, Mel Gorman, stable, Linux Memory Management

On Wed, May 31, 2006 at 01:08:05PM +0100, Andy Whitcroft wrote:
> Nick Piggin wrote:
> > Chris Wright wrote:
> > 
> >> * Andy Whitcroft (apw@shadowen.org) wrote:
> >>
> >>> I think a concensus is forming that the checks for merging across
> >>> zones were removed from the buddy allocator without anyone noticing.
> >>> So I propose that the configuration option UNALIGNED_ZONE_BOUNDARIES
> >>> default to on, and those architectures which have been auditied
> >>> for alignment may turn it off.
> >>
> >>
> >>
> >> So what's the final outcome here for -stable?  The only
> >> relevant patch upstream appears to be Bob Picco's patch
> > 
> > 
> > I think you need zone checks? [ ie. page_zone(page) == page_zone(buddy) ]
> > I had assumed Andy was going to do a patch for that.
> 
> The stack for the full optional check in -mm seems like a lot for a
> stable patch.  I think for stable we should just add the check for
> unconditionally, its very light weight and safe that way.  Am just
> putting together a patch for that now.  Will respond to this email
> shortly with that patch once its been through a few tests.

But one of the -stable rules is that it fixes a real problem that people
are having, not just a theoretical one.  Does this classify as such?

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [stable] [PATCH 0/2] Zone boundary alignment fixes, default configuration
  2006-05-31  0:13       ` [stable] " Chris Wright
  2006-05-31 11:41         ` Nick Piggin
@ 2006-05-31 17:16         ` Andy Whitcroft
  1 sibling, 0 replies; 23+ messages in thread
From: Andy Whitcroft @ 2006-05-31 17:16 UTC (permalink / raw)
  To: Chris Wright
  Cc: Andrew Morton, Mel Gorman, Nick Piggin, stable, Linux Memory Management

[-- Attachment #1: Type: text/plain, Size: 1326 bytes --]

Chris Wright wrote:
> * Andy Whitcroft (apw@shadowen.org) wrote:
> 
>>I think a concensus is forming that the checks for merging across
>>zones were removed from the buddy allocator without anyone noticing.
>>So I propose that the configuration option UNALIGNED_ZONE_BOUNDARIES
>>default to on, and those architectures which have been auditied
>>for alignment may turn it off.
> 
> 
> So what's the final outcome here for -stable?  The only
> relevant patch upstream appears to be Bob Picco's patch
> <http://kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=e984bb43f7450312ba66fe0e67a99efa6be3b246>

I am not sure we necessarily need to make any changes for stable.  The
lack of alignment checks has been in the mainline tree for a number of
months.  I believe that i386 in the simple cases should be aligned
correctly and that covers the majority of users.

If we are going to make any changes then I'd say we want two patches.
The node_mem_map alignment patch from Bob Picco (as cited above) and the
attached patch.  This is a simplification of the patches currently in
-mm, it should be functionally equivalent to the changes in -mm without
the exclusions and configuration options.

I've just run a regression suite over this one on the machines I have
here without any problems.

Comments?

-apw

[-- Attachment #2: zone-allow-unaligned-zone-boundaries-for-2616-stable --]
[-- Type: text/plain, Size: 3547 bytes --]

From: Andy Whitcroft <apw@shadowen.org>

[Minimal fix for unaligned zone boundaries for stable.]

The buddy allocator has a requirement that boundaries between
contigious zones occur aligned with the the MAX_ORDER ranges.  Where
they do not we will incorrectly merge pages cross zone boundaries.
This can lead to pages from the wrong zone being handed out.

Originally the buddy allocator would check that buddies were in the
same zone by referencing the zone start and end page frame numbers.
This was removed as it became very expensive and the buddy allocator
already made the assumption that zones boundaries were aligned.

It is clear that not all configurations and architectures are
honouring this alignment requirement.  Therefore it seems safest
to reintroduce support for non-aligned zone boundaries.  

This patch introduces a new check when considering a page a buddy
it compares the zone_table index for the two pages and refuses to
merge the pages where they do not match.  The zone_table index is
unique for each node/zone combination when FLATMEM/DISCONTIGMEM
is enabled and for each section/zone combination when SPARSEMEM is
enabled (a SPARSEMEM section is at least a MAX_ORDER size).

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
---
 include/linux/mm.h |    7 +++++--
 mm/page_alloc.c    |   17 +++++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)
diff -upN reference/include/linux/mm.h current/include/linux/mm.h
--- reference/include/linux/mm.h
+++ current/include/linux/mm.h
@@ -464,10 +464,13 @@ static inline unsigned long page_zonenum
 struct zone;
 extern struct zone *zone_table[];
 
+static inline int page_zone_id(struct page *page)
+{
+	return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
+}
 static inline struct zone *page_zone(struct page *page)
 {
-	return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
-			ZONETABLE_MASK];
+	return zone_table[page_zone_id(page)];
 }
 
 static inline unsigned long page_to_nid(struct page *page)
diff -upN reference/mm/page_alloc.c current/mm/page_alloc.c
--- reference/mm/page_alloc.c
+++ current/mm/page_alloc.c
@@ -270,22 +270,27 @@ __find_combined_index(unsigned long page
  * we can do coalesce a page and its buddy if
  * (a) the buddy is not in a hole &&
  * (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
  *
  * For recording whether a page is in the buddy system, we use PG_buddy.
  * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
-static inline int page_is_buddy(struct page *page, int order)
+static inline int page_is_buddy(struct page *page, struct page *buddy,
+								int order)
 {
 #ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
+	if (!pfn_valid(page_to_pfn(buddy)))
 		return 0;
 #endif
 
-	if (PageBuddy(page) && page_order(page) == order) {
-		BUG_ON(page_count(page) != 0);
+	if (page_zone_id(page) != page_zone_id(buddy))
+		return 0;
+
+	if (PageBuddy(buddy) && page_order(buddy) == order) {
+		BUG_ON(page_count(buddy) != 0);
                return 1;
 	}
        return 0;
@@ -336,7 +341,7 @@ static inline void __free_one_page(struc
 		struct page *buddy;
 
 		buddy = __page_find_buddy(page, page_idx, order);
-		if (!page_is_buddy(buddy, order))
+		if (!page_is_buddy(page, buddy, order))
 			break;		/* Move the buddy up one level. */
 
 		list_del(&buddy->lru);

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2006-05-31 17:42 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-05-21  8:22 [patch 1/2] mm: detect bad zones Nick Piggin
2006-05-21  8:22 ` [patch 2/2] mm: handle unaligned zones Nick Piggin
2006-05-21  9:19   ` Andrew Morton
2006-05-21 10:31     ` Nick Piggin
2006-05-21 10:59       ` Andrew Morton
2006-05-21 11:44         ` Nick Piggin
2006-05-21 11:52           ` Nick Piggin
2006-05-22  9:24             ` Mel Gorman
2006-05-22  9:28               ` Mel Gorman
2006-05-22  9:06           ` Mel Gorman
2006-05-22  9:51             ` Nick Piggin
2006-05-21 11:53       ` Nick Piggin
2006-05-22  8:18   ` Andy Whitcroft
2006-05-22  9:37     ` Nick Piggin
2006-05-22  9:52     ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
2006-05-22  9:53       ` [PATCH 1/2] zone allow unaligned zone boundaries add configuration Andy Whitcroft
2006-05-22  9:53       ` [PATCH 2/2] x86 add zone alignment qualifier Andy Whitcroft
2006-05-25 11:19       ` [PATCH 0/2] Zone boundary alignment fixes, default configuration Andy Whitcroft
2006-05-31  0:13       ` [stable] " Chris Wright
2006-05-31 11:41         ` Nick Piggin
2006-05-31 12:08           ` Andy Whitcroft
2006-05-31 17:42             ` Greg KH
2006-05-31 17:16         ` Andy Whitcroft

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox