* Page allocator: Single Zone optimizations
@ 2006-10-17 0:50 Christoph Lameter
2006-10-17 1:10 ` Andrew Morton
2006-10-17 1:27 ` KAMEZAWA Hiroyuki
0 siblings, 2 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-10-17 0:50 UTC (permalink / raw)
To: akpm; +Cc: linux-mm
The current code in 2.6.19-rc1-mm1 already allows the configuration of a
system with a single zone. We observed significant performance gains which
were likely due to the reduced cache footprint (removal of the zone_table
also contributed).
This patch continues that line of work making the zone protection logic
optional, throwing out more VM overhead that is not needed in the single
zone case (which hopefully in the far future most of us will be able to
use).
Also several macros can become constant if we know that only
a single zone exists (ZONES_SHIFT == 0) which will remove more code
from the VM and avoid runtime branching.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.19-rc1-mm1/mm/vmstat.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/mm/vmstat.c 2006-10-16 03:42:57.322493498 -0500
+++ linux-2.6.19-rc1-mm1/mm/vmstat.c 2006-10-16 19:08:27.244098681 -0500
@@ -554,7 +554,7 @@ static int zoneinfo_show(struct seq_file
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
zone_page_state(zone, i));
-
+#if ZONES_SHIFT > 0
seq_printf(m,
"\n protection: (%lu",
zone->lowmem_reserve[0]);
@@ -563,6 +563,7 @@ static int zoneinfo_show(struct seq_file
seq_printf(m,
")"
"\n pagesets");
+#endif
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
int j;
Index: linux-2.6.19-rc1-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/mm/page_alloc.c 2006-10-16 03:43:05.976552770 -0500
+++ linux-2.6.19-rc1-mm1/mm/page_alloc.c 2006-10-16 19:32:56.838407647 -0500
@@ -59,6 +59,7 @@ int percpu_pagelist_fraction;
static void __free_pages_ok(struct page *page, unsigned int order);
+#if ZONES_SHIFT > 0
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -81,6 +82,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z
32
#endif
};
+#endif
EXPORT_SYMBOL(totalram_pages);
@@ -922,8 +924,11 @@ int zone_watermark_ok(struct zone *z, in
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
-
+#if ZONES_SHIFT == 0
+ if (free_pages <= min)
+#else
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+#endif
return 0;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1429,8 +1434,6 @@ void show_free_areas(void)
global_page_state(NR_PAGETABLE));
for_each_zone(zone) {
- int i;
-
if (!populated_zone(zone))
continue;
@@ -1457,10 +1460,15 @@ void show_free_areas(void)
zone->pages_scanned,
(zone->all_unreclaimable ? "yes" : "no")
);
- printk("lowmem_reserve[]:");
- for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %lu", zone->lowmem_reserve[i]);
- printk("\n");
+#if ZONES_SHIFT > 0
+ {
+ int i;
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(" %lu", zone->lowmem_reserve[i]);
+ printk("\n");
+ }
+#endif
}
for_each_zone(zone) {
@@ -2829,25 +2837,38 @@ void __init page_alloc_init(void)
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
* or min_free_kbytes changes.
*/
+static unsigned long calculate_max_lowmem_reserve(struct zone *zone,
+ enum zone_type start)
+{
+#if ZONES_SHIFT > 0
+ unsigned long max = 0;
+ enum zone_type i;
+
+ /* Find valid and maximum lowmem_reserve in the zone */
+ for (i = start; i < MAX_NR_ZONES; i++) {
+ if (zone->lowmem_reserve[i] > max)
+ max = zone->lowmem_reserve[i];
+ }
+ return max;
+#else
+ return 0;
+#endif
+}
+
static void calculate_totalreserve_pages(void)
{
struct pglist_data *pgdat;
unsigned long reserve_pages = 0;
- enum zone_type i, j;
+ enum zone_type i;
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- unsigned long max = 0;
-
- /* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ unsigned long max;
/* we treat pages_high as reserved pages. */
- max += zone->pages_high;
+ max = calculate_max_lowmem_reserve(zone, i) + \
+ zone->pages_high;
if (max > zone->present_pages)
max = zone->present_pages;
@@ -2865,6 +2886,7 @@ static void calculate_totalreserve_pages
*/
static void setup_per_zone_lowmem_reserve(void)
{
+#if ZONES_SHIFT > 0
struct pglist_data *pgdat;
enum zone_type j, idx;
@@ -2894,6 +2916,7 @@ static void setup_per_zone_lowmem_reserv
/* update totalreserve_pages */
calculate_totalreserve_pages();
+#endif
}
/**
@@ -3044,6 +3067,7 @@ int sysctl_min_slab_ratio_sysctl_handler
}
#endif
+#if ZONES_SHIFT > 0
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
@@ -3060,6 +3084,7 @@ int lowmem_reserve_ratio_sysctl_handler(
setup_per_zone_lowmem_reserve();
return 0;
}
+#endif
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
Index: linux-2.6.19-rc1-mm1/kernel/sysctl.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/kernel/sysctl.c 2006-10-16 03:42:57.340073124 -0500
+++ linux-2.6.19-rc1-mm1/kernel/sysctl.c 2006-10-16 19:08:27.368132684 -0500
@@ -900,6 +900,7 @@ static ctl_table vm_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#if ZONES_SHIFT > 0
{
.ctl_name = VM_LOWMEM_RESERVE_RATIO,
.procname = "lowmem_reserve_ratio",
@@ -909,6 +910,7 @@ static ctl_table vm_table[] = {
.proc_handler = &lowmem_reserve_ratio_sysctl_handler,
.strategy = &sysctl_intvec,
},
+#endif
{
.ctl_name = VM_DROP_PAGECACHE,
.procname = "drop_caches",
Index: linux-2.6.19-rc1-mm1/include/linux/mmzone.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/mmzone.h 2006-10-16 03:43:05.966786311 -0500
+++ linux-2.6.19-rc1-mm1/include/linux/mmzone.h 2006-10-16 19:24:44.378702936 -0500
@@ -175,6 +175,7 @@ struct zone {
/* Fields commonly accessed by the page allocator */
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
+#if ZONES_SHIFT > 0
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
@@ -184,6 +185,7 @@ struct zone {
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
unsigned long lowmem_reserve[MAX_NR_ZONES];
+#endif
#ifdef CONFIG_NUMA
int node;
@@ -420,11 +422,19 @@ unsigned long __init node_memmap_size_by
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
+#if ZONES_SHIFT > 0
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
+#else
+#define zone_idx(zone) ZONE_NORMAL
+#endif
static inline int populated_zone(struct zone *zone)
{
+#if ZONES_SHIFT > 0
return (!!zone->present_pages);
+#else
+ return 1;
+#endif
}
static inline int is_highmem_idx(enum zone_type idx)
@@ -438,7 +448,11 @@ static inline int is_highmem_idx(enum zo
static inline int is_normal_idx(enum zone_type idx)
{
+#if ZONES_SHIFT > 0
return (idx == ZONE_NORMAL);
+#else
+ return 1;
+#endif
}
/**
@@ -458,7 +472,11 @@ static inline int is_highmem(struct zone
static inline int is_normal(struct zone *zone)
{
+#if ZONES_SHIFT > 0
return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
+#else
+ return 1;
+#endif
}
static inline int is_dma32(struct zone *zone)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-10-17 0:50 Page allocator: Single Zone optimizations Christoph Lameter
@ 2006-10-17 1:10 ` Andrew Morton
2006-10-17 1:13 ` Christoph Lameter
2006-10-17 1:27 ` KAMEZAWA Hiroyuki
1 sibling, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-10-17 1:10 UTC (permalink / raw)
To: Christoph Lameter; +Cc: linux-mm
On Mon, 16 Oct 2006 17:50:26 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> The current code in 2.6.19-rc1-mm1 already allows the configuration of a
> system with a single zone. We observed significant performance gains which
> were likely due to the reduced cache footprint (removal of the zone_table
> also contributed).
>
> This patch continues that line of work making the zone protection logic
> optional throwing out moreVM overhead that is not needed in the single
> zone case (which hopefully in the far future most of us will be able to
> use).
>
> Also several macros can become constant if we know that only
> a single zone exists (ZONES_SHIFT == 0) which will remove more code
> from the VM and avoid runtime branching.
akpm:/home/akpm> grep '^+#if' x
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT == 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
+#if ZONES_SHIFT > 0
Now that just goes too far.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-17 0:50 Page allocator: Single Zone optimizations Christoph Lameter
2006-10-17 1:10 ` Andrew Morton
@ 2006-10-17 1:27 ` KAMEZAWA Hiroyuki
2006-10-17 1:25 ` Christoph Lameter
1 sibling, 1 reply; 83+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-10-17 1:27 UTC (permalink / raw)
To: Christoph Lameter; +Cc: akpm, linux-mm
On Mon, 16 Oct 2006 17:50:26 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> +#if ZONES_SHIFT > 0
> seq_printf(m,
> "\n protection: (%lu",
> zone->lowmem_reserve[0]);
> @@ -563,6 +563,7 @@ static int zoneinfo_show(struct seq_file
> seq_printf(m,
> ")"
> "\n pagesets");
> +#endif
How about defining following instead of inserting #ifdefs ?
#if ZONES_SHIFT > 0
#define zone_lowmem_reserve(z, i) ((z)->lowmem_reserve[(i)])
#else
#define zone_lowmem_reserve(z, i) (0)
#endif
and removing #if's from *.c files ? Can't this be help ?
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-17 1:27 ` KAMEZAWA Hiroyuki
@ 2006-10-17 1:25 ` Christoph Lameter
2006-10-17 6:04 ` Nick Piggin
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-10-17 1:25 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: akpm, linux-mm
On Tue, 17 Oct 2006, KAMEZAWA Hiroyuki wrote:
> How about defining following instead of inserting #ifdefs ?
>
> #ifdef ZONES_SHIFT > 0
> #define zone_lowmem_reserve(z, i) ((z)->lowmem_reserve[(i)])
> #else
> #define zone_lowmem_reserve(z, i) (0)
> #endif
>
> and removing #if's from *.c files ? Can't this be help ?
Well it only shifts the #ifdef elsewhere....
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-17 1:25 ` Christoph Lameter
@ 2006-10-17 6:04 ` Nick Piggin
2006-10-17 17:54 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Nick Piggin @ 2006-10-17 6:04 UTC (permalink / raw)
To: Christoph Lameter; +Cc: KAMEZAWA Hiroyuki, akpm, linux-mm
Christoph Lameter wrote:
>On Tue, 17 Oct 2006, KAMEZAWA Hiroyuki wrote:
>
>
>>How about defining following instead of inserting #ifdefs ?
>>
>>#ifdef ZONES_SHIFT > 0
>>#define zone_lowmem_reserve(z, i) ((z)->lowmem_reserve[(i)])
>>#else
>>#define zone_lowmem_reserve(z, i) (0)
>>#endif
>>
>>and removing #if's from *.c files ? Can't this be help ?
>>
>
>Well it only shifts the #ifdef elsewhere....
>
Shifting this out of the caller like this tends to be the accepted
way of doing it. It does tend to be more readable.
I would give an ack to Kame's approach for lowmem_reserve ;)
--
Send instant messages to your online friends http://au.messenger.yahoo.com
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-17 6:04 ` Nick Piggin
@ 2006-10-17 17:54 ` Christoph Lameter
2006-10-18 11:15 ` Nick Piggin
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-10-17 17:54 UTC (permalink / raw)
To: Nick Piggin; +Cc: KAMEZAWA Hiroyuki, akpm, linux-mm
On Tue, 17 Oct 2006, Nick Piggin wrote:
> I would give an ack to Kame's approach for lowmem_reserve ;)
Hmmm... One could define a constant in mmzone.h to get rid of lots of
these ifdefs:
Single Zone Optimizations
If we only have a single zone then various macros can be optimized.
We do not need to protect higher zones, we know that zones are
always present, can remove useless data from /proc etc etc.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.19-rc1-mm1/mm/vmstat.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/mm/vmstat.c 2006-10-17 07:27:45.419872964 -0500
+++ linux-2.6.19-rc1-mm1/mm/vmstat.c 2006-10-17 07:38:00.880502313 -0500
@@ -554,15 +554,16 @@ static int zoneinfo_show(struct seq_file
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
zone_page_state(zone, i));
-
- seq_printf(m,
- "\n protection: (%lu",
- zone->lowmem_reserve[0]);
- for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
- seq_printf(m,
- ")"
- "\n pagesets");
+ if (MULTI_ZONE) {
+ seq_printf(m,
+ "\n protection: (%lu",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+ seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");
+ }
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
int j;
Index: linux-2.6.19-rc1-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/mm/page_alloc.c 2006-10-17 07:27:45.191337093 -0500
+++ linux-2.6.19-rc1-mm1/mm/page_alloc.c 2006-10-17 07:36:49.124176302 -0500
@@ -59,6 +59,7 @@ int percpu_pagelist_fraction;
static void __free_pages_ok(struct page *page, unsigned int order);
+#if MULTI_ZONE
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -81,6 +82,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z
32
#endif
};
+#endif
EXPORT_SYMBOL(totalram_pages);
@@ -922,8 +924,7 @@ int zone_watermark_ok(struct zone *z, in
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
-
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+ if (free_pages <= min + MULTI_ZONE * z->lowmem_reserve[classzone_idx])
return 0;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1429,8 +1430,6 @@ void show_free_areas(void)
global_page_state(NR_PAGETABLE));
for_each_zone(zone) {
- int i;
-
if (!populated_zone(zone))
continue;
@@ -1457,10 +1456,14 @@ void show_free_areas(void)
zone->pages_scanned,
(zone->all_unreclaimable ? "yes" : "no")
);
- printk("lowmem_reserve[]:");
- for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %lu", zone->lowmem_reserve[i]);
- printk("\n");
+ if (MULTI_ZONE) {
+ int i;
+
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(" %lu", zone->lowmem_reserve[i]);
+ printk("\n");
+ }
}
for_each_zone(zone) {
@@ -2829,25 +2832,36 @@ void __init page_alloc_init(void)
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
* or min_free_kbytes changes.
*/
+static unsigned long calculate_max_lowmem_reserve(struct zone *zone,
+ enum zone_type start)
+{
+ unsigned long max = 0;
+ enum zone_type i;
+
+ if (SINGLE_ZONE)
+ return 0;
+ /* Find valid and maximum lowmem_reserve in the zone */
+ for (i = start; i < MAX_NR_ZONES; i++) {
+ if (zone->lowmem_reserve[i] > max)
+ max = zone->lowmem_reserve[i];
+ }
+ return max;
+}
+
static void calculate_totalreserve_pages(void)
{
struct pglist_data *pgdat;
unsigned long reserve_pages = 0;
- enum zone_type i, j;
+ enum zone_type i;
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- unsigned long max = 0;
-
- /* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ unsigned long max;
/* we treat pages_high as reserved pages. */
- max += zone->pages_high;
+ max = calculate_max_lowmem_reserve(zone, i) + \
+ zone->pages_high;
if (max > zone->present_pages)
max = zone->present_pages;
@@ -2868,6 +2882,9 @@ static void setup_per_zone_lowmem_reserv
struct pglist_data *pgdat;
enum zone_type j, idx;
+ if (SINGLE_ZONE)
+ return;
+
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
@@ -3044,6 +3061,7 @@ int sysctl_min_slab_ratio_sysctl_handler
}
#endif
+#if MULTI_ZONE
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
@@ -3060,6 +3078,7 @@ int lowmem_reserve_ratio_sysctl_handler(
setup_per_zone_lowmem_reserve();
return 0;
}
+#endif
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
Index: linux-2.6.19-rc1-mm1/kernel/sysctl.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/kernel/sysctl.c 2006-10-17 07:27:44.692269445 -0500
+++ linux-2.6.19-rc1-mm1/kernel/sysctl.c 2006-10-17 07:38:47.977425889 -0500
@@ -900,6 +900,7 @@ static ctl_table vm_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#if MULTI_ZONE
{
.ctl_name = VM_LOWMEM_RESERVE_RATIO,
.procname = "lowmem_reserve_ratio",
@@ -909,6 +910,7 @@ static ctl_table vm_table[] = {
.proc_handler = &lowmem_reserve_ratio_sysctl_handler,
.strategy = &sysctl_intvec,
},
+#endif
{
.ctl_name = VM_DROP_PAGECACHE,
.procname = "drop_caches",
Index: linux-2.6.19-rc1-mm1/include/linux/mmzone.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/mmzone.h 2006-10-17 07:27:42.478206116 -0500
+++ linux-2.6.19-rc1-mm1/include/linux/mmzone.h 2006-10-17 07:34:40.134279959 -0500
@@ -171,6 +171,9 @@ enum zone_type {
#endif
#undef __ZONE_COUNT
+#define MULTI_ZONE (ZONES_SHIFT > 0)
+#define SINGLE_ZONE (ZONES_SHIFT == 0)
+
struct zone {
/* Fields commonly accessed by the page allocator */
unsigned long free_pages;
@@ -183,7 +186,7 @@ struct zone {
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
*/
- unsigned long lowmem_reserve[MAX_NR_ZONES];
+ unsigned long lowmem_reserve[MAX_NR_ZONES - SINGLE_ZONE];
#ifdef CONFIG_NUMA
int node;
@@ -420,11 +423,11 @@ unsigned long __init node_memmap_size_by
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
-#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
+#define zone_idx(zone) (MULTI_ZONE * ((zone) - (zone)->zone_pgdat->node_zones))
static inline int populated_zone(struct zone *zone)
{
- return (!!zone->present_pages);
+ return SINGLE_ZONE || (!!zone->present_pages);
}
static inline int is_highmem_idx(enum zone_type idx)
@@ -438,7 +441,7 @@ static inline int is_highmem_idx(enum zo
static inline int is_normal_idx(enum zone_type idx)
{
- return (idx == ZONE_NORMAL);
+ return SINGLE_ZONE || (idx == ZONE_NORMAL);
}
/**
@@ -458,7 +461,8 @@ static inline int is_highmem(struct zone
static inline int is_normal(struct zone *zone)
{
- return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
+ return SINGLE_ZONE ||
+ zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
}
static inline int is_dma32(struct zone *zone)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-10-17 17:54 ` Christoph Lameter
@ 2006-10-18 11:15 ` Nick Piggin
2006-10-18 19:38 ` Andrew Morton
0 siblings, 1 reply; 83+ messages in thread
From: Nick Piggin @ 2006-10-18 11:15 UTC (permalink / raw)
To: Christoph Lameter; +Cc: KAMEZAWA Hiroyuki, akpm, linux-mm
Christoph Lameter wrote:
> On Tue, 17 Oct 2006, Nick Piggin wrote:
>
>
>>I would give an ack to Kame's approach for lowmem_reserve ;)
>
>
> @@ -922,8 +924,7 @@ int zone_watermark_ok(struct zone *z, in
> min -= min / 2;
> if (alloc_flags & ALLOC_HARDER)
> min -= min / 4;
> -
> - if (free_pages <= min + z->lowmem_reserve[classzone_idx])
> + if (free_pages <= min + MULTI_ZONE * z->lowmem_reserve[classzone_idx])
I think I still prefer having this in a header. It is the normal way of doing it.
> return 0;
> for (o = 0; o < order; o++) {
> /* At the next order, this order's pages become unavailable */
> @@ -1429,8 +1430,6 @@ void show_free_areas(void)
> global_page_state(NR_PAGETABLE));
>
> for_each_zone(zone) {
> - int i;
> -
> if (!populated_zone(zone))
> continue;
>
> @@ -1457,10 +1456,14 @@ void show_free_areas(void)
> zone->pages_scanned,
> (zone->all_unreclaimable ? "yes" : "no")
> );
> - printk("lowmem_reserve[]:");
> - for (i = 0; i < MAX_NR_ZONES; i++)
> - printk(" %lu", zone->lowmem_reserve[i]);
> - printk("\n");
> + if (MULTI_ZONE) {
> + int i;
> +
> + printk("lowmem_reserve[]:");
> + for (i = 0; i < MAX_NR_ZONES; i++)
> + printk(" %lu", zone->lowmem_reserve[i]);
> + printk("\n");
> + }
> }
>
> for_each_zone(zone) {
> @@ -2829,25 +2832,36 @@ void __init page_alloc_init(void)
> * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
> * or min_free_kbytes changes.
> */
> +static unsigned long calculate_max_lowmem_reserve(struct zone *zone,
> + enum zone_type start)
> +{
> + unsigned long max;
> + enum zone_type i;
> +
> + if (SINGLE_ZONE)
> + return 0;
> + /* Find valid and maximum lowmem_reserve in the zone */
> + for (i = start; i < MAX_NR_ZONES; i++) {
> + if (zone->lowmem_reserve[i] > max)
> + max = zone->lowmem_reserve[i];
> + }
> + return max;
> +}
> +
> static void calculate_totalreserve_pages(void)
> {
> struct pglist_data *pgdat;
> unsigned long reserve_pages = 0;
> - enum zone_type i, j;
> + enum zone_type i;
>
> for_each_online_pgdat(pgdat) {
> for (i = 0; i < MAX_NR_ZONES; i++) {
> struct zone *zone = pgdat->node_zones + i;
> - unsigned long max = 0;
> -
> - /* Find valid and maximum lowmem_reserve in the zone */
> - for (j = i; j < MAX_NR_ZONES; j++) {
> - if (zone->lowmem_reserve[j] > max)
> - max = zone->lowmem_reserve[j];
> - }
> + unsigned long max;
>
> /* we treat pages_high as reserved pages. */
> - max += zone->pages_high;
> + max = calculate_max_lowmem_reserve(zone, i) + \
> + zone->pages_high;
>
> if (max > zone->present_pages)
> max = zone->present_pages;
> @@ -2868,6 +2882,9 @@ static void setup_per_zone_lowmem_reserv
> struct pglist_data *pgdat;
> enum zone_type j, idx;
>
> + if (SINGLE_ZONE)
> + return 0;
> +
> for_each_online_pgdat(pgdat) {
> for (j = 0; j < MAX_NR_ZONES; j++) {
> struct zone *zone = pgdat->node_zones + j;
> @@ -3044,6 +3061,7 @@ int sysctl_min_slab_ratio_sysctl_handler
> }
> #endif
>
> +#if MULTI_ZONE
> /*
> * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
> * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
> @@ -3060,6 +3078,7 @@ int lowmem_reserve_ratio_sysctl_handler(
> setup_per_zone_lowmem_reserve();
> return 0;
> }
> +#endif
>
> /*
> * percpu_pagelist_fraction - changes the pcp->high for each zone on each
> Index: linux-2.6.19-rc1-mm1/kernel/sysctl.c
> ===================================================================
> --- linux-2.6.19-rc1-mm1.orig/kernel/sysctl.c 2006-10-17 07:27:44.692269445 -0500
> +++ linux-2.6.19-rc1-mm1/kernel/sysctl.c 2006-10-17 07:38:47.977425889 -0500
> @@ -900,6 +900,7 @@ static ctl_table vm_table[] = {
> .proc_handler = &proc_dointvec,
> },
> #endif
> +#if MULTI_ZONE
> {
> .ctl_name = VM_LOWMEM_RESERVE_RATIO,
> .procname = "lowmem_reserve_ratio",
> @@ -909,6 +910,7 @@ static ctl_table vm_table[] = {
> .proc_handler = &lowmem_reserve_ratio_sysctl_handler,
> .strategy = &sysctl_intvec,
> },
> +#endif
> {
> .ctl_name = VM_DROP_PAGECACHE,
> .procname = "drop_caches",
> Index: linux-2.6.19-rc1-mm1/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6.19-rc1-mm1.orig/include/linux/mmzone.h 2006-10-17 07:27:42.478206116 -0500
> +++ linux-2.6.19-rc1-mm1/include/linux/mmzone.h 2006-10-17 07:34:40.134279959 -0500
> @@ -171,6 +171,9 @@ enum zone_type {
> #endif
> #undef __ZONE_COUNT
>
> +#define MULTI_ZONE (ZONES_SHIFT > 0)
> +#define SINGLE_ZONE (ZONES_SHIFT == 0)
> +
> struct zone {
> /* Fields commonly accessed by the page allocator */
> unsigned long free_pages;
> @@ -183,7 +186,7 @@ struct zone {
> * on the higher zones). This array is recalculated at runtime if the
> * sysctl_lowmem_reserve_ratio sysctl changes.
> */
> - unsigned long lowmem_reserve[MAX_NR_ZONES];
> + unsigned long lowmem_reserve[MAX_NR_ZONES - SINGLE_ZONE];
IMO ifdef is much better here. It is more readable and gives better
typechecking. I think ifdef in structures is not much of a problem.
>
> #ifdef CONFIG_NUMA
> int node;
> @@ -420,11 +423,11 @@ unsigned long __init node_memmap_size_by
> /*
> * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
> */
> -#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
> +#define zone_idx(zone) (MULTI_ZONE * ((zone) - (zone)->zone_pgdat->node_zones))
>
> static inline int populated_zone(struct zone *zone)
> {
> - return (!!zone->present_pages);
> + return SINGLE_ZONE || (!!zone->present_pages);
> }
>
> static inline int is_highmem_idx(enum zone_type idx)
> @@ -438,7 +441,7 @@ static inline int is_highmem_idx(enum zo
>
> static inline int is_normal_idx(enum zone_type idx)
> {
> - return (idx == ZONE_NORMAL);
> + return SINGLE_ZONE || (idx == ZONE_NORMAL);
> }
>
> /**
> @@ -458,7 +461,8 @@ static inline int is_highmem(struct zone
>
> static inline int is_normal(struct zone *zone)
> {
> - return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
> + return SINGLE_ZONE ||
> + zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
> }
I don't know if these are any better than ifdef elseif endif. I think
the goal is not ifdef removal at any cost, but avoiding ifdefs in
complex functions and within control flow because it makes the code
less readable.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-10-18 11:15 ` Nick Piggin
@ 2006-10-18 19:38 ` Andrew Morton
2006-10-23 23:08 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-10-18 19:38 UTC (permalink / raw)
To: Nick Piggin; +Cc: Christoph Lameter, KAMEZAWA Hiroyuki, linux-mm
On Wed, 18 Oct 2006 21:15:35 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> > @@ -458,7 +461,8 @@ static inline int is_highmem(struct zone
> >
> > static inline int is_normal(struct zone *zone)
> > {
> > - return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
> > + return SINGLE_ZONE ||
> > + zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
> > }
>
> I don't know if these are any better than ifdef elseif endif. I think
> the goal is not ifdef removal at any cost, but avoiding ifdefs in
> complex functions and within control flow because it makes the code
> less readable.
Certainly readability is a concern.
But the other problem with ifdefs is
#ifdef SOMETHING_WHICH_IS_USUALLY_DEFINED
stuff_which_works();
#else
stuff_which_doesnt_compile_or_which_generates_warnings();
#endif
And we do that quite a lot.
Whereas
if (SOMETHING_WHICH_IS_ZERO_OR_ONE)
stuff_which_works();
else
stuff_which_doesnt_compile_or_which_generates_warnings();
not only loooks heaps better, but the compiler checks it all for us too.
But you knew all that.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-10-18 19:38 ` Andrew Morton
@ 2006-10-23 23:08 ` Christoph Lameter
2006-10-24 1:07 ` Christoph Lameter
2006-10-26 22:09 ` Andrew Morton
0 siblings, 2 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-10-23 23:08 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
Single Zone Optimizations V2
V1->V2 Use a config variable set up in mm/Kconfig
If we only have a single zone then various macros can be optimized.
We do not need to protect higher zones, we know that zones are
always present, can remove useless data from /proc etc etc. Various
code paths become unnecessary with a single zone setup.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.19-rc2-mm2/mm/vmstat.c
===================================================================
--- linux-2.6.19-rc2-mm2.orig/mm/vmstat.c 2006-10-23 17:51:51.816819354 -0500
+++ linux-2.6.19-rc2-mm2/mm/vmstat.c 2006-10-23 17:52:35.777558863 -0500
@@ -554,15 +554,16 @@ static int zoneinfo_show(struct seq_file
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
zone_page_state(zone, i));
-
- seq_printf(m,
- "\n protection: (%lu",
- zone->lowmem_reserve[0]);
- for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
- seq_printf(m,
- ")"
- "\n pagesets");
+ if (CONFIG_MULTI_ZONE) {
+ seq_printf(m,
+ "\n protection: (%lu",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+ seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");
+ }
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
int j;
Index: linux-2.6.19-rc2-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.19-rc2-mm2.orig/mm/page_alloc.c 2006-10-23 17:51:51.824632513 -0500
+++ linux-2.6.19-rc2-mm2/mm/page_alloc.c 2006-10-23 17:52:35.819554594 -0500
@@ -60,6 +60,7 @@ int percpu_pagelist_fraction;
static void __free_pages_ok(struct page *page, unsigned int order);
+#if CONFIG_MULTI_ZONE
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -82,6 +83,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z
32
#endif
};
+#endif
EXPORT_SYMBOL(totalram_pages);
@@ -923,8 +925,8 @@ int zone_watermark_ok(struct zone *z, in
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
-
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+ if (free_pages <= min + CONFIG_MULTI_ZONE *
+ z->lowmem_reserve[classzone_idx])
return 0;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1581,8 +1583,6 @@ void show_free_areas(void)
global_page_state(NR_PAGETABLE));
for_each_zone(zone) {
- int i;
-
if (!populated_zone(zone))
continue;
@@ -1609,10 +1609,14 @@ void show_free_areas(void)
zone->pages_scanned,
(zone->all_unreclaimable ? "yes" : "no")
);
- printk("lowmem_reserve[]:");
- for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %lu", zone->lowmem_reserve[i]);
- printk("\n");
+ if (CONFIG_MULTI_ZONE) {
+ int i;
+
+ printk("lowmem_reserve[]:");
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ printk(" %lu", zone->lowmem_reserve[i]);
+ printk("\n");
+ }
}
for_each_zone(zone) {
@@ -3011,25 +3015,37 @@ void __init page_alloc_init(void)
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
* or min_free_kbytes changes.
*/
+static unsigned long calculate_max_lowmem_reserve(struct zone *zone,
+ enum zone_type start)
+{
+ unsigned long max;
+ enum zone_type i;
+
+ if (!CONFIG_MULTI_ZONE)
+ return 0;
+
+ /* Find valid and maximum lowmem_reserve in the zone */
+ for (i = start; i < MAX_NR_ZONES; i++) {
+ if (zone->lowmem_reserve[i] > max)
+ max = zone->lowmem_reserve[i];
+ }
+ return max;
+}
+
static void calculate_totalreserve_pages(void)
{
struct pglist_data *pgdat;
unsigned long reserve_pages = 0;
- enum zone_type i, j;
+ enum zone_type i;
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- unsigned long max = 0;
-
- /* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ unsigned long max;
/* we treat pages_high as reserved pages. */
- max += zone->pages_high;
+ max = calculate_max_lowmem_reserve(zone, i) + \
+ zone->pages_high;
if (max > zone->present_pages)
max = zone->present_pages;
@@ -3050,6 +3066,9 @@ static void setup_per_zone_lowmem_reserv
struct pglist_data *pgdat;
enum zone_type j, idx;
+ if (!CONFIG_MULTI_ZONE)
+ return 0;
+
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
@@ -3226,6 +3245,7 @@ int sysctl_min_slab_ratio_sysctl_handler
}
#endif
+#if CONFIG_MULTI_ZONE
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
@@ -3242,6 +3262,7 @@ int lowmem_reserve_ratio_sysctl_handler(
setup_per_zone_lowmem_reserve();
return 0;
}
+#endif
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
Index: linux-2.6.19-rc2-mm2/kernel/sysctl.c
===================================================================
--- linux-2.6.19-rc2-mm2.orig/kernel/sysctl.c 2006-10-23 17:51:51.852955214 -0500
+++ linux-2.6.19-rc2-mm2/kernel/sysctl.c 2006-10-23 17:52:35.863503614 -0500
@@ -904,6 +904,7 @@ static ctl_table vm_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#if CONFIG_MULTI_ZONE
{
.ctl_name = VM_LOWMEM_RESERVE_RATIO,
.procname = "lowmem_reserve_ratio",
@@ -913,6 +914,7 @@ static ctl_table vm_table[] = {
.proc_handler = &lowmem_reserve_ratio_sysctl_handler,
.strategy = &sysctl_intvec,
},
+#endif
{
.ctl_name = VM_DROP_PAGECACHE,
.procname = "drop_caches",
Index: linux-2.6.19-rc2-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.19-rc2-mm2.orig/include/linux/mmzone.h 2006-10-23 17:51:51.879324626 -0500
+++ linux-2.6.19-rc2-mm2/include/linux/mmzone.h 2006-10-23 17:52:35.882059867 -0500
@@ -491,11 +491,12 @@ unsigned long __init node_memmap_size_by
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
-#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
+#define zone_idx(zone) (CONFIG_MULTI_ZONE * \
+ ((zone) - (zone)->zone_pgdat->node_zones))
static inline int populated_zone(struct zone *zone)
{
- return (!!zone->present_pages);
+ return !CONFIG_MULTI_ZONE || (!!zone->present_pages);
}
static inline int is_highmem_idx(enum zone_type idx)
@@ -509,7 +510,7 @@ static inline int is_highmem_idx(enum zo
static inline int is_normal_idx(enum zone_type idx)
{
- return (idx == ZONE_NORMAL);
+ return !CONFIG_MULTI_ZONE || (idx == ZONE_NORMAL);
}
/**
@@ -529,7 +530,8 @@ static inline int is_highmem(struct zone
static inline int is_normal(struct zone *zone)
{
- return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
+ return !CONFIG_MULTI_ZONE ||
+ zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
}
static inline int is_dma32(struct zone *zone)
Index: linux-2.6.19-rc2-mm2/mm/Kconfig
===================================================================
--- linux-2.6.19-rc2-mm2.orig/mm/Kconfig 2006-10-23 17:52:25.537437185 -0500
+++ linux-2.6.19-rc2-mm2/mm/Kconfig 2006-10-23 17:52:35.890849671 -0500
@@ -248,3 +248,7 @@ config ZONE_DMA_FLAG
default "0" if !ZONE_DMA
default "1"
+config MULTI_ZONE
+ int
+ default "1"
+ default "0" if !ZONE_DMA && !ZONE_DMA32 && !HIGHMEM
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-10-23 23:08 ` Christoph Lameter
@ 2006-10-24 1:07 ` Christoph Lameter
2006-10-26 22:09 ` Andrew Morton
1 sibling, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-10-24 1:07 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
Fix return of a value in function returning void.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.19-rc2-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.19-rc2-mm2.orig/mm/page_alloc.c 2006-10-23 17:57:53.000000000 -0500
+++ linux-2.6.19-rc2-mm2/mm/page_alloc.c 2006-10-23 20:05:44.146460919 -0500
@@ -3071,7 +3071,7 @@ static void setup_per_zone_lowmem_reserv
enum zone_type j, idx;
if (!CONFIG_MULTI_ZONE)
- return 0;
+ return;
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-10-23 23:08 ` Christoph Lameter
2006-10-24 1:07 ` Christoph Lameter
@ 2006-10-26 22:09 ` Andrew Morton
2006-10-26 22:28 ` Christoph Lameter
` (2 more replies)
1 sibling, 3 replies; 83+ messages in thread
From: Andrew Morton @ 2006-10-26 22:09 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Mon, 23 Oct 2006 16:08:20 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> Single Zone Optimizations V2
>
> V1->V2 Use a config variable setup in mm/Kconfig
>
> If we only have a single zone then various macros can be optimized.
> We do not need to protect higher zones, we know that zones are
> always present, can remove useless data from /proc etc etc. Various
> code paths become unnecessary with a single zone setup.
I don't know about all of this. It's making core mm increasingly revolting
and increases dissimilarities between different kernel builds and generally
makes it harder for us to remotely diagnose and solve people's bug reports.
Harder to understand architecture A's behaviour based upon one's knowledge
of architecture B, etc.
I really really want to drop all those patches[1] and rethink it all.
Like... would it make sense to eliminate the hard-coded concepts of DMA,
DMA32, NORMAL and HIGHMEM and simply say "we support 1 to N zones" per
node? Obviously we'd need to keep the DMA/NORMAL/HIGHMEM nomenclature in
the interfaces so the rest of the kernel builds and works, but the core mm
just shouldn't need to care: all it cares about is one or more zones.
Or something like that. Something which makes the mm easier to understand,
easier to maintain and faster. Rather than harder to understand, harder to
maintain and faster.
[1] These:
get-rid-of-zone_table.patch
deal-with-cases-of-zone_dma-meaning-the-first-zone.patch
get-rid-of-zone_table-fix-3.patch
introduce-config_zone_dma.patch
optional-zone_dma-in-the-vm.patch
optional-zone_dma-in-the-vm-no-gfp_dma-check-in-the-slab-if-no-config_zone_dma-is-set.patch
optional-zone_dma-for-ia64.patch
remove-zone_dma-remains-from-parisc.patch
remove-zone_dma-remains-from-sh-sh64.patch
set-config_zone_dma-for-arches-with-generic_isa_dma.patch
zoneid-fix-up-calculations-for-zoneid_pgshift.patch
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-10-26 22:09 ` Andrew Morton
@ 2006-10-26 22:28 ` Christoph Lameter
2006-10-28 1:00 ` Christoph Lameter
2006-11-01 17:39 ` Mel Gorman
2 siblings, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-10-26 22:28 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Thu, 26 Oct 2006, Andrew Morton wrote:
> get-rid-of-zone_table.patch
That patch has nothing to do with this. It only conflicted with the other
patches below.
Let me think a bit about the other ideas that you brought up.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-26 22:09 ` Andrew Morton
2006-10-26 22:28 ` Christoph Lameter
@ 2006-10-28 1:00 ` Christoph Lameter
2006-10-28 2:04 ` Andrew Morton
2006-11-01 17:39 ` Mel Gorman
2 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-10-28 1:00 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Thu, 26 Oct 2006, Andrew Morton wrote:
> I really really want to drop all those patches[1] and rethink it all.
I think it would be good to do drop it. That may allow a consolidation of
the patches after the experience we have had so far with it but there is
also the danger that I may have to drop it completely for now since the
project is getting to be too much of an effort. Having to repeat the same
arguments is not that productive.
Note that the zone_table work is independent from the ZONE_DMA work.
So please keep the get-rid-of-zone_table.patch. Just drop the rest.
get-rid-of-zone_table-fix-3.patch is not really a fix for the zone_table
patch but addresses an issue created by the optional zone_dma patch.
> Like... would it make sense to eliminate the hard-coded concepts of DMA,
> DMA32, NORMAL and HIGHMEM and simply say "we support 1 to N zones" per
> node? Obviously we'd need to keep the DMA/NORMAL/HIGHMEM nomenclature in
> the interfaces so the rest of the kernel builds and works, but the core mm
> just shouldn't need to care: all it cares about is one or more zones.
Ok. Recap of some of the earlier discussion:
- DMA has no clearly defined boundaries. They vary according to arch
and many arches / platforms depend on particular ZONE_DMA semantics.
- DMA32 is only supported for x86_64 and has a particular role
to play there.
- Highmem requires special treatment with kmap that is different from
all others.
In order to have N zones (I think you would want that to cover
different restricted DMA areas?) one would need to have some sort of
standard definition and purposes for those N zones. They would need to be
able to be treated in the same way. For the ZONE_DMAxx zones you may be
able to get there. HIGHMEM is definitely much different.
Then you would probably want to support the dma_mask supported by the SCSI
subsystem and dma_alloc_coherent functions to allow arbitrary bitmasks. In
order to support that with zones we would need a large quantity of those
or a way to dynamically create zones. I am pretty sure this will not
simplify mm. There is a potential here for increasing fragmentation and
getting into complicated load balancing situations between the zones.
A number of architectures and platforms (I think we are up to 8
to 10 or so?) do not need what ZONE_DMA provides and can avoid having to
deal with this mess right now if we allow an opt out as provided by my
current patches in mm. No additional measures would be needed.
For those platforms that still need the ability to allocate from a
subset of memory it would be possible to provide a page allocator
function where one can specify an allowed memory range. That would
avoid the need for various DMA style zones.
But I cannot find any justification in my contexts to complete work on
this functionality because plainly all the hardware that I use does not
have problem laden DMA controllers and works just fine with a single
zone. This includes x86_64, i386 and ia64 boxes that I test my patches on.
I would have to find time to research this and test with such a device.
So far I have not found a way to justify taking time for that beyond the
initial RFC that I posted a while back.
> Or something like that. Something which makes the mm easier to understand,
> easier to maintain and faster. Rather than harder to understand, harder to
> maintain and faster.
The simplest approach is to allow configurations with a single zone. That
makes mm easier to understand, faster and maintainable. For that purpose
functionality provided by specialized zones like ZONE_HIGHMEM, ZONE_DMA32
and ZONE_DMA needs to be isolated and made configurable. I have done that
for HIGHMEM and DMA32 and the code is in 2.6.19.
The point of the patches is to do the same thing for ZONE_DMA.
There are many other subsystems that add special DMA overhead like in the
slab allocators etc. On platforms that do not need ZONE_DMA we
currently just see empty counters, create dead slabs, have dead code etc.
This seems where I ran into trouble since it seems that you think it gets
too complicated to have the ability to compile a kernel without the
useless and problematic GFP_DMA / ZONE_DMA material.
I think just the opposite is happening. The patches separate out ZONE_DMA
functionality that is badly defined, not standardized, rarely used and has
caused lots of weird code in the kernel to be written. Ever seen the code
in some arches alloc_dma_coherent where they allocate a page and then
check if its in a certain range? If not more creative artistry follows.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-28 1:00 ` Christoph Lameter
@ 2006-10-28 2:04 ` Andrew Morton
2006-10-28 2:12 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-10-28 2:04 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Fri, 27 Oct 2006 18:00:42 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> But I cannot find any justification in my contexts to complete work on
> this functionality because plainly all the hardware that I use does not
> have problem laden DMA controllers and works just fine with a single
> zone.
How about memory hot-unplug?
The only feasible way we're going to implement that is to support it on
user allocations only. IOW: for all those allocations which were performed
with __GFP_HIGHMEM.
(This is an overloading of the GFP_HIGHMEM concept, but it happens to be a
very accurate one. Perhaps we should have a separate __GFP_UNPLUGGABLE).
> This includes x86_64, i386 and ia64 boxes that I test my patches on.
> I would have to find time to research this and test with such a device.
> So far I have not found a way to justify taking time for that beyond the
> initial RFC that I posted a while back.
>
> > Or something like that. Something which makes the mm easier to understand,
> > easier to maintain and faster. Rather than harder to understand, harder to
> > maintain and faster.
>
> The simplest approach is to allow configurations with a single zone. That
> makes mm easier to understand, faster and maintainable. For that purpose
> functionality provided by specialized zones like ZONE_HIGHMEM, ZONE_DMA32
> and ZONE_DMA needs to be isolated and made configurable. I have done that
> for HIGHEMEM and DMA32 and the code is in 2.6.19.
>
> The point of the patches is to do the same thing for ZONE_DMA.
>
> There are many other subsystems that add special DMA overhead like in the
> slab allocators etc. On platforms that do not need ZONE_DMA we
> currently just see empty counters, create dead slabs, have dead code etc.
> This seems where I ran into trouble since it seems that you think it gets
> too complicated to have the ability to compile a kernel without the
> useless and problematic GFP_DMA. ZONE_DMA material.
>
> I think just the opposite is happening. The patches separate out ZONE_DMA
> functionality that is badly defined, not standardized, rarely used and has
> caused lots of weird code in the kernel to be written. Ever seen the code
> in some arches alloc_dma_coherent where they allocate a page and then
> check if its in a certain range? If not more creative artistry follows.
One way to address the dma problem is to always split all memory into
log2(physical memory) zones. So we have one zone for pages 0 and 1,
another zone for pages 2 and 3, another zone for pages 4, 5, 6, and 7,
another for pages 8, 9, 10, ... 15, etc.
So each zone represents one additional bit of physical address. So a
device driver can just ask "give me a page below physical address N".
A 4GB machine would have 32-log2(PAGE_SIZE) = 20 zones. We'd coalesce the
lowest 16MB, which takes us down to 8 zones. 13 zones on a 128GB machine.
Did I do all the arith correctly? If so, it sounds feasible.
So all the GFP_DMA/NORMAL/HIGHMEM/DMA32 stuff goes away in favour of
alloc_pages_below(int log2_address, int order) or whatever.
What effect would NUMA have on all this? Not much, I suspect.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-28 2:04 ` Andrew Morton
@ 2006-10-28 2:12 ` Christoph Lameter
2006-10-28 2:24 ` Andrew Morton
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-10-28 2:12 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Fri, 27 Oct 2006, Andrew Morton wrote:
> On Fri, 27 Oct 2006 18:00:42 -0700 (PDT)
> Christoph Lameter <clameter@sgi.com> wrote:
>
> > But I cannot find any justification in my contexts to complete work on
> > this functionality because plainly all the hardware that I use does not
> > have problem laden DMA controllers and works just fine with a single
> > zone.
>
> How about memory hot-unplug?
Cannot figure out how that relates to what I said above. Memory hot unplug
seems to have been dropped in favor of balloons.
> The only feasible way we're going to implement that is to support it on
> user allocations only. IOW: for all those allocations which were performed
> with __GFP_HIGHMEM.
The alloc_page_range() functionality was intended for device drivers and
other ZONE_DMA users. I am not sure what the point is of user space
having the ability to allocate memory in specific physical memory areas.
User space has virtual address areas that are mapped by the kernel to
physical addresses. The physical addresses for DMA is allocated through
alloc_dma_coherent.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-28 2:12 ` Christoph Lameter
@ 2006-10-28 2:24 ` Andrew Morton
2006-10-28 2:31 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-10-28 2:24 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Fri, 27 Oct 2006 19:12:16 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> On Fri, 27 Oct 2006, Andrew Morton wrote:
>
> > On Fri, 27 Oct 2006 18:00:42 -0700 (PDT)
> > Christoph Lameter <clameter@sgi.com> wrote:
> >
> > > But I cannot find any justification in my contexts to complete work on
> > > this functionality because plainly all the hardware that I use does not
> > > have problem laden DMA controllers and works just fine with a single
> > > zone.
> >
> > How about memory hot-unplug?
>
> Cannot figure out how that relates to what I said above.
We need some way of preventing unreclaimable kernel memory allocations from
using certain physical pages. That means zones.
> Memory hot unplug
> seems to have been dropped in favor of balloons.
Has it? I don't recall seeing a vague proposal, let alone an implementation?
> > The only feasible way we're going to implement that is to support it on
> > user allocations only. IOW: for all those allocations which were performed
> > with __GFP_HIGHMEM.
>
> The alloc_page_range() functionality was intended for device drivers and
> other ZONE_DMA users. I am not sure what the point is of user space
> having the ability to allocate memory in specific physical memory areas.
Userspace allocations are reclaimable: pagecache, anonymous memory. These
happen to be allocated with __GFP_HIGHMEM set.
So right now __GFP_HIGHMEM is an excellent hint telling the page allocator
that it is safe to satisfy this request from removeable memory.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-28 2:24 ` Andrew Morton
@ 2006-10-28 2:31 ` Christoph Lameter
2006-10-28 4:43 ` Andrew Morton
2006-11-01 18:13 ` Mel Gorman
0 siblings, 2 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-10-28 2:31 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Fri, 27 Oct 2006, Andrew Morton wrote:
> We need some way of preventing unreclaimable kernel memory allocations from
> using certain physical pages. That means zones.
Well then we may need zones for defragmentation and zeroed pages as well
etc etc. The problem is that such things make the VM much more
complex and not simpler and faster.
> > Memory hot unplug
> > seems to have been dropped in favor of balloons.
>
> Has it? I don't recall seeing a vague proposal, let alone an implementation?
That is the impression that I got at the OLS. There were lots of talks
about balloon approaches.
> Userspace allocations are reclaimable: pagecache, anonymous memory. These
> happen to be allocated with __GFP_HIGHMEM set.
On certain platforms yes.
> So right now __GFP_HIGHMEM is an excellent hint telling the page allocator
> that it is safe to satisfy this request from removeable memory.
OK this works on i386 but most other platforms wont have a highmem
zone.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-28 2:31 ` Christoph Lameter
@ 2006-10-28 4:43 ` Andrew Morton
2006-10-28 7:47 ` KAMEZAWA Hiroyuki
` (2 more replies)
2006-11-01 18:13 ` Mel Gorman
1 sibling, 3 replies; 83+ messages in thread
From: Andrew Morton @ 2006-10-28 4:43 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Fri, 27 Oct 2006 19:31:20 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> On Fri, 27 Oct 2006, Andrew Morton wrote:
>
> > We need some way of preventing unreclaimable kernel memory allocations from
> > using certain physical pages. That means zones.
>
> Well then we may need zones for defragmentation and zeroed pages as well
> etc etc. The problem is that such things make the VM much more
> complex and not simpler and faster.
Right. We need zones for lots and lots of things. This all comes back to
my main point: the hardwired and magical DMA, DMA32, NORMAL and HIGHMEM
zones don't cut it. We'd be well-served by implementing the core MM as
just "one or more zones". The placement, sizing and *meaning* behind those
zones is externally defined.
> > > Memory hot unplug
> > > seems to have been dropped in favor of baloons.
> >
> > Has it? I don't recall seeing a vague proposal, let alone an implementation?
>
> That is the impression that I got at the OLS. There were lots of talks
> about balloon approaches.
That's all virtual machine stuff, where the "kernel"'s memory is virtual,
not physical.
> > Userspace allocations are reclaimable: pagecache, anonymous memory. These
> > happen to be allocated with __GFP_HIGHMEM set.
>
> On certain platforms yes.
On _all_ platforms. See GFP_HIGHUSER.
The only exception here is highpte.
> > So right now __GFP_HIGHMEM is an excellent hint telling the page allocator
> > that it is safe to satisfy this request from removeable memory.
>
> OK this works on i386 but most other platforms wont have a highmem
> zone.
Under this proposal platforms which wish to implement physical hot-unplug
would need to effectively implement highmem. They won't need to kmap the
pages to access their contents, but they will need to ensure that
unreclaimable allocations be constrained to the non-removable physical
memory.
It's all pretty simple. But it'd be hacky to implement it in terms of
"highmem". It would be better if we could just tell the core MM "here's a
4G zone" and "here's a 60G zone". The 60G zone is only used for
GFP_HIGHUSER allocations and is hence unpluggable.
I don't think there's any other (practical) way of implementing hot-unplug.
But hot-unplug is just an example. My main point here is that it is
desirable that we get away from the up-to-four magical hard-wired zones in
core MM.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-10-28 4:43 ` Andrew Morton
@ 2006-10-28 7:47 ` KAMEZAWA Hiroyuki
2006-10-28 16:12 ` Andi Kleen
2006-10-29 0:48 ` Christoph Lameter
2 siblings, 0 replies; 83+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-10-28 7:47 UTC (permalink / raw)
To: Andrew Morton; +Cc: clameter, nickpiggin, linux-mm
On Fri, 27 Oct 2006 21:43:24 -0700
Andrew Morton <akpm@osdl.org> wrote:
> On Fri, 27 Oct 2006 19:31:20 -0700 (PDT)
> Christoph Lameter <clameter@sgi.com> wrote:
> > > So right now __GFP_HIGHMEM is an excellent hint telling the page allocator
> > > that it is safe to satisfy this request from removeable memory.
> >
> > OK this works on i386 but most other platforms wont have a highmem
> > zone.
>
> Under this proposal platforms which wish to implement physical hot-unplug
> would need to effectively implement highmem. They won't keep to kmap the
> pages to access their contents, but they will need to ensure that
> unreclaimable allocations be constrained to the non-removable physical
> memory.
>
> It's all pretty simple. But it'd be hacky to implement it in terms of
> "highmem". It would be better if we could just tell the core MM "here's a
> 4G zone" and "here's a 60G zone". The 60G zone is only used for
> GFP_HIGHUSER allocations and is hence unpluggable.
>
> I don't think there's any other (practical) way of implementing hot-unplug.
>
Thank you for mentioning to memory-unplug. I was offlined.
We (memory unplug colleagues) tried dividing pgdat/zone/free_list for reclaimable
memory. but all of them were rejected ;). IMHO, using zone was the simplest one.
But hard-coded ZONE_EASYRECLAIM was not good looking..
I and Goto-san are still trying to improve sparsemem and *memory-hot-add*.
So, memory-unplug stops but is not dead project.
> But hot-unplug is just an example. My main point here is that it is
> desirable that we get away from the up-to-four magical hard-wired zones in
> core MM.
>
Hmm..zones should be dynamically defined at boot and configure how-to-zoning ?
or just configurable at make ?
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-28 4:43 ` Andrew Morton
2006-10-28 7:47 ` KAMEZAWA Hiroyuki
@ 2006-10-28 16:12 ` Andi Kleen
2006-10-29 0:48 ` Christoph Lameter
2 siblings, 0 replies; 83+ messages in thread
From: Andi Kleen @ 2006-10-28 16:12 UTC (permalink / raw)
To: Andrew Morton; +Cc: Christoph Lameter, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
> It's all pretty simple. But it'd be hacky to implement it in terms of
> "highmem". It would be better if we could just tell the core MM "here's a
> 4G zone" and "here's a 60G zone". The 60G zone is only used for
> GFP_HIGHUSER allocations and is hence unpluggable.
>
> I don't think there's any other (practical) way of implementing hot-unplug.
If it's implemented this way it would be important that the boundaries
between nodes are not fixed, but tunable. Otherwise kernel memory
intensive loads might be suddenly impossible.
>
> But hot-unplug is just an example. My main point here is that it is
> desirable that we get away from the up-to-four magical hard-wired zones in
> core MM.
I mostly agree. At least GFP_DMA needs to go and replaced
with some API that gives memory masks and lets an underlying
allocator figure it out. GFP_DMA32 might still have a better case
though because those are pretty common, but ultimatively
a mask based interface is here much better too.
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-28 4:43 ` Andrew Morton
2006-10-28 7:47 ` KAMEZAWA Hiroyuki
2006-10-28 16:12 ` Andi Kleen
@ 2006-10-29 0:48 ` Christoph Lameter
2006-10-29 1:04 ` Andrew Morton
2 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-10-29 0:48 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Fri, 27 Oct 2006, Andrew Morton wrote:
> Right. We need zones for lots and lots of things. This all comes back to
> my main point: the hardwired and magical DMA, DMA32, NORMAL and HIGHMEM
> zones don't cut it. We'd be well-served by implementing the core MM as
> just "one or more zones". The placement, sizing and *meaning* behind those
> zones is externally defined.
We (and I personally with the prezeroing patches) have been down
this road several times and did not like what we saw.
> That's all virtual machine stuff, where the "kernel"'s memory is virtual,
> not physical.
That is the case on most platforms x86_64, ia64. Kernel memory is movable
and the Virtual Iron guys have demonstrated how to do that without
additional zones.
>
> > > Userspace allocations are reclaimable: pagecache, anonymous memory. These
> > > happen to be allocated with __GFP_HIGHMEM set.
> >
> > On certain platforms yes.
>
> On _all_ platforms. See GFP_HIGHUSER.
User space allocations are movable already via page migration.
> > > So right now __GFP_HIGHMEM is an excellent hint telling the page allocator
> > > that it is safe to satisfy this request from removeable memory.
For that we would have to have a distinction of removable memory which
won't be necessary if we use the existing mappings to move the physical
location while keeping the virtual addresses.
> I don't think there's any other (practical) way of implementing hot-unplug.
Of course there is. As soon as you have virtual mappings it's fairly easy
to do.
1. Migrate all what you can off the memory section that you want to
free.
2. Use the page table to dynamically remap the leftover pages.
> But hot-unplug is just an example. My main point here is that it is
> desirable that we get away from the up-to-four magical hard-wired zones in
> core MM.
We have been facing that decision repeatedly and it was pretty clear that
there would be significant disadvantages.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-29 0:48 ` Christoph Lameter
@ 2006-10-29 1:04 ` Andrew Morton
2006-10-29 1:29 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-10-29 1:04 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Sat, 28 Oct 2006 17:48:40 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> On Fri, 27 Oct 2006, Andrew Morton wrote:
>
> > Right. We need zones for lots and lots of things. This all comes back to
> > my main point: the hardwired and magical DMA, DMA32, NORMAL and HIGHMEM
> > zones don't cut it. We'd be well-served by implementing the core MM as
> > just "one or more zones". The placement, sizing and *meaning* behind those
> > zones is externally defined.
>
> We (and I personally with the prezeroing patches) have been down
> this road several times and did not like what we saw.
Details?
> > That's all virtual machine stuff, where the "kernel"'s memory is virtual,
> > not physical.
>
> That is the case on most platforms x86_64, ia64. Kernel memory is movable
It is?
> and the Virtual Iron guys have demonstrated how to do that without
> additional zones.
How?
> >
> > > > Userspace allocations are reclaimable: pagecache, anonymous memory. These
> > > > happen to be allocated with __GFP_HIGHMEM set.
> > >
> > > On certain platforms yes.
> >
> > On _all_ platforms. See GFP_HIGHUSER.
>
> User space allocations are movable already via page migration.
Of course.
> > > > So right now __GFP_HIGHMEM is an excellent hint telling the page allocator
> > > > that it is safe to satisfy this request from removeable memory.
>
> For that we would have to have a distinction of removable memory which
> wont be necessary if we use the existing mappings to move the physical
> location while keeping the virtual addresses.
You're proposing that all kernel memory be virtually mapped?
I've never seen such a proposal nor any implementation.
Or maybe you're referring to something else. Please let's stop playing
question-and-answer. Please provide sufficient information so that people
can understand what you're saying.
> > I don't think there's any other (practical) way of implementing hot-unplug.
>
> Of course there is. As soon as you have virtual mappings its fairly easy
> to do.
>
> 1. Migrate all what you can off the memory section that you want to
> free.
>
> 2. Use the page table to dynamically remap the leftover pages.
I've never ever seen anyone propose that all kernel memory be virtually
mapped. I don't know what you're talking about. Please provide all
details.
> > But hot-unplug is just an example. My main point here is that it is
> > desirable that we get away from the up-to-four magical hard-wired zones in
> > core MM.
>
> We have been facing that decision repeatedly and it was pretty clear that
> there would be significant disadvantages.
Again. On the whole, that was a pretty useless email. Please give us
something we can use.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-29 1:04 ` Andrew Morton
@ 2006-10-29 1:29 ` Christoph Lameter
2006-10-29 11:32 ` Nick Piggin
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-10-29 1:29 UTC (permalink / raw)
To: Andrew Morton; +Cc: Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Sat, 28 Oct 2006, Andrew Morton wrote:
> > We (and I personally with the prezeroing patches) have been down
> > this road several times and did not like what we saw.
>
> Details?
The most important issues that come to my mind right now (this has
been discussed frequently in various contexts so I may be missing
some things) are:
1. Duplicate the caches (pageset structures). This reduces cache hit
rates. Duplicates lots of information in the page allocator.
2. Necessity of additional load balancing across multiple zones.
3. The NUMA layer can only support memory policies for a single zone.
4. You may have to duplicate the slab allocator caches for that
purpose.
5. More bits used in the page flags.
6. ZONES have to be sized at bootup which creates more dangers of running
out of memory, possibly requiring more complex load balancing.
7. Having more zones increases fragmentation since the different zones
have separate freelists.
> > For that we would have to have a distinction of removable memory which
> > wont be necessary if we use the existing mappings to move the physical
> > location while keeping the virtual addresses.
>
> You're proposing that all kernel memory be virtually mapped?
>
> I've never seen such a proposal nor any implementation.
It has been that way for years on ia64 and x86_64 also has virtual maps
for all of kernel memory. x86_64 currently uses huge page entries for
the kernel (arch/x86_64/mm/init.c). ia64 has a special TLB entry generator
in arch/ia64/kernel/ivt.S. I assume that other arches do the same. I have
hacked the ia64 TLB entry generator for variable kernel page sizes (see
my memmap patches posted a while back on linux-ia64).
> Or maybe you're referring to something else. Please let's stop playing
> question-and-answer. Please provide sufficient information so that people
> can understand what you're saying.
In the case of x86_64 it is possible to drain pages from an area and then
switch from a huge mapping to page size mappings for the leftover pages by
creating the lower layer pte pages. Then these can be moved individually
if we can stop kernel accesses (need to have a quiescent state on all
processors for this IPI?) while switching the ptes.
AFAIK Virtual iron (last years OLS) simply used a virtual mapping for node
unplug. They drained all the memory via swap and then created a husk that
contained the remaining pages relocated to nodes still in use (I think
they called it a Zombie node which continued to exist while pages were
remaining or until the node was brought up again).
> Again. On the whole, that was a pretty useless email. Please give us
> something we can use.
Well review the discussions that we had regarding Mel Gorman's defrag
approaches. We discussed this in detail at the VM summit and decided to
not create additional zones but instead separate the free lists. You and
Linus seemed to be in agreement with this. I am a bit surprised ....
Is this a Google effect?
Moreover the discussion here is only remotely connected to the issue at
hand. We all agree that ZONE_DMA is bad and we want to have an alternate
scheme. Why not continue making it possible to not compile ZONE_DMA
dependent code into the kernel?
Single zone patches would increase VM performance. That would in turn
make it more difficult to get approaches in that require multiple zones
since the performance drop would be more significant.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-29 1:29 ` Christoph Lameter
@ 2006-10-29 11:32 ` Nick Piggin
2006-10-30 16:41 ` Christoph Lameter
2006-11-01 18:26 ` Mel Gorman
0 siblings, 2 replies; 83+ messages in thread
From: Nick Piggin @ 2006-10-29 11:32 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andrew Morton, KAMEZAWA Hiroyuki, linux-mm
Christoph Lameter wrote:
> On Sat, 28 Oct 2006, Andrew Morton wrote:
>
>
>>>We (and I personally with the prezeroing patches) have been down
>>>this road several times and did not like what we saw.
>>
>>Details?
>
>
> The most important issues that come to my mind right now (this has
> been discussed frequently in various contexts so I may be missing
> some things) are:
>
> 1. Duplicate the caches (pageset structures). This reduces cache hit
> rates. Duplicates lots of information in the page allocator.
You would have to do the same thing to get an O(1) per-CPU allocation
for a specific zone/reclaim type/etc regardless whether or not you use
zones.
> 2. Necessity of additional load balancing across multiple zones.
a. we have to do this anyway for eg. dma32 and NUMA, and b. it is much
better than the highmem problem was because all the memory is kernel
addressable.
If you use another scheme (eg. lists within zones within nodes, rather
than just more zones within nodes), then you still fundamentally have
to balance somehow.
> 3. The NUMA layer can only support memory policies for a single zone.
That's broken. The VM had zones long before it had nodes or memory
policies.
> 4. You may have to duplicate the slab allocator caches for that
> purpose.
If you want specific allocations from a given zone, yes. So you may
have to do the same if you want a specific slab allcoation from a
list within a zone.
> 5. More bits used in the page flags.
Aren't there patches to move the bits out of the page flags? A list
within zones approach would have to use either page flags or some
external info (eg. page pfn) to determine what list for the page to
go back to anyway, wouldn't you?
> 6. ZONES have to be sized at bootup which creates more dangers of runinng
> out of memory, possibly requiring more complex load balancing.
Mel's list based defrag approach requires complex load balancing too.
>>Again. On the whole, that was a pretty useless email. Please give us
>>something we can use.
>
>
> Well review the discussions that we had regarding Mel Gorman's defrag
> approaches. We discussed this in detail at the VM summit and decided to
> not create additional zones but instead separate the free lists. You and
> Linus seemed to be in agreement with this. I am a bit surprised ....
> Is this a Google effect?
>
> Moreover the discussion here is only remotely connected to the issue at
> hand. We all agree that ZONE_DMA is bad and we want to have an alternate
> scheme. Why not continue making it possible to not compile ZONE_DMA
> dependent code into the kernel?
>
> Single zone patches would increase VM performance. That would in turn
> make it more difficult to get approaches in that require multiple zones
> since the performance drop would be more significant.
node->zone->many lists vs node->many zones? I guess the zones approach is
faster?
Not that I am any more convinced that defragmentation is a good idea than
I was a year ago, but I think it is naive to think we can instantly be rid
of all the problems associated with zones by degenerating that layer of the
VM and introducing a new one that does basically the same things.
It is true that zones may not be a perfect fit for what some people want to
do, but until they have shown a) what they want to do is a good idea, and
b) zones can't easily be adapted, then using the infrastructure we already
have throughout the entire mm seems like a good idea.
IMO, Andrew's idea to have 1..N zones in a node seems sane and it would be
a good generalisation of even the present code.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-29 11:32 ` Nick Piggin
@ 2006-10-30 16:41 ` Christoph Lameter
2006-11-01 18:26 ` Mel Gorman
1 sibling, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-10-30 16:41 UTC (permalink / raw)
To: Nick Piggin; +Cc: Andrew Morton, KAMEZAWA Hiroyuki, linux-mm
On Sun, 29 Oct 2006, Nick Piggin wrote:
> > 1. Duplicate the caches (pageset structures). This reduces cache hit
> > rates. Duplicates lots of information in the page allocator.
>
> You would have to do the same thing to get an O(1) per-CPU allocation
> for a specific zone/reclaim type/etc regardless whether or not you use
> zones.
Duplicate caches reduce the hitrate of the cache and if there are
fluctuating usage scenarios then the cache may run cold.
> > 2. Necessity of additional load balancing across multiple zones.
>
> a. we have to do this anyway for eg. dma32 and NUMA, and b. it is much
> better than the highmem problem was because all the memory is kernel
> addressable.
Yes we have that but this is going to be more complex in the future if we
add additional zones. We don't need it with a single zone.
> If you use another scheme (eg. lists within zones within nodes, rather
> than just more zones within nodes), then you still fundamentally have
> to balance somehow.
The single zone scheme does not need this.
> > 3. The NUMA layer can only support memory policies for a single zone.
>
> That's broken. The VM had zones long before it had nodes or memory
> policies
NUMA nodes mostly only have one zone (ZONE_NORMAL on 64 bit and
ZONE_HIGHMEM on 32 bit). The only exception are low nodes (node 0 or 1?)
that may have additional DMA zones in some configurations.
> > 4. You may have to duplicate the slab allocator caches for that
> > purpose.
>
> If you want specific allocations from a given zone, yes. So you may
> have to do the same if you want a specific slab allcoation from a
> list within a zone.
I am still not sure what the lists within a zone are for? The proposal
was to reduce zones and not create additional lists.
> node->zone->many lists vs node->many zones? I guess the zones approach is
> faster?
No. Node->many_zone->freelist vs. node->one_zone->one_freelist in the regular case.
For Mel's defrag scheme one would need to add new lists but
then this will introduce more fragmentation in order to fix the
fragmentation issue. Still having lists within a zone would avoid the boot
up sizing of zones and avoid additional page flags.
> Not that I am any more convinced that defragmentation is a good idea than
> I was a year ago, but I think it is naive to think we can instantly be rid
> of all the problems associated with zones by degenerating that layer of the
> VM and introducing a new one that does basically the same things.
I am also having the same concerns. Going from multiple zones to one zone
is a performance benefit in many cases. In the NUMA case (if you have more
than a few nodes) most nodes only have one zone anyways.
> It is true that zones may not be a perfect fit for what some people want to
> do, but until they have shown a) what they want to do is a good idea, and
> b) zones can't easily be adapted, then using the infrastructure we already
> have throughout the entire mm seems like a good idea.
I have never said that people cannot add zones. But this is usually not
necessary. The intent here is to optimize for the case that we only have
one zone. Single zone configurations will have a smaller VM with less
cache footprint and run faster.
> IMO, Andrew's idea to have 1..N zones in a node seems sane and it would be
> a good generalisation of even the present code.
We already have multiple zones, and it is fairly easy to add a zone. If
someone has an idea how to generalize this then please do so. I do not see
how that could be done given the different usage scenarios for the various
zones.
But why is not okay to optimize the kernel for the one zone situation?
I prefer a simple, small and fast VM and this only optimizing the VM by
not compiling code that is only needed for configurations that require
multiple zones.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-29 11:32 ` Nick Piggin
2006-10-30 16:41 ` Christoph Lameter
@ 2006-11-01 18:26 ` Mel Gorman
2006-11-01 20:34 ` Andrew Morton
1 sibling, 1 reply; 83+ messages in thread
From: Mel Gorman @ 2006-11-01 18:26 UTC (permalink / raw)
To: Nick Piggin; +Cc: Christoph Lameter, Andrew Morton, KAMEZAWA Hiroyuki, linux-mm
On (29/10/06 22:32), Nick Piggin didst pronounce:
> Christoph Lameter wrote:
> >On Sat, 28 Oct 2006, Andrew Morton wrote:
> >
> >
> >>>We (and I personally with the prezeroing patches) have been down
> >>>this road several times and did not like what we saw.
> >>
> >>Details?
> >
> >
> >The most important issues that come to my mind right now (this has
> >been discussed frequently in various contexts so I may be missing
> >some things) are:
> >
> >1. Duplicate the caches (pageset structures). This reduces cache hit
> > rates. Duplicates lots of information in the page allocator.
>
> You would have to do the same thing to get an O(1) per-CPU allocation
> for a specific zone/reclaim type/etc regardless whether or not you use
> zones.
>
> >2. Necessity of additional load balancing across multiple zones.
>
> a. we have to do this anyway for eg. dma32 and NUMA, and b. it is much
> better than the highmem problem was because all the memory is kernel
> addressable.
>
> If you use another scheme (eg. lists within zones within nodes, rather
> than just more zones within nodes), then you still fundamentally have
> to balance somehow.
>
> >3. The NUMA layer can only support memory policies for a single zone.
>
> That's broken. The VM had zones long before it had nodes or memory
> policies.
>
> >4. You may have to duplicate the slab allocator caches for that
> > purpose.
>
> If you want specific allocations from a given zone, yes. So you may
> have to do the same if you want a specific slab allcoation from a
> list within a zone.
>
> >5. More bits used in the page flags.
>
> Aren't there patches to move the bits out of the page flags? A list
> within zones approach would have to use either page flags or some
> external info (eg. page pfn) to determine what list for the page to
> go back to anyway, wouldn't you?
>
> >6. ZONES have to be sized at bootup which creates more dangers of runinng
> > out of memory, possibly requiring more complex load balancing.
>
> Mel's list based defrag approach requires complex load balancing too.
>
I never really got this objection. With list-based anti-frag, the
zone-balancing logic remains the same. There are patches from Andy
Whitcroft that reclaims pages in contiguous blocks, but still with the same
zone-ordering. It doesn't affect load balancing between zones as such.
With zone-based anti-fragmentation, the load balancing was a bit more
entertaining all right.
In the context of memory hot-unplug though, list-based anti-fragmentation
only really helps you if you can unplug regions of size MAX_ORDER_NR_PAGES. If
you go over that, you need zones.
> >>Again. On the whole, that was a pretty useless email. Please give us
> >>something we can use.
> >
> >
> >Well review the discussions that we had regarding Mel Gorman's defrag
> >approaches. We discussed this in detail at the VM summit and decided to
> >not create additional zones but instead separate the free lists. You and
> >Linus seemed to be in agreement with this. I am a bit surprised ....
> >Is this a Google effect?
> >
> >Moreover the discussion here is only remotely connected to the issue at
> >hand. We all agree that ZONE_DMA is bad and we want to have an alternate
> >scheme. Why not continue making it possible to not compile ZONE_DMA
> >dependent code into the kernel?
> >
> >Single zone patches would increase VM performance. That would in turn
> >make it more difficult to get approaches in that require multiple zones
> >since the performance drop would be more significant.
>
> node->zone->many lists vs node->many zones? I guess the zones approach is
> faster?
>
Not really. If I have a zone with two sets of free lists or two zones with
one set of free lists each, there are the same number of lists. However, for
anti-fragmentation with additional lists, you frequently use the preferred list
because they size themselves based on allocator usage patterns. With zones,
you *must* get the zone sizes right or the performance hit for zone
fallbacks starts becoming noticeable.
> Not that I am any more convinced that defragmentation is a good idea than
> I was a year ago, but I think it is naive to think we can instantly be rid
> of all the problems associated with zones by degenerating that layer of the
> VM and introducing a new one that does basically the same things.
>
> It is true that zones may not be a perfect fit for what some people want to
> do, but until they have shown a) what they want to do is a good idea, and
> b) zones can't easily be adapted, then using the infrastructure we already
> have throughout the entire mm seems like a good idea.
>
> IMO, Andrew's idea to have 1..N zones in a node seems sane and it would be
> a good generalisation of even the present code.
>
> --
> SUSE Labs, Novell Inc.
> Send instant messages to your online friends http://au.messenger.yahoo.com
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 18:26 ` Mel Gorman
@ 2006-11-01 20:34 ` Andrew Morton
2006-11-01 21:00 ` Christoph Lameter
2006-11-01 22:10 ` Mel Gorman
0 siblings, 2 replies; 83+ messages in thread
From: Andrew Morton @ 2006-11-01 20:34 UTC (permalink / raw)
To: Mel Gorman; +Cc: Nick Piggin, Christoph Lameter, KAMEZAWA Hiroyuki, linux-mm
On Wed, 1 Nov 2006 18:26:05 +0000
mel@skynet.ie (Mel Gorman) wrote:
> I never really got this objection. With list-based anti-frag, the
> zone-balancing logic remains the same. There are patches from Andy
> Whitcroft that reclaims pages in contiguous blocks, but still with the same
> zone-ordering. It doesn't affect load balancing between zones as such.
I do believe that lumpy-reclaim (initiated by Andy, redone and prototyped
by Peter, cruelly abandoned) is a preferable approach to solving the
fragmentation problem.
And with __GFP_EASYRECLAIM (please - I just renamed it ;)) (or using
__GFP_HIGHMEM for the same thing) then some of the core lumpy-reclaim
algorithm can be reused for hot-unplug.
If you want to unplug a range of memory then it has to be in a zone which
is 100% __GFP_EASY_RECLAIM (actually the name is still wrong. It should
just be __GFP_RECLAIMABLE).
The hot-unplug code will go through those pages and it will, with 100%
reliability, rip those pages out of the kernel via various means. I think
this can all be done.
And hot-unplug isn't actually the interesting application. Modern Intel
memory controllers apparently have (or will have) the ability to power down
DIMMs.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 20:34 ` Andrew Morton
@ 2006-11-01 21:00 ` Christoph Lameter
2006-11-01 21:46 ` Andrew Morton
2006-11-01 22:13 ` Mel Gorman
2006-11-01 22:10 ` Mel Gorman
1 sibling, 2 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-01 21:00 UTC (permalink / raw)
To: Andrew Morton; +Cc: Mel Gorman, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Wed, 1 Nov 2006, Andrew Morton wrote:
> And hot-unplug isn't actually the interesting application. Modern Intel
> memory controllers apparently have (or will have) the ability to power down
> DIMMs.
Plus one would want to be able to move memory out of an area where we may
have a bad DIMM. If we monitor soft ECC failures then we could also
judge a DIMM to be bad if we have a too high soft failure rate.
If there is a hard failure and we can recover (page cache page f.e.)
then we could preemptively disable the complete DIMM.
I still think that we need to generalize the approach to be
able to cover as much memory as possible. Remapping can solve some of the
issues, for others we could add additional ways to make things movable.
F.e. one could make page table pages movable by adding a back pointer to
the mm, reclaimable slab pages by adding a move function, driver
allocations could have a backpointer to the driver that would be able to
move its memory. Hmm.... Maybe generally a way to provide a
function to move data in the page struct for kernel allocations?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 21:00 ` Christoph Lameter
@ 2006-11-01 21:46 ` Andrew Morton
2006-11-01 21:50 ` Christoph Lameter
2006-11-01 22:13 ` Mel Gorman
1 sibling, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-11-01 21:46 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Mel Gorman, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Wed, 1 Nov 2006 13:00:55 -0800 (PST)
Christoph Lameter <clameter@sgi.com> wrote:
> On Wed, 1 Nov 2006, Andrew Morton wrote:
>
> > And hot-unplug isn't actually the interesting application. Modern Intel
> > memory controllers apparently have (or will have) the ability to power down
> > DIMMs.
>
> Plus one would want to be able to move memory out of an area where we may
> have a bad DIMM. If we monitor soft ECC failures then we could also
> judge a DIMM to be bad if we have a too high soft failure rate.
>
> If there is a hard failure and we can recover (page cache page f.e.)
> then we could preemptively disable the complete DIMM.
Point.
> I still think that we need to generalize the approach to be
> able to cover as much memory as possible. Remapping can solve some of the
> issues, for others we could add additional ways to make things movable.
> F.e. one could make page table pages movable by adding a back pointer to
> the mm, reclaimable slab pages by adding a move function, driver
> allocations could have a backpointer to the driver that would be able to
> move its memory. Hmm.... Maybe generally a way to provide a
> function to move data in the page struct for kernel allocations?
Sounds hard. That's all Version 2 ;)
(For example, I do recall working out that going from a slab-page up to a
buffer_head and then reclaiming that buffer_head is basically impossible
from a locking POV, but I forget the details..)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 21:46 ` Andrew Morton
@ 2006-11-01 21:50 ` Christoph Lameter
0 siblings, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-01 21:50 UTC (permalink / raw)
To: Andrew Morton; +Cc: Mel Gorman, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Wed, 1 Nov 2006, Andrew Morton wrote:
> Sounds hard. That's all Version 2 ;)
Well if we won't start small then we'll never get to it. I had hoped that
the page migration patches would make the hotplug folks start such
efforts. Start with the simple and then work the way up.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 21:00 ` Christoph Lameter
2006-11-01 21:46 ` Andrew Morton
@ 2006-11-01 22:13 ` Mel Gorman
2006-11-01 23:29 ` Christoph Lameter
1 sibling, 1 reply; 83+ messages in thread
From: Mel Gorman @ 2006-11-01 22:13 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Mel Gorman, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Wed, 1 Nov 2006, Christoph Lameter wrote:
> On Wed, 1 Nov 2006, Andrew Morton wrote:
>
>> And hot-unplug isn't actually the interesting application. Modern Intel
>> memory controllers apparently have (or will have) the ability to power down
>> DIMMs.
>
> Plus one would want to be able to move memory out of an area where we may
> have a bad DIMM. If we monitor soft ECC failures then we could also
> judge a DIMM to be bad if we have a too high soft failure rate.
>
For this, it'd be desirable to be able to mark a range of pages as
unusable. In the anti-frag patches I posted, I included a mechanism for
having flags that affected a whole block of pages. One intent in the
future was to be able to mark a whole block of pages as getting reclaimed
for the allocation of superpages.
The same mechanism could be used to mark pages as being offlined so you
could mark a DIMM as offlined and start reclaiming in there knowing it can
be unplugged some time in the future.
> If there is a hard failure and we can recover (page cache page f.e.)
> then we could preemptively disable the complete DIMM.
>
> I still think that we need to generalize the approach to be
> able to cover as much memory as possible. Remapping can solve some of the
> issues, for others we could add additional ways to make things movable.
> F.e. one could make page table pages movable by adding a back pointer to
> the mm, reclaimable slab pages by adding a move function, driver
> allocations could have a backpointer to the driver that would be able to
> move its memory.
I got the impression that we wouldn't be allowed to introduce such a
mechanism because driver writers would get it wrong. It was why proper
defragmentation was never really implemented.
> Hmm.... Maybe generally a way to provide a
> function to move data in the page struct for kernel allocations?
>
As devices are able to get physical addresses which then get pinned for
IO, it gets messy.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 22:13 ` Mel Gorman
@ 2006-11-01 23:29 ` Christoph Lameter
2006-11-02 0:22 ` Andrew Morton
2006-11-02 12:45 ` Mel Gorman
0 siblings, 2 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-01 23:29 UTC (permalink / raw)
To: Mel Gorman
Cc: Andrew Morton, Mel Gorman, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Wed, 1 Nov 2006, Mel Gorman wrote:
> > I still think that we need to generalize the approach to be
> > able to cover as much memory as possible. Remapping can solve some of the
> > issues, for others we could add additional ways to make things movable.
> > F.e. one could make page table pages movable by adding a back pointer to
> > the mm, reclaimable slab pages by adding a move function, driver
> > allocations could have a backpointer to the driver that would be able to
> > move its memory.
>
> I got the impression that we wouldn't be allowed to introduce such a mechanism
> because driver writers would get it wrong. It was why proper defragmentation
> was never really implemented.
I think that choice is better than fiddling with the VM by adding
additional zones which will introduce lots of other problems.
The ability to move memory in general is beneficial for many purposes.
Defragmentation is certainly one of them. If all memory would be movable
then you would not need the separate list in the zone either.
Maybe we can have special mempools for unreclaimable
allocations for starters and with that have the rest of memory be
movable? Then we can gradually reduce the need for unreclaimable memory.
Maybe we can keep unmovable memory completely out of the page allocator?
With that approach we would not break the NUMA layer because we can keep
the one zone per node approach for memory policies. The special
unreclaimable memory would not obey memory policies (which makes sense
since device drivers do not want user space policies applied to their
allocations anyways. Device drivers need memory near the device).
> > Hmm.... Maybe generally a way to provide a
> > function to move data in the page struct for kernel allocations?
> >
> As devices are able to get physical addresses which then get pinned for IO, it
> gets messy.
Right. So the device needs to disengage and then move its structures.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 23:29 ` Christoph Lameter
@ 2006-11-02 0:22 ` Andrew Morton
2006-11-02 0:27 ` Christoph Lameter
2006-11-02 12:45 ` Mel Gorman
1 sibling, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-11-02 0:22 UTC (permalink / raw)
To: Christoph Lameter
Cc: Mel Gorman, Mel Gorman, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Wed, 1 Nov 2006 15:29:11 -0800 (PST)
Christoph Lameter <clameter@sgi.com> wrote:
> On Wed, 1 Nov 2006, Mel Gorman wrote:
>
> > > I still think that we need to generalize the approach to be
> > > able to cover as much memory as possible. Remapping can solve some of the
> > > issues, for others we could add additional ways to make things movable.
> > > F.e. one could make page table pages movable by adding a back pointer to
> > > the mm, reclaimable slab pages by adding a move function, driver
> > > allocations could have a backpointer to the driver that would be able to
> > > move its memory.
> >
> > I got the impression that we wouldn't be allowed to introduce such a mechanism
> > because driver writers would get it wrong. It was why proper defragmentation
> > was never really implemented.
>
> I think that choice is better than fiddling with the VM by adding
> additional zones which will introduce lots of other problems.
What lots of other problems? 64x64MB zones works good.
> The ability to move memory in general is beneficial for many purposes.
> Defragmentation is certainly one of them. If all memory would be movable
> then you would not need the separate list in the zone either.
>
> Maybe we can have special mempools for unreclaimable
> allocations for starters and with that have the rest of memory be
> movable? Then we can gradually reduce the need for unreclaimable memory.
> Maybe we can keep unmovable memory completely out of the page allocator?
>
> With that approach we would not break the NUMA layer because we can keep
> the one zone per node approach for memory policies. The special
> unreclaimable memory would not obey memory policies (which makes sense
> since device drivers do not want user space policies applied to their
> allocations anyways. Device drivers need memory near the device).
>
> > > Hmm.... Maybe generally a way to provide a
> > > function to move data in the page struct for kernel allocations?
> > >
> > As devices are able to get physical addresses which then get pinned for IO, it
> > gets messy.
>
> Right. So the device needs to disengage and then move its structures.
I don't think we have a snowball's chance of making all kernel memory
relocatable. Or even a useful amount of it.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 0:22 ` Andrew Morton
@ 2006-11-02 0:27 ` Christoph Lameter
0 siblings, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-02 0:27 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Mel Gorman, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On Wed, 1 Nov 2006, Andrew Morton wrote:
> > I think that choice is better than fiddling with the VM by adding
> > additional zones which will introduce lots of other problems.
>
> What lots of other problems? 64x64MB zones works good.
Read my earlier mail on this. Certainly you can make this work for a
specialized load that does not use all kernel features.
> > Right. So the device needs to disengage and then move its structures.
>
> I don't think we have a snowball's chance of making all kernel memory
> relocatable. Or even a useful amount of it.
In the simplest case the device would close down, free all of its memory,
and then start up again, reallocating necessary memory?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 23:29 ` Christoph Lameter
2006-11-02 0:22 ` Andrew Morton
@ 2006-11-02 12:45 ` Mel Gorman
1 sibling, 0 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-02 12:45 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List
On Wed, 1 Nov 2006, Christoph Lameter wrote:
> On Wed, 1 Nov 2006, Mel Gorman wrote:
>
>>> I still think that we need to generalize the approach to be
>>> able to cover as much memory as possible. Remapping can solve some of the
>>> issues, for others we could add additional ways to make things movable.
>>> F.e. one could make page table pages movable by adding a back pointer to
>>> the mm, reclaimable slab pages by adding a move function, driver
>>> allocations could have a backpointer to the driver that would be able to
>>> move its memory.
>>
>> I got the impression that we wouldn't be allowed to introduce such a mechanism
>> because driver writers would get it wrong. It was why proper defragmentation
>> was never really implemented.
>
> I think that choice is better than fiddling with the VM by adding
> additional zones which will introduce lots of other problems.
>
The amount of work required to overhaul the device driver model and then
update each driver makes the suspend problem look straight-forward.
> The ability to move memory in general is beneficial for many purposes.
> Defragmentation is certainly one of them. If all memory would be movable
> then you would not need the separate list in the zone either.
>
If we had that ability, sure, but it's pie-in-the-sky. Page migration as
it is can move easily reclaimable pages. With list-based anti-frag
clustering non-movable allocations together as much as possible, I could
use the existing page migration infrastructure to keep MAX_ORDER_NR_PAGES
blocks of pages containing kernel pages free of easily-reclaimed
allocations and vastly improve the resizing of the huge pages pool at
runtime which is my *primary* concern, not memory hot unplug.
If hot unplug became a big issue again, __rmqueue_fallback() from the
list-based anti-frag patches could be taught how to place kernel
non-reclaimable (different to short-lived kernel allocation or caches)
blocks of pages always at the lower PFNs and migrate existing
easily-reclaimed pages out of there without adding zones. This would allow
hot unplug of higher PFN ranges. This would be easier than making all
kernel memory movable.
With the additional lists in the buddy allocator as well, I believe we
could get rid of the special code managing hugepage pools and put it all
back into the buddy allocator as an additional list that is only used for
hugepage allocations. That would remove some hugepage-specific code for
page allocation.
If I can get list-based anti-frag merged into a testing tree for a while,
I can start working on this sort of thing properly as well as finding out
for sure if anti-frag helps the resizing of the huge page pool or not.
> Maybe we can have special mempools for unreclaimable
> allocations for starters and with that have the rest of memory be
> movable? Then we can gradually reduce the need for unreclaimable memory.
> Maybe we can keep unmovable memory completely out of the page allocator?
>
Mempools that grow or shrink in MAX_ORDER_NR_PAGES blocks is an option
although balancing gets tricky. You'd have pages that are
free-but-not-free stuck in there.
> With that approach we would not break the NUMA layer because we can keep
> the one zone per node approach for memory policies. The special
> unreclaimable memory would not obey memory policies (which makes sense
> since device drivers do not want user space policies applied to their
> allocations anyways. Device drivers need memory near the device).
>
>>> Hmm.... Maybe generally a way to provide a
>>> function to move data in the page struct for kernel allocations?
>>>
>> As devices are able to get physical addresses which then get pinned for IO, it
>> gets messy.
>
> Right. So the device needs to disengage and then move its structures.
>
That is far easier said than done.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 20:34 ` Andrew Morton
2006-11-01 21:00 ` Christoph Lameter
@ 2006-11-01 22:10 ` Mel Gorman
2006-11-02 17:37 ` Andy Whitcroft
2006-11-02 18:52 ` Andrew Morton
1 sibling, 2 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-01 22:10 UTC (permalink / raw)
To: Andrew Morton
Cc: Nick Piggin, Christoph Lameter, KAMEZAWA Hiroyuki,
Andy Whitcroft, Linux Memory Management List
On Wed, 1 Nov 2006, Andrew Morton wrote:
> On Wed, 1 Nov 2006 18:26:05 +0000
> mel@skynet.ie (Mel Gorman) wrote:
>
>> I never really got this objection. With list-based anti-frag, the
>> zone-balancing logic remains the same. There are patches from Andy
>> Whitcroft that reclaims pages in contiguous blocks, but still with the same
>> zone-ordering. It doesn't affect load balancing between zones as such.
>
> I do believe that lumpy-reclaim (initiated by Andy, redone and prototyped
> by Peter, cruelly abandoned) is a preferable approach to solving the
> fragmentation approach.
>
On its own, lumpy-reclaim or linear-reclaim were not enough to get
MAX_ORDER_NR_PAGES blocks of contiguous pages and these were of interest
for huge pages although not necessarily of much use to memory hot-unplug.
Tests with linear reclaim and lumpy reclaim showed them to be marginally
(very marginal) better than just using the standard allocator and standard
reclaim. The clustering by reclaim type (or having a separate zone) was
still needed.
> And with __GFP_EASYRECLAIM (please - I just renamed it ;))
Sure.
> (or using
> __GFP_HIGHMEM for the same thing)
From a fragmentation perspective, __GFP_HIGHUSER on its own was not
enough. Block device pages for example or pages allocated by submit_bh()
are largely reclaimable but not allocated with __GFP_HIGHUSER.
> then some of the core lumpy-reclaim algorithm can be reused for hot-unplug.
>
> If you want to unplug a range of memory then it has to be in a zone which
> is 100% __GFP_EASY_RECLAIM (actually the name is still wrong. It should
> just be __GFP_RECLAIMABLE).
>
The "EASY" was in the title because I named kernel allocations that were
short-lived or belonging to caches KERNRCLM or KERNEL_RECLAIMABLE now I
suppose. It made a difference to how effective list-based anti-frag was
under pressure.
> The hot-unplug code will go through those pages and it will, with 100%
> reliability, rip those pages out of the kernel via various means. I think
> this can all be done.
>
The unplug code used to exist and I recall being able to offline memory
and bring it back online again.
> And hot-unplug isn't actually the interesting application. Modern Intel
> memory controllers apparently have (or will have) the ability to power down
> DIMMs.
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 22:10 ` Mel Gorman
@ 2006-11-02 17:37 ` Andy Whitcroft
2006-11-02 18:08 ` Christoph Lameter
2006-11-02 18:52 ` Andrew Morton
1 sibling, 1 reply; 83+ messages in thread
From: Andy Whitcroft @ 2006-11-02 17:37 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Nick Piggin, Christoph Lameter, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
Mel Gorman wrote:
> On Wed, 1 Nov 2006, Andrew Morton wrote:
>
>> On Wed, 1 Nov 2006 18:26:05 +0000
>> mel@skynet.ie (Mel Gorman) wrote:
>>
>>> I never really got this objection. With list-based anti-frag, the
>>> zone-balancing logic remains the same. There are patches from Andy
>>> Whitcroft that reclaims pages in contiguous blocks, but still with
>>> the same
>>> zone-ordering. It doesn't affect load balancing between zones as such.
>>
>> I do believe that lumpy-reclaim (initiated by Andy, redone and prototyped
>> by Peter, cruelly abandoned) is a preferable approach to solving the
>> fragmentation approach.
>>
Heh, I've talked to Peter and apologised for its apparent abandonment.
In fact the problem is that a huge amount of time has been consumed
papering over the cracks in the last few releases; I for one feel this
has been the most unstable "merge window" we've ever had.
> On its own, lumpy-reclaim or linear-reclaim were not enough to get
> MAX_ORDER_NR_PAGES blocks of contiguous pages and these were of interest
> for huge pages although not necessarily of much use to memory
> hot-unplug. Tests with linear reclaim and lumpy reclaim showed them to
> be marginally (very marginal) better than just using the standard
> allocator and standard reclaim. The clustering by reclaim type (or
> having a separate zone) was still needed.
As Mel indicates a reclaim algorithm change is not enough. Without
thoughtful placement of the non-reclaimable kernel allocations we end up
with no reclaimable blocks regardless of algorithm. Unless we are going
to allow all pages to be reclaimed (which is a massive job of
unthinkable proportions IMO) then we need some kind of placement scheme
to aid reclaim.
To illustrate this I have pulled together some figures from some testing
we have managed to get through. All figures represent the percentage of
overall memory which could be allocated at MAX_ORDER-1 at rest after a
period of high fragmentation activity:
ppc64 x86_64
baseline 9 % 21 %
linear-reclaim-v1 9 % 21 %
linear-reclaim-v1 listbased-v26 59 % 72 %
lumpy-reclaim-v2 11 % 16 %
lumpy-reclaim-v2 listbased-v26 24 % 57 %
Also as a graph at the following URL:
http://www.shadowen.org/~apw/public/reclaim/reclaim-rates.png
The comparison between the baseline and baseline + reclaim algorithm
shows that we gain near nothing with just that change. Bring in the
placement and we see real gains.
I am currently working on a variant of lumpy reclaim to try and bridge
the gap between it and linear without losing its graceful simplicity.
-apw
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-11-02 17:37 ` Andy Whitcroft
@ 2006-11-02 18:08 ` Christoph Lameter
2006-11-02 20:58 ` Mel Gorman
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-02 18:08 UTC (permalink / raw)
To: Andy Whitcroft
Cc: Andrew Morton, Mel Gorman, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Thu, 2 Nov 2006, Andy Whitcroft wrote:
> with no reclaimable blocks regardless of algorithm. Unless we are going
> to allow all pages to be reclaimed (which is a massive job of
> unthinkable proportions IMO) then we need some kind of placement scheme
> to aid reclaim.
The pages clearly need to be separated according to movable and
unmovable. However, I think reclaimable needs to be the default
and some simple measures will make a significant portion of the pages that
we cannot currently move movable.
Unmovable pages need to be managed with some sort of special scheme and
need to be kept together in a separate pool or something. They do not
need memory policy support f.e. Regular allocations should be left
unchanged and continue to be handled as is. Unmovable pages may have a
special flag or be handled in some special way.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 18:08 ` Christoph Lameter
@ 2006-11-02 20:58 ` Mel Gorman
2006-11-02 21:04 ` Christoph Lameter
2006-11-02 21:52 ` Christoph Lameter
0 siblings, 2 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-02 20:58 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Thu, 2 Nov 2006, Christoph Lameter wrote:
> On Thu, 2 Nov 2006, Andy Whitcroft wrote:
>
>> with no reclaimable blocks regardless of algorithm. Unless we are going
>> to allow all pages to be reclaimed (which is a massive job of
>> unthinkable proportions IMO) then we need some kind of placement scheme
>> to aid reclaim.
>
> The pages clearly need to be separated according to movable and
> unmovable. However, I think reclaimable needs to be the default
> and some simple measures will make a significant portion of the pages that
> we cannot currently move movable.
>
Ok... list-based anti-frag identified three types of pages. From the
leading mail;
EasyReclaimable - These are userspace pages that are easily reclaimable. This
flag is set when it is known that the pages will be trivially reclaimed
by writing the page out to swap or syncing with backing storage
KernelReclaimable - These are allocations for some kernel caches that are
reclaimable or allocations that are known to be very short-lived.
KernelNonReclaimable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
The EasyReclaimable and KernelReclaimable allocations are marked with
__GFP flags.
Now, you want to separate pages according to movable and unmovable.
Broadly speaking, EasyReclaimable == Movable and
KernelReclaimable+KernelNonReclaimable == Non-Movable. However, while
KernelReclaimable are Non-Movable, they can be reclaimed by purging
caches. So, if we redefined the three terms to be Movable, Reclaimable and
Non-Movable, you get the separation you are looking for at least within a
MAX_ORDER_NR_PAGES.
> Unmovable pages need to be managed with some sort of special scheme and
> need to be kept together in a separate pool or something. They do not
> need memory policy support f.e. Regular allocations should be left
> unchanged and continue to be handled as is. Unmovable pages may have a
> special flag or be handled in some special way.
>
"Special way" to me is just "place them somewhere smart". If their
location was really important for hot-unplug, a placement policy could
always use MAX_ORDER_NR_PAGES at the lower PFNs in a zone for them. This
should be easier than introducing additional memory pools, zones or other
mechanisms.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-11-02 20:58 ` Mel Gorman
@ 2006-11-02 21:04 ` Christoph Lameter
2006-11-02 21:16 ` Mel Gorman
2006-11-02 21:52 ` Christoph Lameter
1 sibling, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-02 21:04 UTC (permalink / raw)
To: Mel Gorman
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Thu, 2 Nov 2006, Mel Gorman wrote:
> "Special way" to me is just "place them somewhere smart". If their location
> was really important for hot-unplug, a placement policy could always use
> MAX_ORDER_NR_PAGES at the lower PFNs in a zone for them. This should be easier
> than introducing additional memory pools, zones or other mechanisms.
This is going to be fine for hotplug in terms of a portion of a
node going down. However, at some point we would like to have node
plug and unplug.
An unpluggable NUMA node must only allow movable memory and all non
movable allocations will need to be redirected to other nodes.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 21:04 ` Christoph Lameter
@ 2006-11-02 21:16 ` Mel Gorman
0 siblings, 0 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-02 21:16 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Thu, 2 Nov 2006, Christoph Lameter wrote:
> On Thu, 2 Nov 2006, Mel Gorman wrote:
>
>> "Special way" to me is just "place them somewhere smart". If their location
>> was really important for hot-unplug, a placement policy could always use
>> MAX_ORDER_NR_PAGES at the lower PFNs in a zone for them. This should be easier
>> than introducing additional memory pools, zones or other mechanisms.
>
> This is going to be fine for hotplug in terms of a portion of a
> node going down. However, at some point we would like to have node
> plug and unplug.
>
Ok
> An unpluggable NUMA node must only allow movable memory and all non
> movable allocations will need to be redirected to other nodes.
>
That would be doable with antifrag. If a node is marked unpluggable and
gfpflags_to_rclmtype() == RCLM_NORCLM, then skip the node and fallback as
normal to the next node.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 20:58 ` Mel Gorman
2006-11-02 21:04 ` Christoph Lameter
@ 2006-11-02 21:52 ` Christoph Lameter
2006-11-02 22:37 ` Mel Gorman
2006-11-03 12:48 ` Peter Zijlstra
1 sibling, 2 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-02 21:52 UTC (permalink / raw)
To: Mel Gorman
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Thu, 2 Nov 2006, Mel Gorman wrote:
> Ok... list-based anti-frag identified three types of pages. From the leading
> mail;
>
> EasyReclaimable - These are userspace pages that are easily reclaimable. This
> flag is set when it is known that the pages will be trivially
> reclaimed
> by writing the page out to swap or syncing with backing storage
>
> KernelReclaimable - These are allocations for some kernel caches that are
> reclaimable or allocations that are known to be very short-lived.
>
> KernelNonReclaimable - These are pages that are allocated by the kernel that
> are not trivially reclaimed. For example, the memory allocated for a
> loaded module would be in this category. By default, allocations are
> considered to be of this type
>
> The EasyReclaimable and KernelReclaimable allocations are marked with __GFP
> flags.
>
> Now, you want to separate pages according to movable and unmovable. Broadly
> speaking, EasyReclaimable == Movable and
> KernelReclaimable+KernelNonReclaimable == Non-Movable. However, while
> KernelReclaimable are Non-Movable, they can be reclaimed by purging caches.
> So, if we redefined the three terms to be Movable, Reclaimable and
> Non-Movable, you get the separation you are looking for at least within a
> MAX_ORDER_NR_PAGES.
I think talking about reclaim here is not what you want. Defragmentation
is fundamentally about moving memory, not reclaim. Reclaim is a way of
evicting pages from memory to avoid the move. This may be useful if memory
is filled up because defragging can then do what swapping would have to
do. However, evicting pages means that they have to be reread. Page
migration can migrate pages at 1GB/sec which is certainly much higher
than having to reread the page.
Also I think the reclaim idea breaks down in the following cases:
1. An mlocked page. This is a page that is movable but not reclaimable.
How does defrag handle that case right now? It should really move the
page if necessary.
2. There are a number of unreclaimable page types that are easily movable.
F.e. page table pages are movable if you take a write-lock on mmap_sem
and handle the tree carefully. These pages again are not reclaimable but
they are movable.
Various caching objects in the slab (cpucache align cache etc) are also
easily movable. If we put them into a separate slab cache then we could
make them movable.
Certain device drivers may be able to shut down intermittently, releasing
their memory and reallocating it later. This also may be used to move
memory. Memory allocated by such a device driver is movable.
I would suggest to not categorize pages according to their reclaimability
but according to their movability. The decision to evict a page (reclaim)
is something that may be useful to avoid swap but it may be better to keep
pages in memory.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 21:52 ` Christoph Lameter
@ 2006-11-02 22:37 ` Mel Gorman
2006-11-02 22:50 ` Christoph Lameter
2006-11-03 12:48 ` Peter Zijlstra
1 sibling, 1 reply; 83+ messages in thread
From: Mel Gorman @ 2006-11-02 22:37 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Thu, 2 Nov 2006, Christoph Lameter wrote:
> On Thu, 2 Nov 2006, Mel Gorman wrote:
>
>> Ok... list-based anti-frag identified three types of pages. From the leading
>> mail;
>>
>> EasyReclaimable - These are userspace pages that are easily reclaimable. This
>> flag is set when it is known that the pages will be trivially
>> reclaimed
>> by writing the page out to swap or syncing with backing storage
>>
>> KernelReclaimable - These are allocations for some kernel caches that are
>> reclaimable or allocations that are known to be very short-lived.
>>
>> KernelNonReclaimable - These are pages that are allocated by the kernel that
>> are not trivially reclaimed. For example, the memory allocated for a
>> loaded module would be in this category. By default, allocations are
>> considered to be of this type
>>
>> The EasyReclaimable and KernelReclaimable allocations are marked with __GFP
>> flags.
>>
>> Now, you want to separate pages according to movable and unmovable. Broadly
>> speaking, EasyReclaimable == Movable and
>> KernelReclaimable+KernelNonReclaimable == Non-Movable. However, while
>> KernelReclaimable are Non-Movable, they can be reclaimed by purging caches.
>> So, if we redefined the three terms to be Movable, Reclaimable and
>> Non-Movable, you get the separation you are looking for at least within a
>> MAX_ORDER_NR_PAGES.
>
> I think talking about reclaim here is not what you want. Defragmentation
> is fundamentally about moving memory, not reclaim.
Sure. That is why I called the mechanism anti-fragmentation, not
defragmentation. However, If reclaimable pages are clustered together, you
know they are moveable as well. Once the pages are clustered together in
an intelligent manner, a hypothetical defragmenter would have less work to
do. Additionally, once the defragmenter starts working, you know it's less
likely to hit unmovable pages.
> Reclaim is a way of
> evicting pages from memory to avoid the move. This may be useful if memory
> is filled up because defragging can then do what swapping would have to
> do. However, evicting pages means that they have to be reread. Page
> migration can migrate pages at 1GB/sec which is certainly much higher
> than having to reread the page.
>
The reason why anti-frag currently reclaims is because reclaiming was easy
and happens under memory pressure not because I thought pageout was free.
As a proof-of-concept, I needed to show that pages clustered on
reclaimability would free contiguous blocks of pages later. There was no
point starting with defragmentation when I knew that unmovable pages would
be with movable pages in the same MAX_ORDER_NR_PAGES block.
> Also I think the reclaim idea breaks down in the following cases:
>
> 1. An mlocked page. This is a page that is movable but not reclaimable.
> How does defrag handle that case right now? It should really move the
> page if necessary.
>
Defrag doesn't exist right now. If anti-frag got some traction, working on
using page migration to handle movable-but-not-reclaimable pages would be
the next step. Pages that are mlocked() will have been allocated with
__GFP_EASYRCLM so will be clustered together with other movable pages.
> 2. There are a number of unreclaimable page types that are easily movable.
> F.e. page table pages are movable if you take a write-lock on mmap_sem
> and handle the tree carefully. These pages again are not reclaimable but
> they are movable.
>
Page tables are currently not allocated with __GFP_EASYRCLM because I knew
I couldn't reclaim them without killing processes. However, if page
migration within ranges was implemented, we'd start clustering based on
movability instead of reclaimability.
> Various caching objects in the slab (cpucache align cache etc) are also
> easily movable. If we put them into a separate slab cache then we could
> make them movable.
>
As subsystems will have pointers to objects within the slab, I doubt they
are easily movable but I'll take your word on it for the moment.
> Certain device drivers may be able to shut down intermittently, releasing
> their memory and reallocating it later. This also may be used to move
> memory. Memory allocated by such a device driver is movable.
>
If such a driver existed in the future, their allocations could be marked
and clustered together with other movable allocations.
> I would suggest to not categorize pages according to their reclaimability
> but according to their movability.
ok, I see your point. However, reclaimability seems a reasonable starting
point. If I know pages of similar reclaimability are clustered together, I
can work on using page migration to move pages out of the blocks of known
reclaimability instead of paging them out. When that works, the __GFP_
flags identifying reclaimability can be renamed to marking movability and
flag page table pages as well. This is a logical progression.
> The decision to evict a page (reclaim)
> is something that may be useful to avoid swap but it may be better to keep
> pages in memory.
>
Agreed, but swapping them out was an easier starting point.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 22:37 ` Mel Gorman
@ 2006-11-02 22:50 ` Christoph Lameter
2006-11-03 9:14 ` Mel Gorman
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-02 22:50 UTC (permalink / raw)
To: Mel Gorman
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Thu, 2 Nov 2006, Mel Gorman wrote:
> > Reclaim is a way of
> > evicting pages from memory to avoid the move. This may be useful if memory
> > is filled up because defragging can then do what swapping would have to
> > do. However, evicting pages means that they have to be reread. Page
> > migration can migrate pages at 1GB/sec which is certainly much higher
> > than having to reread the page.
> The reason why anti-frag currently reclaims is because reclaiming was easy and
> happens under memory pressure not because I thought pageout was free. As a
> proof-of-concept, I needed to show that pages clustered on reclaimability
> would free contiguous blocks of pages later. There was no point starting with
> defragmentation when I knew that unmovable pages would be with movable pages
> in the same MAX_ORDER_NR_PAGES block.
Could you go to defrag with what we have discussed now?
> > 1. An mlocked page. This is a page that is movable but not reclaimable.
> > How does defrag handle that case right now? It should really move the
> > page if necessary.
> >
>
> Defrag doesn't exist right now. If anti-frag got some traction, working on
> using page migration to handle movable-but-not-reclaimable pages would be the
> next step. Pages that are mlocked() will have been allocated with
> __GFP_EASYRCLM so will be clustered together with other movable pages.
But mlocked pages are not reclaimable.
> > 2. There are a number of unreclaimable page types that are easily movable.
> > F.e. page table pages are movable if you take a write-lock on mmap_sem
> > and handle the tree carefully. These pages again are not reclaimable but
> > they are movable.
> >
>
> Page tables are currently not allocated with __GFP_EASYRCLM because I knew I
> couldn't reclaim them without killing processes. However, if page migration
> within ranges was implemented, we'd start clustering based on movability
> instead of reclaimability.
There would have to be a separate function to move page table pages since
they cannot be handled like regular pages. We would need some way of
id'ing the mm struct the page belongs to in order to get to the top of
the tree and to mmap_sem.
> > Various caching objects in the slab (cpucache align cache etc) are also
> > easily movable. If we put them into a separate slab cache then we could
> > make them movable.
> As subsystems will have pointers to objects within the slab, I doubt they are
> easily movable but I'll take your word on it for the moment.
The slab already has these pointers in the page struct. They are needed to
id the slab on kfree(). We already reallocate all caches when we tune the
cpucaches. So there is not much new for the slab cache objects.
> > I would suggest to not categorize pages according to their reclaimability
> > but according to their movability.
>
> ok, I see your point. However, reclaimability seems a reasonable starting
> point. If I know pages of similar reclaimability are clustered together, I can
> work on using page migration to move pages out of the blocks of known
> reclaimability instead of paging them out. When that works, the __GFP_ flags
> identifying reclaimability can be renamed to marking movability and flag page
> table pages as well. This is a logical progression.
I'd rather go direct to defrag instead of creating churn with
fragmentation avoidance.
> Agreed, but swapping them out was an easier starting point.
I think this work is very valuable and the acceptance issues have probably
dominated the design of the patch so far. But I sure wish we would now go
to the full thing instead of an intermediate step that we then will have
to undo later. An intermediate step that would make sense is starting to
mark pages as unmovable and then reclaim movable pages. Then we can add
more and more logic to make more pages movable on top. With marking
pages for reclaim we won't get there.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 22:50 ` Christoph Lameter
@ 2006-11-03 9:14 ` Mel Gorman
2006-11-03 13:17 ` Andy Whitcroft
2006-11-03 18:11 ` Christoph Lameter
0 siblings, 2 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-03 9:14 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Thu, 2 Nov 2006, Christoph Lameter wrote:
> On Thu, 2 Nov 2006, Mel Gorman wrote:
>
>>> Reclaim is a way of
>>> evicting pages from memory to avoid the move. This may be useful if memory
>>> is filled up because defragging can then do what swapping would have to
>>> do. However, evicting pages means that they have to be reread. Page
>>> migration can migrate pages at 1GB/sec which is certainly much higher
>>> than having to reread the page.
>
>> The reason why anti-frag currently reclaims is because reclaiming was easy and
>> happens under memory pressure not because I thought pageout was free. As a
>> proof-of-concept, I needed to show that pages clustered on reclaimability
>> would free contiguous blocks of pages later. There was no point starting with
>> defragmentation when I knew that unmovable pages would be with movable pages
>> in the same MAX_ORDER_NR_PAGES block.
>
> Could you go to defrag with what we have discussed now?
>
The defrag code would have to be developed first. So, no, I can't go with
defrag "now", it doesn't exist yet.
>>> 1. An mlocked page. This is a page that is movable but not reclaimable.
>>> How does defrag handle that case right now? It should really move the
>>> page if necessary.
>>>
>>
>> Defrag doesn't exist right now. If anti-frag got some traction, working on
>> using page migration to handle movable-but-not-reclaimable pages would be the
>> next step. Pages that are mlocked() will have been allocated with
>> __GFP_EASYRCLM so will be clustered together with other movable pages.
>
> But mlocked pages are not reclaimable.
>
I didn't say they were. I would mark them __GFP_EASYRCLM *when* defrag was
developed.
>>> 2. There are a number of unreclaimable page types that are easily movable.
>>> F.e. page table pages are movable if you take a write-lock on mmap_sem
>>> and handle the tree carefully. These pages again are not reclaimable but
>>> they are movable.
>>>
>>
>> Page tables are currently not allocated with __GFP_EASYRCLM because I knew I
>> couldn't reclaim them without killing processes. However, if page migration
>> within ranges was implemented, we'd start clustering based on movability
>> instead of reclaimability.
>
> There would have to be a separate function to move page table pages since
> they cannot be handled like regular pages. We would need some way of
> id'ing the mm struct the page belongs to in order to get to the top of
> the tree and to mmap_sem.
>
I know, this sort of thing would have to be written into page
migration before defrag for high-order allocations was developed. Even
then, defrag needs to sit on top of something like anti-frag to get the
clustering of movable pages.
>>> Various caching objects in the slab (cpucache align cache etc) are also
>>> easily movable. If we put them into a separate slab cache then we could
>>> make them movable.
>> As subsystems will have pointers to objects within the slab, I doubt they are
>> easily movable but I'll take your word on it for the moment.
>
> The slab already has these pointers in the page struct. They are needed to
> id the slab on kfree(). We already reallocate all caches when we tune the
> cpucaches. So there is not much new for the slab cache objects.
>
It wasn't the pointers in the struct page I was concerned about. It was
pointers found by void *someptr = kmem_cache_alloc(...). But if they can
be cleaned up, then sure, they are movable.
>>> I would suggest to not categorize pages according to their reclaimability
>>> but according to their movability.
>>
>> ok, I see your point. However, reclaimability seems a reasonable starting
>> point. If I know pages of similar reclaimability are clustered together, I can
>> work on using page migration to move pages out of the blocks of known
>> reclaimability instead of paging them out. When that works, the __GFP_ flags
>> identifying reclaimability can be renamed to marking movability and flag page
>> table pages as well. This is a logical progression.
>
> I'd rather go direct to defrag instead of creating churn with
> fragmentation avoidance.
>
Even if I had defrag right now, we'd be looking to cluster pages by
movability which would end up looking almost identical to the anti-frag
patches except that references to RECLAIM would look like MOVABLE.
This intermediate step would still exist but I'd like to start getting
data on its effectiveness now to help shape the development of defrag.
>> Agreed, but swapping them out was an easier starting point.
>
> I think this work is very valuable and the acceptance issues have probably
> dominated the design of the patch so far. But I sure wish we would now go
> to the full thing instead of an intermediate step that we then will have
> to undo later.
We'd be renaming a few defines, hardly a major undo.
> An intermediate step that would make sense is starting to
> marking pages as unmovable and then reclaim movable pages. Then we can add
> more and more logic to make more pages movable on top. With marking
> pages for reclaim we won't get there.
>
Ok, I can make that renaming change now so. The renaming will look like
Movable - These are userspace pages that are easily moved. This
flag is set when it is known that the pages will be trivially
moved by using page migration or if under significant
memory pressure, writing the page out to swap or syncing with
backing storage
These allocations are marked with __GFP_MOVABLE
Reclaimable - These are kernel allocations for caches that are
reclaimable or allocations that are known to be very short-lived.
These allocations are marked __GFP_RECLAIMABLE
Non-Movable - These are pages that are allocated by the kernel that
are not trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type
These are allocations that are not marked otherwise
So, right now, page tables would not be marked __GFP_MOVABLE, but they
would be later when defrag was developed. Would that be any better?
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread* Re: Page allocator: Single Zone optimizations
2006-11-03 9:14 ` Mel Gorman
@ 2006-11-03 13:17 ` Andy Whitcroft
2006-11-03 18:11 ` Christoph Lameter
1 sibling, 0 replies; 83+ messages in thread
From: Andy Whitcroft @ 2006-11-03 13:17 UTC (permalink / raw)
To: Mel Gorman, Christoph Lameter
Cc: Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
>> An intermediate step that would make sense is starting to
>> marking pages as unmovable and then reclaim movable pages. Then we can
>> add
>> more and more logic to make more pages movable on top. With marking
>> pages for reclaim we won't get there.
>>
>
> Ok, I can make that renaming change now so. The renaming will look like
>
> Movable - These are userspace pages that are easily moved. This
> flag is set when it is known that the pages will be trivially
> moved by using page migration or if under significant
> memory pressure, writing the page out to swap or syncing with
> backing storage
> These allocations are marked with __GFP_MOVABLE
>
> Reclaimable - These are kernel allocations for caches that are
> reclaimable or allocations that are known to be very short-lived.
> These allocations are marked __GFP_RECLAIMABLE
>
> Non-Movable - These are pages that are allocated by the kernel that
> are not trivially reclaimed. For example, the memory allocated
> for a
> loaded module would be in this category. By default, allocations
> are
> considered to be of this type
> These are allocations that are not marked otherwise
>
> So, right now, page tables would not be marked __GFP_MOVABLE, but they
> would be later when defrag was developed. Would that be any better?
Ok, as far as I can tell you are both describing the same basic thing
with different names.
The key problem here is we want to be able to allocate non-order zero
pages, where there are such pages available all is well. When there are
not we need to look for a group of contiguous 'emptyable' pages; and
recycle them. This is the key, we do not care what is in them, only
whether we can get whatever it is out to release a single contiguous
block. We do not care what the mechanism for that is, release, move or
even swap them. The attribute of the memory is whether it's pinned or
not, whether the page is emptyable. We want to make sure we keep
emptyable pages with other emptyable pages so that our chances of
finding a higher order block emptyable is likely.
We currently talk about the act of selecting pages for release as
reclaim. We should not get too caught up in thinking of that as
removing things from memory. Yes, right now the only time we use
reclaim is when we do not have any free memory, and so its only goal is
to remove things from memory -- that is a side effect of it only
supporting order 0 reclaim, moving a page there is mostly useless.
Supporting higher order reclaim we might start reclaim at order 1 with
50% of memory free. In this case the reclaim strategy could and should
include the option to relocate the relocatable memory object.
Now perhaps 'EMPTYABLE', 'RELEASABLE' or 'RECYCLABLE' is more
appropriate than 'RECLAIMABLE', but it's not at all clear that 'MOVABLE'
is better. Moving pages is but one strategy 'reclaim' could use to
achieve its aim, getting us the memory block we asked for.
I do not see how any of what Mel is saying precludes the use of
migration as a reclaim mechanism allowing more things to be placed in
the 'emptyable' set rather than the 'pinned' set.
-apw
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 9:14 ` Mel Gorman
2006-11-03 13:17 ` Andy Whitcroft
@ 2006-11-03 18:11 ` Christoph Lameter
2006-11-03 19:06 ` Mel Gorman
1 sibling, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-03 18:11 UTC (permalink / raw)
To: Mel Gorman
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Mel Gorman wrote:
> I know, this sort of thing would have to be written into page migration before
> defrag for high-order allocations was developed. Even then, defrag needs to
> sit on top of something like anti-frag to get the clustering of movable pages.
Hmmm... The disk defraggers are capable of defragmenting around pinned
blocks and this seems to be a similar. This only works if the number of
unmovable objects is small compared to the movable objects otherwise we
may need this sorting. For other reasons discussed before (memory unplug,
node unplug) I think it would be necessary to have this separation
between movable and unmovable pages.
I can add a migrate_page_table_page() function? The migrate_pages()
function is only capable of migrating user space pages since it relies on
being able to take pages off the LRU. At some point we need to
distinguish the type of page and call the appropriate migration function
for the various page types.
int migrate_page_table_page(struct page *old, struct page *new);
?
> Reclaimable - These are kernel allocations for caches that are
> reclaimable or allocations that are known to be very short-lived.
> These allocations are marked __GFP_RECLAIMABLE
For now this would include reclaimable slabs? They are reclaimable with a
huge effort and there may be pinned objects that we cannot move. Isn't this
more another case of unmovable? Or can we tolerate the objects that cannot
be moved and classify this as movable (with the understanding that we may
have to do expensive slab reclaim (up to dropping all reclaimable slabs)
in order to get there).
> Non-Movable - These are pages that are allocated by the kernel that
> are not trivially reclaimed. For example, the memory allocated for a
> loaded module would be in this category. By default, allocations are
> considered to be of this type
> These are allocations that are not marked otherwise
Ok.
Note that memory for a loaded module is allocated via vmalloc, mapped via
a page table (init_mm) and thus memory is remappable. We will likely be
able to move those.
> So, right now, page tables would not be marked __GFP_MOVABLE, but they would
> be later when defrag was developed. Would that be any better?
Isn't this still doing reclaim instead of defragmentation? Maybe it
will work but I am not sure about the performance impact. We
would have to read pages back in from swap or disk?
The problem that we have is that one cannot allocate higher order pages since
memory is fragmented. Maybe what would initially be sufficient is that a failing
allocation of a higher order page leads to defrag occurring until pages of
sufficient size have been created and then the allocation can be satisfied.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 18:11 ` Christoph Lameter
@ 2006-11-03 19:06 ` Mel Gorman
2006-11-03 19:44 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Mel Gorman @ 2006-11-03 19:06 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Christoph Lameter wrote:
> On Fri, 3 Nov 2006, Mel Gorman wrote:
>
>> I know, this sort of thing would have to be written into page migration before
>> defrag for high-order allocations was developed. Even then, defrag needs to
>> sit on top of something like anti-frag to get the clustering of movable pages.
>
> Hmmm... The disk defraggers are capable of defragmenting around pinned
> blocks and this seems to be a similar.
Not similar enough. Disk defragmentation aims at having files as
contiguous as possible on the filesystem. If they are not contiguous, it
doesn't matter to functionality but performance degrades slightly.
For allocation of hugepages, the physical pages must be contiguous and
they must be aligned. If there is one unmovable or unreclaimable page in
there, that block is unusable for a hugepage. We can defragment around it
all right, but the resulting block is still not usable. It's not the same
as disk defragmentation.
Defragmentation on its own is not enough. The clustering based on
reclaimability/movability is still required and that is what anti-frag
provides.
> This only works if the number of
> unmovable objects is small compared to the movable objects otherwise we
> may need this sorting. For other reasons discussed before (memory unplug,
> node unplug) I think it would be necessary to have this separation
> between movable and unmovable pages.
>
If there is only one unmovable block per MAX_ORDER_NR_PAGES in the system,
you can defrag as much as you like and hugepage allocations will still
fail. Similar for hot unplug.
> I can add a migrate_page_table_page() function? The migrate_pages()
> function is only capable of migrating user space pages since it relies on
> being able to take pages off the LRU. At some point we need to
> distinguish the type of page and call the appropriate migration function
> for the various page types.
>
If such a function existed, then page table pages could be placed beside
"reclaimable" pages and the block could be migrated. However, the
clustering would still have be needed, be it based on reclaimability or
movability (which in many cases is the same thing)
> int migrate_page_table_page(struct page *old, struct page *new);
> ?
>
>> Reclaimable - These are kernel allocations for caches that are
>> reclaimable or allocations that are known to be very short-lived.
>> These allocations are marked __GFP_RECLAIMABLE
>
> For now this would include reclaimable slabs?
It could, but I don't. Currently, only network buffers, inode caches,
buffer heads and dentries are marked like this.
> They are reclaimable with a
> huge effort and there may be pinned objects that we cannot move. Isn't this
> more another case of unmovable?
Probably, they would currently be treated as unmovable.
> Or can we tolerate the objects that cannot
> be moved and classify this as movable (with the understanding that we may
> have to do expensive slab reclaim (up to dropping all reclaimable slabs)
> in order to get there).
>
There is nothing stopping such marking taking place, but I wouldn't if I
thought that reclaiming or moving them was that expensive.
>> Non-Movable - These are pages that are allocated by the kernel that
>> are not trivially reclaimed. For example, the memory allocated for a
>> loaded module would be in this category. By default, allocations are
>> considered to be of this type
>> These are allocations that are not marked otherwise
>
> Ok.
>
> Note that memory for a loaded module is allocated via vmalloc, mapped via
> a page table (init_mm) and thus memory is remappable. We will likely be
> able to move those.
>
It's not just a case of updating init_mm. You would also need to tear down
the vmalloc area for every current running process in the system in case
they had faulted within that module. That would be pretty entertaining.
>> So, right now, page tables would not be marked __GFP_MOVABLE, but they would
>> be later when defrag was developed. Would that be any better?
>
> Isn't this still doing reclaim instead of defragmentation?
Not necessarily reclaim. Currently we reclaim. Under memory pressure, we
may still reclaim. However, if there was enough free memory (due to
min_free_kbytes been set to a higher value for example), then we could
migrate instead of reclaim to satisfy a high-order allocation. The page
migration stuff is already there so it's clearly possible.
Once again, I am not averse to writing such a defragment mechanism, but I
see anti-frag as it currently stands as a prerequisite for a
defragmentation mechanism having a decent success rate.
> Maybe it
> will work but I am not sure about the performance impact. We
> would have to read pages back in from swap or disk?
>
> The problem that we have is that one cannot allocate higher order pages since
> memory is fragmented. Maybe what would initially be sufficient is that a
> failing allocation of a higher order page leads to defrag occurring until
> pages of sufficient size have been created and then the allocation can
> be satisfied.
>
Defragmentation on its own would be insufficient for hugepage allocations
because of unmovable pages dotted around the system. We know this because
if you reclaim everything possible in the system, you still are unlikely
to be able to grow the hugepage pool. If reclaiming everything doesn't
give you huge pages, shuffling the same pages around the system won't
improve the situation any either.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 19:06 ` Mel Gorman
@ 2006-11-03 19:44 ` Christoph Lameter
2006-11-03 21:11 ` Mel Gorman
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-03 19:44 UTC (permalink / raw)
To: Mel Gorman
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Mel Gorman wrote:
> > For now this would include reclaimable slabs?
>
> It could, but I don't. Currently, only network buffers, inode caches, buffer
> heads and dentries are marked like this.
inode cache and dentries basically contain most of the reclaimable
slab caches.
> > They are reclaimable with a
> > huge effort and there may be pinned objects that we cannot move. Isnt this
> > more another case of unmovable?
>
> Probably, they would currently be treated as unmovable.
So you really do not currently need that section? If you drop the section
then we have the same distinction that we would need for memory hotplug.
> > Note that memory for a loaded module is allocated via vmalloc, mapped via
> > a page table (init_mm) and thus memory is remappable. We will likely be
> > able to move those.
> >
>
> It's not just a case of updating init_mm. You would also need to tear down the
> vmalloc area for every current running process in the system in case they had
> faulted within that module. That would be pretty entertaining.
vmalloc areas are not process specific and this works just fine within the
kernel. Eeek... remap_vmalloc_range() maps into user space. Need to have a
list it seems to be able to also update those ptes.
> Once again, I am not averse to writing such a defragment mechanism, but I see
> anti-frag as it currently stands as a prerequisite for a defragmentation
> mechanism having a decent success rate.
What you call anti-frag is really a mechanism to separate two different
kinds of allocations that may be useful for multiple purposes not only
anti-frag.
> Defragmentation on it's own would be insufficient for hugepage allocations
> because of unmovable pages dotted around the system. We know this because if
> you reclaim everything possible in the system, you still are unlikely to be
> able to grow the hugepage pool. If reclaiming everything doesn't give you huge
> pages, shuffling the same pages around the system won't improve the situation
It all depends on the movability of pages. If unmovable pages are
sufficiently rare then this will work.
I think we need something like what is done here via anti-frag but I wish
it would be more generic and not solely rely on reclaim to get pages freed
up.
Also the duplication of the page struct caches worries me because it
reduces the hit rate. Removing the intermediate type would reduce the page
caches to 2. And maybe we do not need caches for unreclaimable/unmovable
pages? slab already does its own buffering there.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 19:44 ` Christoph Lameter
@ 2006-11-03 21:11 ` Mel Gorman
2006-11-03 21:42 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Mel Gorman @ 2006-11-03 21:11 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Christoph Lameter wrote:
> On Fri, 3 Nov 2006, Mel Gorman wrote:
>
>>> For now this would include reclaimable slabs?
>>
>> It could, but I don't. Currently, only network buffers, inode caches, buffer
>> heads and dentries are marked like this.
>
> inode cache and dentries basically contain most of the reclaimable
> slab caches.
>
Yes, and they are the largest amount of memory allocated by a significant
margin. When they are clustered together, cache shrinking tends to free up
contiguous blocks of pages.
>>> They are reclaimable with a
>>> huge effort and there may be pinned objects that we cannot move. Isnt this
>>> more another case of unmovable?
>>
>> Probably, they would currently be treated as unmovable.
>
> So you really do not currently need that section? If you drop the section
> then we have the same distinction that we would need for memory hotplug.
>
You mean, drop the section dealing with clustering the cache and dentries?
That section is needed. Without it, success rates at succeeding high order
allocations is lower and the mechanism breaks down after a few hours
uptime.
>>> Note that memory for a loaded module is allocated via vmalloc, mapped via
>>> a page table (init_mm) and thus memory is remappable. We will likely be
>>> able to move those.
>>>
>>
>> It's not just a case of updating init_mm. You would also need to tear down the
>> vmalloc area for every current running process in the system in case they had
>> faulted within that module. That would be pretty entertaining.
>
> vmalloc areas are not process specific
> and this works just fine within the
> kernel. Eeek... remap_vmalloc_range() maps into user space. Need to have a
> list it seems to be able to also update those ptes.
>
>> Once again, I am not averse to writing such a defragment mechanism, but I see
>> anti-frag as it currently stands as a prerequisite for a defragmentation
>> mechanism having a decent success rate.
>
> What you call anti-frag is really a mechanism to separate two different
> kinds of allocations that may be useful for multiple purposes not only
> anti-frag.
>
Well, currently three types of allocations. It's worth separating out
really unmovable pages and kernel allocations that can be reclaimed/moved
in some fashion.
Is it the name anti-frag you have a problem with? If so, what would you
suggest calling it?
>> Defragmentation on it's own would be insufficient for hugepage allocations
>> because of unmovable pages dotted around the system. We know this because if
>> you reclaim everything possible in the system, you still are unlikely to be
>> able to grow the hugepage pool. If reclaiming everything doesn't give you huge
>> pages, shuffling the same pages around the system won't improve the situation
>
> It all depends on the movability of pages. If unmovable pages are
> sufficiently rare then this will work.
>
They are common enough that they get spread throughout memory unless they
are clustered. If that was not the case, the hugepage pool would be a lot
easier to grow after a decent amount of uptime.
> I think we need something like what is done here via anti-frag but I wish
> it would be more generic and not solely rely on reclaim to get pages freed
> up.
>
How could it have been made more generic? Fundamentally, all we are doing
at the moment is using the freelists to cluster types of pages together.
We only depend on reclaim now. If we get the clustering part done, I can
start working on the page migration part.
> Also the duplication of the page struct caches worries me because it
> reduces the hit rate.
do you mean the per-cpu caches? If so, without clustering in the per-cpu
caches, unmovable allocations would "leak" into blocks used for movable
allocations.
> Removing the intermediate type would reduce the page
> caches to 2.
And significantly reduce the effectiveness of the clustering in the
process.
> And maybe we do not need caches for unreclaimable/unmovable
> pages? slab already does its own buffering there.
>
That is true. If it is a problem, what could be done is have a per-cpu
cache for movable and unmovable allocations. Then have the __GFP_KERNRCLM
allocations bypass the per-cpu allocator altogether and go straight to the
buddy allocator.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 21:11 ` Mel Gorman
@ 2006-11-03 21:42 ` Christoph Lameter
2006-11-03 21:50 ` Andrew Morton
2006-11-07 16:30 ` Mel Gorman
0 siblings, 2 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-03 21:42 UTC (permalink / raw)
To: Mel Gorman
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Mel Gorman wrote:
> > I think we need something like what is done here via anti-frag but I wish
> > it would be more generic and not solely rely on reclaim to get pages freed
> > up.
> >
>
> How could it have been made more generic? Fundamentally, all we are doing at
> the moment is using the freelists to cluster types of pages together. We only
> depend on reclaim now. If we get the clustering part done, I can start working
> on the page migration part.
Right, let's have a special freelist for unreclaimable/unmovable pages. I think
that we agree on that. Somehow we need to be able to ensure that
unXXXable pages do not end up in sections of the zone where we allow
memory hotplug.
At some later point we would like to have the ability to redirect
unXXXable allocations to another node if the node is hot pluggable.
> > Also the duplication of the page struct caches worries me because it
> > reduces the hit rate.
>
> do you mean the per-cpu caches? If so, without clustering in the per-cpu
> caches, unmovable allocations would "leak" into blocks used for movable
> allocations.
I mean the per cpu caches and I think you could just bypass the per cpu
caches for unXXXable pages. Kernel pages are buffered already in the slab
allocator and other kernel allocations are probably rare enough.
> > Removing the intermediate type would reduce the page
> > caches to 2.
>
> And significantly reduce the effectiveness of the clustering in the process.
Are you sure about this? It seems that the intermediate type is
reclaimable and you already allow "reclaimable" pages to be not reclaimable
(mlock'ed pages). If you run into trouble with the reclaimable slab pages
in the reclaimable zone then you could do aggressive slab reclaim to remedy
the situation.
> > And maybe we do not need caches for unreclaimable/unmovable
> > pages? slab already does its own buffering there.
> That is true. If it is a problem, what could be done is have a per-cpu cache
> for movable and unmovable allocations. Then have the __GFP_KERNRCLM
> allocations bypass the per-cpu allocator altogether and go straight to the
> buddy allocator.
Right. Maybe we can get away with leaving the pageset cpu caches
untouched? On our largest systems with 1k nodes 4k cpus we currently have
4 zones * 4096 cpus * 1024 nodes = 16 million pagesets. Each of those has
hot and cold yielding 32 million lists. Now we are going to triplicate that to
192 million lists and we also increase the size of the structure.
With the code currently in 2.6.19 we go from 4 to 2 zones. So we have only
16 million pagesets. With the optional DMA in mm we got from 16 to 8
million pagesets. This effectively undoes the optimizations done in .19
.20.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 21:42 ` Christoph Lameter
@ 2006-11-03 21:50 ` Andrew Morton
2006-11-03 21:53 ` Christoph Lameter
2006-11-07 16:30 ` Mel Gorman
1 sibling, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-11-03 21:50 UTC (permalink / raw)
To: Christoph Lameter
Cc: Mel Gorman, Andy Whitcroft, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006 13:42:16 -0800 (PST)
Christoph Lameter <clameter@sgi.com> wrote:
> Kernel pages are buffered already in the slab
> allocator
But why? I've been intermittently campaigning to stop doing that for about
five years now. Having private lists of free pages in the slab allocator
is duplicative of the page allocator's lists and worsens performance.
In fact I thought we'd stopped doing this ages ago.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 21:50 ` Andrew Morton
@ 2006-11-03 21:53 ` Christoph Lameter
2006-11-03 22:12 ` Andrew Morton
2006-11-03 22:19 ` Andi Kleen
0 siblings, 2 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-03 21:53 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Andy Whitcroft, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Andrew Morton wrote:
> > Kernel pages are buffered already in the slab
> > allocator
>
> But why? I've been intermittently campaigning to stop doing that for about
> five years now. Having private lists of free pages in the slab allocator
> is duplicative of the page allocator's lists and worsens performance.
This has to do with the constructors and the destructors. They are only
applied during the first allocation or the final deallocation of the slab.
The slab (with the pages) stays on the freelist with all objects having
proper values as set by the constructors.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 21:53 ` Christoph Lameter
@ 2006-11-03 22:12 ` Andrew Morton
2006-11-03 22:15 ` Christoph Lameter
2006-11-03 22:19 ` Andi Kleen
1 sibling, 1 reply; 83+ messages in thread
From: Andrew Morton @ 2006-11-03 22:12 UTC (permalink / raw)
To: Christoph Lameter
Cc: Mel Gorman, Andy Whitcroft, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006 13:53:55 -0800 (PST)
Christoph Lameter <clameter@sgi.com> wrote:
> On Fri, 3 Nov 2006, Andrew Morton wrote:
>
> > > Kernel pages are buffered already in the slab
> > > allocator
> >
> > But why? I've been intermittently campaigning to stop doing that for about
> > five years now. Having private lists of free pages in the slab allocator
> > is duplicative of the page allocator's lists and worsens performance.
>
> This has to do with the constructors and the destructors. They are only
> applied during the first allocation or the final deallocation of the slab.
> The slab (with the pages) stays on the freelist with all objects having
> proper values as set by the constructors.
That's possibly useful if the cache has a destructor. If it has a
constructor and no destructor then there's no point in locally caching the
pages.
But destructors are a bad idea: you dirty a cacheline, evict something else
and then let the cacheline just sit there and go stale.
But I thought that slab once-upon-a-time retained caches of plain old free
pages, not in any particular state. Maybe it did and maybe we did remove
that.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 22:12 ` Andrew Morton
@ 2006-11-03 22:15 ` Christoph Lameter
0 siblings, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-03 22:15 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Andy Whitcroft, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Andrew Morton wrote:
> That's possibly useful if the cache has a destructor. If it has a
> constructor and no destructor then there's no point in locally caching the
> pages.
>
> But destructors are a bad idea: you dirty a cacheline, evict something else
> and then let the cacheline just sit there and go stale.
Right thats why I tried to avoid constructors and destructors for the new
slab design but it is important for RCU since the object must be in a
defined state even after a free. i386 arch code does some weird wizardry
with it. So I had to add a support layer.
> But I thought that slab once-upon-a-time retained caches of plain old free
> pages, not in any particular state. Maybe it did and maybe we did remove
> that.
Must have been before my time.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 21:53 ` Christoph Lameter
2006-11-03 22:12 ` Andrew Morton
@ 2006-11-03 22:19 ` Andi Kleen
2006-11-04 0:37 ` Christoph Lameter
1 sibling, 1 reply; 83+ messages in thread
From: Andi Kleen @ 2006-11-03 22:19 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
> This has to do with the constructors and the destructors. They are only
> applied during the first allocation or the final deallocation of the slab.
It's pretty much obsolete though - nearly nobody uses constructors/destructors.
And the few uses left over are useless to avoid cache misses
and could as well be removed.
Long ago i fixed some code to use constructors and made sure it carefully
avoided some cache misses in the hot path, but typically when people change
anything later they destroy that. It's just not maintainable.
I would vote for just getting rid of slab constructors/destructors.
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 22:19 ` Andi Kleen
@ 2006-11-04 0:37 ` Christoph Lameter
2006-11-04 1:32 ` Andi Kleen
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-04 0:37 UTC (permalink / raw)
To: Andi Kleen
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Andi Kleen wrote:
>
> > This has to do with the constructors and the destructors. They are only
> > applied during the first allocation or the final deallocation of the slab.
>
> It's pretty much obsolete though - nearly nobody uses constructors/destructors.
> And the few uses left over are useless to avoid cache misses
> and could as well be removed.
>
> Long ago i fixed some code to use constructors and made sure it carefully
> avoided some cache misses in the hot path, but typically when people change
> anything later they destroy that. It's just not maintainable.
>
> I would vote for just getting rid of slab constructors/destructors.
I would appreciate patches to that effect, voting will not help much. It
would make my new slab project much easier. But I doubt that this is as
easy as you think. F.e. I wonder how you are going to do anonvma RCU without
constructors. I think constructors/destructors are here to stay.
One thing I would appreciate very much and its in your area. Deal
with the use of slab for page size allocations (pmd, pgd etc) in i386 arch
code. The page struct in use there is overloaded both by slab and by the
i386 arch code.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-04 0:37 ` Christoph Lameter
@ 2006-11-04 1:32 ` Andi Kleen
2006-11-06 16:40 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Andi Kleen @ 2006-11-04 1:32 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
> I would appreciate patches to that effect, voting will not help much. It
> would make my new slab project much easier. But I doubt that this is as
> easy as you think. F.e. I wonder how you going to do anonvma RCU without
> constructors. I think constructors/destructors are here to stay.
Hmm. Why? Why can't the work of the constructor not be done after the
kmem_cache_alloc() ?
> One thing I would appreciate very much and its in your area. Deal
> with the use of slab for page size allocations (pmd, pgd etc) in i386 arch
> code.
I can do that for pte/pmd. Never quite understood why those were made
slabs -- on x86-64 they are just pages and that works great.
But in PAE pgd is only 32 bytes. That will always need a smaller allocation.
But that shouldn't be overloaded anyways.
Actually I think it could be probably just put into mm_context_t
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-04 1:32 ` Andi Kleen
@ 2006-11-06 16:40 ` Christoph Lameter
2006-11-06 16:56 ` Andi Kleen
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-06 16:40 UTC (permalink / raw)
To: Andi Kleen
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
On Sat, 4 Nov 2006, Andi Kleen wrote:
> > I would appreciate patches to that effect, voting will not help much. It
> > would make my new slab project much easier. But I doubt that this is as
> > easy as you think. F.e. I wonder how you going to do anonvma RCU without
> > constructors. I think constructors/destructors are here to stay.
>
> Hmm. Why? Why can't the work of the constructor not be done after the
> kmem_cache_alloc() ?
Because accesses to the structure can occur after kfree. The RCU
implementation only delays the destruction of the slab. Locks are always
in a definite state regardless if the object is in use or not.
> > One thing I would appreciate very much and its in your area. Deal
> > with the use of slab for page size allocations (pmd, pgd etc) in i386 arch
> > code.
> I can do that for pte/pmd. Never quite understood why those were made
> slabs -- on x86-64 they are just pages and that works great.
I think this is an attempt to avoid having to initialize pmds/pgds after
initialization and also the use of the slab caches keeps the cache lines
hot.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-06 16:40 ` Christoph Lameter
@ 2006-11-06 16:56 ` Andi Kleen
2006-11-06 17:00 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Andi Kleen @ 2006-11-06 16:56 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
On Monday 06 November 2006 17:40, Christoph Lameter wrote:
> On Sat, 4 Nov 2006, Andi Kleen wrote:
>
> > > I would appreciate patches to that effect, voting will not help much. It
> > > would make my new slab project much easier. But I doubt that this is as
> > > easy as you think. F.e. I wonder how you going to do anonvma RCU without
> > > constructors. I think constructors/destructors are here to stay.
> >
> > Hmm. Why? Why can't the work of the constructor not be done after the
> > kmem_cache_alloc() ?
>
> Because acceses to the structure can occur after kfree. The RCU
> implementation only delays the destruction of the slab. Locks are always
> in a definite state regardless if the object is in use or not.
Only objects that have been used at least once can be still visible. And
those would be still constructed of course -- just after the kmem_cache_alloc,
not inside. For those that have never been used it shouldn't matter.
> > > One thing I would appreciate very much and its in your area. Deal
> > > with the use of slab for page size allocations (pmd, pgd etc) in i386 arch
> > > code.
> > I can do that for pte/pmd. Never quite understood why those were made
> > slabs -- on x86-64 they are just pages and that works great.
>
> I think this is an attempt to avoid having to initialize pmds/pgds after
> intializaiton and also the use of the slab caches keeps the cache lines
> hot.
Ah, we got __GFP_ZERO for that, although it never quite did the work
completely. I'm not sure it helps a lot anyways
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-06 16:56 ` Andi Kleen
@ 2006-11-06 17:00 ` Christoph Lameter
2006-11-06 17:07 ` Andi Kleen
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-06 17:00 UTC (permalink / raw)
To: Andi Kleen
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
On Mon, 6 Nov 2006, Andi Kleen wrote:
> > Because acceses to the structure can occur after kfree. The RCU
> > implementation only delays the destruction of the slab. Locks are always
> > in a definite state regardless if the object is in use or not.
>
> Only objects that have been used at least once can be still visible. And
> those would be still constructed of course -- just after the kmem_cache_alloc,
> not inside. For those that have never been used it shouldn't matter.
Constructors are only called on allocation of the slab, not on
kmem_cache_alloc. And you are right: It does not matter for those that
have never been used.
> > I think this is an attempt to avoid having to initialize pmds/pgds after
> > intializaiton and also the use of the slab caches keeps the cache lines
> > hot.
>
> Ah, we got __GFP_ZERO for that, although it never quite did the work
> completely. I'm not sure it helps a lot anyways
Not exactly. The implementation in the i386 arch code avoids the
__GFP_ZERO by relying on empty pgd/pmds be zero. But you could copy Robin
Holt's implementation via page lists from ia64 that does the saem. It
avoids the constructor/destructors and slab use. It is cleaner and
probably faster.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-06 17:00 ` Christoph Lameter
@ 2006-11-06 17:07 ` Andi Kleen
2006-11-06 17:12 ` Hugh Dickins
2006-11-06 17:15 ` Christoph Lameter
0 siblings, 2 replies; 83+ messages in thread
From: Andi Kleen @ 2006-11-06 17:07 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
On Monday 06 November 2006 18:00, Christoph Lameter wrote:
> On Mon, 6 Nov 2006, Andi Kleen wrote:
>
> > > Because acceses to the structure can occur after kfree. The RCU
> > > implementation only delays the destruction of the slab. Locks are always
> > > in a definite state regardless if the object is in use or not.
> >
> > Only objects that have been used at least once can be still visible. And
> > those would be still constructed of course -- just after the kmem_cache_alloc,
> > not inside. For those that have never been used it shouldn't matter.
>
> Constructors are only called on allocation of the slab, not on
> kmem_cache_alloc.
I know this.
> And you are right: It does not matter for those that
> have never been used.
This means it is fine to replace the constructor with an function
that runs after kmem_cache_alloc() in this case.
> > > I think this is an attempt to avoid having to initialize pmds/pgds after
> > > intializaiton and also the use of the slab caches keeps the cache lines
> > > hot.
> >
> > Ah, we got __GFP_ZERO for that, although it never quite did the work
> > completely. I'm not sure it helps a lot anyways
>
> Not exactly. The implementation in the i386 arch code avoids the
> __GFP_ZERO by relying on empty pgd/pmds be zero. But you could copy Robin
> Holt's implementation via page lists from ia64 that does the same. It
> avoids the constructor/destructors and slab use. It is cleaner and
> probably faster.
i386 used to have such lists some time ago too, until they were removed.
What I meant: some time ago i had patches to add a __GFP_ZERO queue to the
page allocator. The page allocator would handle all this for everybody.
For various reasons they never got pushed.
But I am not sure it is worth it all that much because there are not
that many PMDs allocated typically.
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-06 17:07 ` Andi Kleen
@ 2006-11-06 17:12 ` Hugh Dickins
2006-11-06 17:15 ` Christoph Lameter
1 sibling, 0 replies; 83+ messages in thread
From: Hugh Dickins @ 2006-11-06 17:12 UTC (permalink / raw)
To: Andi Kleen
Cc: Christoph Lameter, Andrew Morton, Mel Gorman, Andy Whitcroft,
Nick Piggin, KAMEZAWA Hiroyuki, Linux Memory Management List,
Peter Zijlstra
On Mon, 6 Nov 2006, Andi Kleen wrote:
> On Monday 06 November 2006 18:00, Christoph Lameter wrote:
> > On Mon, 6 Nov 2006, Andi Kleen wrote:
> >
> > > > Because acceses to the structure can occur after kfree. The RCU
> > > > implementation only delays the destruction of the slab. Locks are always
> > > > in a definite state regardless if the object is in use or not.
> > >
> > > Only objects that have been used at least once can be still visible. And
> > > those would be still constructed of course -- just after the kmem_cache_alloc,
> > > not inside. For those that have never been used it shouldn't matter.
> >
> > Constructors are only called on allocation of the slab, not on
> > kmem_cache_alloc.
>
> I know this.
>
> > And you are right: It does not matter for those that
> > have never been used.
>
> This means it is fine to replace the constructor with an function
> that runs after kmem_cache_alloc() in this case.
But where will you do the spin_lock_init?
Hugh
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-06 17:07 ` Andi Kleen
2006-11-06 17:12 ` Hugh Dickins
@ 2006-11-06 17:15 ` Christoph Lameter
2006-11-06 17:20 ` Andi Kleen
1 sibling, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-06 17:15 UTC (permalink / raw)
To: Andi Kleen
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
On Mon, 6 Nov 2006, Andi Kleen wrote:
> I know this.
>
> > And you are right: It does not matter for those that
> > have never been used.
>
> This means it is fine to replace the constructor with a function
> that runs after kmem_cache_alloc() in this case.
No it's not. RCU means that there are potential accesses after an object has
been freed and even after an object has been reallocated via
kmem_cache_alloc. A function that runs after kmem_cache_alloc() may
mess up the lock state.
> What I meant: some time ago i had patches to add a __GFP_ZERO queue to the
> page allocator. The page allocator would handle all this for everybody.
> For various reasons they never got pushed.
Yup that was probably my patchset. The problem was that I could not make
the case that this was beneficial if all cache lines of a page were
touched. It was a significant performance benefit only for sparsely
accessed pages.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-06 17:15 ` Christoph Lameter
@ 2006-11-06 17:20 ` Andi Kleen
2006-11-06 17:26 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Andi Kleen @ 2006-11-06 17:20 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
> No it's not. RCU means that there are potential accesses after an object has
> been freed and even after an object has been reallocated via
> kmem_cache_alloc. A function that runs after kmem_cache_alloc() may
> mess up the lock state.
Ok, got it. How messy.
From my previous slab experiences I predict it will not work anymore in less than
half a year. Such fragile constructions never tend to hold long.
> > What I meant: some time ago i had patches to add a __GFP_ZERO queue to the
> > page allocator. The page allocator would handle all this for everybody.
> > For various reasons they never got pushed.
>
> Yup that was probably my patchset.
That was an own patch by me. But it was pretty obvious so I'm sure
others had the same idea.
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-06 17:20 ` Andi Kleen
@ 2006-11-06 17:26 ` Christoph Lameter
0 siblings, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-06 17:26 UTC (permalink / raw)
To: Andi Kleen
Cc: Andrew Morton, Mel Gorman, Andy Whitcroft, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List, Peter Zijlstra
On Mon, 6 Nov 2006, Andi Kleen wrote:
> From my previous slab experiences I predict it will not work anymore in less than
> half a year. Such fragile constructions never tend to hold long.
This has been around for quite some time. The signalhand cache uses the
same method.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 21:42 ` Christoph Lameter
2006-11-03 21:50 ` Andrew Morton
@ 2006-11-07 16:30 ` Mel Gorman
2006-11-07 17:54 ` Christoph Lameter
2006-11-13 21:08 ` Mel Gorman
1 sibling, 2 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-07 16:30 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Fri, 3 Nov 2006, Christoph Lameter wrote:
> On Fri, 3 Nov 2006, Mel Gorman wrote:
>
> > > I think we need something like what is done here via anti-frag but I
> > > wish
> > > it would be more generic and not solely rely on reclaim to get pages
> > > freed
> > > up.
> > >
> >
> > How could it have been made more generic? Fundamentally, all we are
> > doing at
> > the moment is using the freelists to cluster types of pages together. We
> > only
> > depend on reclaim now. If we get the clustering part done, I can start
> > working
> > on the page migration part.
>
> Right, let's have a special freelist for unreclaimable/unmovable pages. I think
> that we agree on that. Somehow we need to be able to ensure that
> unXXXable pages do not end up in sections of the zone where we allow
> memory hotplug.
>
Ok, I redid the patches (calling it Page Clustering to distinguish between
anti-frag) to split between two types of pages
o Unmovable and unreclaimable pages
o Movable pages and those slab pages that are reclaimable
The per-cpu allocator could only be used for the second type of pages
(i.e. Movable/Reclaimable pages) to avoid leaking or expanding the
per-cpu structures.
I thought it would work ok but have a tendency to reap caches aggressively
to satisfy hugepage allocations. I was wrong; its success rate for
hugepage allocations was even lower than the normal allocator's on ppc64. On
x86_64, it was a bit better than the standard allocator, probably because of
the smaller hugepage size, but nowhere near as successful as
anti-fragmentation.
To guess what went wrong, I examined the slabinfo on ppc64 after the tests,
I found that even though slab was reaped aggressively, 105 pages were still
in use by the ext3 inode cache. As there are only 62 hugepages on the
system, it's easy to see why they would end up in awkward locations.
Here are some results at the end of tests when the system is back at rest.
They are the percentage of memory that can be allocated as hugepages after a
series of tests that cause fragmentation. The system is a ppc64 with 1GiB of
RAM
Standard allocator: 9%
Page Clustering: 3%
Anti-fragmentation: 56%
Here are the results on x86_64 which has a much smaller hugepage size
Standard allocator: 21%
Page Clustering: 33%
Anti-fragmentation: 56%
I know from previous experience that keeping slab pages with unmovable pages
does not work very well because updatedb or memory pressure tends to cause
problems.
Hence, I'm still convinced that slab pages for caches like inode and
short-lived allocations need to be clustered separately.
> At some later point we would like to have the ability to redirect
> unXXXable allocations to another node if the node is hot pluggable.
>
That can be done separately.
> > > Also the duplication of the page struct caches worries me because it
> > > reduces the hit rate.
> >
> > do you mean the per-cpu caches? If so, without clustering in the per-cpu
> > caches, unmovable allocations would "leak" into blocks used for movable
> > allocations.
>
> I mean the per cpu caches and I think you could just bypass the per cpu
> caches for unXXXable pages. Kernel pages are buffered already in the slab
> allocator and other kernel allocations are probably rare enough.
>
I'll revisit the patches again, cluster slab caches separately but only
allow movable allocations to go through per-cpu and see what happens.
> > > Removing the intermediate type would reduce the page
> > > caches to 2.
> >
> > And significantly reduce the effectiveness of the clustering in the
> > process.
>
> Are you sure about this?
I think the figures support me on this.
> It seems that the intermediate type is
> reclaimable and you already allow "reclaimable" pages to be not
> reclaimable
> (mlock'ed pages).
I know, but not because I'm happy about it. As mlock() can be called after a
fault, knowledge of the future is required to keep all mlocked pages out of
"reclaimable" areas. It's similar for pages pinned by RDMA. To address
mlock() pages properly, page migration needs to be used.
I can create a patch later to handle the case where a page is being faulted
in due to mlock() and not to mark it "reclaimable" if it's known that they
are the majority of mlocked pages.
> If you run into trouble with the reclaimable slab pages
> in the reclaimable zone then you could do aggressive slab reclaim to remedy
> the situation.
>
That made intuitive sense so I checked it out but doesn't work in practice
even when slabs are aggressively reaped.
> > > And maybe we do not need caches for unreclaimable/unmovable
> > > pages? slab already does its own buffering there.
> > That is true. If it is a problem, what could be done is have a per-cpu
> > cache
> > for movable and unmovable allocations. Then have the __GFP_KERNRCLM
> > allocations bypass the per-cpu allocator altogether and go straight to
> > the
> > buddy allocator.
>
> Right. Maybe we can get away with leaving the pageset cpu caches
> untouched? On our largest systems with 1k nodes 4k cpus we currently have
> 4 zones * 4096 cpus * 1024 nodes = 16 million pagesets. Each of those has
> hot and cold yielding 32 million lists. Now we're going to triplicate that to
> 192 million lists and we also increase the size of the structure.
>
I can see the problem with expanding the per-cpu structures. I'll check
out what happens when per-cpu caches are only used for movable
allocations. This is the way things were in an earlier version of
anti-fragmentation but I do not have figures any more.
> With the code currently in 2.6.19 we go from 4 to 2 zones. So we have only
> 16 million pagesets. With the optional DMA in mm we got from 16 to 8
> million pagesets. This effectively undoes the optimizations done in .19
> .20.
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-07 16:30 ` Mel Gorman
@ 2006-11-07 17:54 ` Christoph Lameter
2006-11-07 18:14 ` Mel Gorman
2006-11-13 21:08 ` Mel Gorman
1 sibling, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-07 17:54 UTC (permalink / raw)
To: Mel Gorman
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Tue, 7 Nov 2006, Mel Gorman wrote:
> Hence, I'm still convinced that slab pages for caches like inode and
> short-lived allocations need to be clustered separately.
So the problem seems to be that some slab of "reclaimable" slabs are
not reclaimable at all even with the most aggressive approach?
Then we have a fundamental issue that we are unable to categorize
pages correctly. EasyReclaimable pages may be unreclaimable because they
are mlocked. Reclaimable (such as slab pages) may turn out to be not
reclaimable because some entries are pinned.
I think we will run into the same issues for EasyReclaim once an
application generates a sufficient amount of mlocked pages that are
placed all over the memory of interest.
Could it be that the only reason that the current approach works is that
we have not tested with an application that behaves this way?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-07 17:54 ` Christoph Lameter
@ 2006-11-07 18:14 ` Mel Gorman
2006-11-08 0:29 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 83+ messages in thread
From: Mel Gorman @ 2006-11-07 18:14 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Tue, 7 Nov 2006, Christoph Lameter wrote:
> On Tue, 7 Nov 2006, Mel Gorman wrote:
>
>> Hence, I'm still convinced that slab pages for caches like inode and
>> short-lived allocations need to be clustered separetly.
>
> So the problem seems to be that some slab of "reclaimable" slabs are
> not reclaimable at all even with the most aggressive approach?
>
Right. You may be able to shrink the slab cache considerably, but still
not empty it. By clustering the pages together, shrinking all the caches
has a chance of freeing up high order pages but there is no guarantee of
course.
> Then we have a fundamental issue that we are unable to categorize
> pages correctly. EasyReclaimable pages may be unreclaimable because they
> are mlocked.
They are migratable though. In the patchset I am currently working on, I
identify pages as Movable, Reclaimable and Unmovable. The redefinitions
are a bit more logical (especially for mlock) and move away from the idea
of page reclaim being the only way of getting high order allocations to
succeed.
> Reclaimable (such as slab pages) may turn out to be not
> reclaimable because some entries are pinned.
>
yep. That will hurt hugepage allocations in those blocks but it should
help allocations required for network cards with large MTUs for example.
> I think we will run into the same issues for EasyReclaim once an
> application generates a sufficient amount of mlocked pages that are
> placed all over the memory of interest.
>
Yep, I agree. At that point, migration will be required but the clustering
will be in place so that moving all the "movable" pages will result in
large contiguous free pages.
> Could it be that the only reason that the current approach works is that
> we have not tested with an application that behaves this way?
>
Probably. The applications I currently test are not mlocking. The tests
currently run workloads that are known to leave the system in a fragmented
state when they complete. In this situation, higher-order allocations fail
even when nothing is running and there are no mlocked() pages on the
standard allocator.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-07 18:14 ` Mel Gorman
@ 2006-11-08 0:29 ` KAMEZAWA Hiroyuki
2006-11-08 2:08 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-11-08 0:29 UTC (permalink / raw)
To: Mel Gorman; +Cc: clameter, apw, akpm, nickpiggin, linux-mm, a.p.zijlstra
On Tue, 7 Nov 2006 18:14:31 +0000 (GMT)
Mel Gorman <mel@csn.ul.ie> wrote:
> > Could it be that the only reason that the current approach works is that
> > we have not tested with an application that behaves this way?
> >
>
> Probably. The applications I currently test are not mlocking. The tests
> currently run workloads that are known to leave the system in a fragmented
> state when they complete. In this situation, higher-order allocations fail
> even when nothing is running and there are no mlocked() pages on the
> standard allocator.
>
Recently, I've struggled with a crashdump from a user to investigate the reason
for an oom-kill. In the end, the reason was that most of the 2GB of ZONE_DMA pages were
mlocked(). Sigh....
I wonder we can use migration of MOVABLE pages for zone balancing in future.
(maybe complicated but...)
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-08 0:29 ` KAMEZAWA Hiroyuki
@ 2006-11-08 2:08 ` Christoph Lameter
0 siblings, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-08 2:08 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Mel Gorman, apw, akpm, nickpiggin, linux-mm, a.p.zijlstra
On Wed, 8 Nov 2006, KAMEZAWA Hiroyuki wrote:
> In these days, I've struggled with crashdump from a user to investigate the reason
> of oom-kill. At last, the reason was most of 2G bytes ZONE_DMA pages were
> mlocked(). Sigh....
> I wonder we can use migration of MOVABLE pages for zone balancing in future.
> (maybe complicated but...)
If we run out of ZONE_DMA memory in the page allocator then scan through
the LRU of ZONE_DMA for pages, call isolate_lru_page() for each page that
you find worthy of moving (all mlocked pages f.e.) and when you have
collected a sufficient quantity call migrate_pages() to get all that are
movable out of ZONE_DMA.
Note though that any writeback of the migrated pages to devices that
require pages <2G will then allocate a bounce buffer for the page.
Seems that you found another reason why it would be useful to get
rid of ZONE_DMA entirely.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-07 16:30 ` Mel Gorman
2006-11-07 17:54 ` Christoph Lameter
@ 2006-11-13 21:08 ` Mel Gorman
1 sibling, 0 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-13 21:08 UTC (permalink / raw)
To: Christoph Lameter
Cc: Andy Whitcroft, Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki,
Linux Memory Management List, Peter Zijlstra
On Tue, 7 Nov 2006, Mel Gorman wrote:
>>
>> Right. Maybe we can get away with leaving the pageset cpu caches
>> untouched? On our largest systems with 1k nodes 4k cpus we currently have
>> 4 zones * 4096 cpus * 1024 nodes = 16 million pagesets. Each of those has
>> hot and cold yielding 32 million lists. Now we're going to triplicate that to
>> 192 million lists and we also increase the size of the structure.
>>
>
> I can see the problem with expanding the per-cpu structures. I'll check out
> what happens when per-cpu caches are only used for movable allocations. This
> is the way things were in an earlier version of anti-fragmentation but I do
> not have figures any more.
>
This was harder to get right than expected.
Using the per-cpu allocator for only movable allocations led to
considerable regressions - 2.5% on PPC64 for kbuild and the x86_64 figures
were showing close to 2% regression. AIM9 results showed significant
regressions in places, even on machines that normally give reliable AIM9
results. Non-movable allocations are frequent enough that forcing them to
not use the per-cpu allocator has a noticeable impact.
However, I think I have a reasonable compromise. Pages on the per-cpu
lists are not using page->private so the type of page can be stored in
that field (i.e. Movable, Reclaimable, Unmovable). On allocation, the list
is searched and the hotest page of the appropriate type is used, else
rmqueue_bulk() is called. This stops the per-cpu allocator from "leaking"
pages into undesirable areas without requiring larger per-cpu structures.
As care is taken to preserve the hotness of the pages and the page
structures tend to be cache hot anyway, regressions should be very minor
if detectable overall at all. What I've found in initial tests is that
slight increases in time spent in the system are offset by reduced time
spent in userspace so results tend to be within 0.2% of each other.
I'll rebase the patches to the latest -mm tree, run a set of tests to make
sure it's working as expected and post a new set of patches
>> With the code currently in 2.6.19 we go from 4 to 2 zones. So we have only
>> 16 million pagesets. With the optional DMA in mm we got from 16 to 8
>> million pagesets. This effectively undoes the optimizations done in .19
>> .20.
>>
>
> --
> Mel Gorman
> Part-time Phd Student Linux Technology Center
> University of Limerick IBM Dublin Software Lab
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 21:52 ` Christoph Lameter
2006-11-02 22:37 ` Mel Gorman
@ 2006-11-03 12:48 ` Peter Zijlstra
2006-11-03 18:15 ` Christoph Lameter
1 sibling, 1 reply; 83+ messages in thread
From: Peter Zijlstra @ 2006-11-03 12:48 UTC (permalink / raw)
To: Christoph Lameter
Cc: Mel Gorman, Andy Whitcroft, Andrew Morton, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List
On Thu, 2006-11-02 at 13:52 -0800, Christoph Lameter wrote:
> On Thu, 2 Nov 2006, Mel Gorman wrote:
>
> > Ok... list-based anti-frag identified three types of pages. From the leading
> > mail;
> >
> > EasyReclaimable - These are userspace pages that are easily reclaimable. This
> > flag is set when it is known that the pages will be trivially
> > reclaimed
> > by writing the page out to swap or syncing with backing storage
> >
> > KernelReclaimable - These are allocations for some kernel caches that are
> > reclaimable or allocations that are known to be very short-lived.
> >
> > KernelNonReclaimable - These are pages that are allocated by the kernel that
> > are not trivially reclaimed. For example, the memory allocated for a
> > loaded module would be in this category. By default, allocations are
> > considered to be of this type
> >
> > The EasyReclaimable and KernelReclaimable allocations are marked with __GFP
> > flags.
> >
> > Now, you want to separate pages according to movable and unmovable. Broadly
> > speaking, EasyReclaimable == Movable and
> > KernelReclaimable+KernelNonReclaimable == Non-Movable. However, while
> > KernelReclaimable are Non-Movable, they can be reclaimed by purging caches.
> > So, if we redefined the three terms to be Movable, Reclaimable and
> > Non-Movable, you get the separation you are looking for at least within a
> > MAX_ORDER_NR_PAGES.
>
> I think talking about reclaim here is not what you want.
I think it is; all of this only matters at the moment you want to
allocate a large page, at that time you need to reclaim memory to
satisfy the request. (There is some hysteresis between alloc and
reclaim; but lets ignore that for a moment.)
So, the basic operation is reclaim, make it succeed in freeing up the
requested order page (with the least possible disturbance to the rest).
Anti-fragmentation as mel now has it increases the success rate; lumpy
reclaim decreases the collateral damage.
Defrag could contribute to this by moving otherwise un-reclaimable pages
to an lower order free page, so that reclaim of a higher order page can
succeed.
> defragmentation
> is fundamentally about moving memory, not reclaim. Reclaim is a way of
> evicting pages from memory to avoid the move. This may be useful if memory
> is filled up because defragging can then do what swapping would have to
> do. However, evicting pages means that they have to be reread. Page
> migration can migrate pages at 1GB/sec which is certainly much higher
> than having to reread the page.
Moving memory about is not the point; although it might come in handy;
its freeing linear chunks of memory without disturbing too much.
> Also I think the reclaim idea breaks down in the following cases:
>
> 1. An mlocked page. This is a page that is movable but not reclaimable.
> How does defrag
NOTE: its anti-fragmentation; not de-fragmentation; the emphasis is on
avoiding fragments; not coalescing them.
> handle that case right now? It should really move the
> page if necessary.
Sure, defrag or rather move_pages() could be rather useful.
> 2. There are a number of unreclaimable page types that are easily movable.
> F.e. page table pages are movable if you take a write-lock on mmap_sem
> and handle the tree carefully. These pages again are not reclaimable but
> they are movable.
>
> Various caching objects in the slab (cpucache align cache etc) are also
> easily movable. If we put them into a separate slab cache then we could
> make them movable.
>
> Certain device drivers may be able to shut down intermittently, releasing
> their memory and reallocating it later. This also may be used to move
> memory. Memory allocated by such a device driver is movable.
The ability to move pages about that are otherwise unreclaimable does
indeed open up a new class of pages. But moving pages about is not the
main purpose; attaining linear free pages with the least amount of
collateral damage is.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 12:48 ` Peter Zijlstra
@ 2006-11-03 18:15 ` Christoph Lameter
2006-11-03 18:53 ` Peter Zijlstra
0 siblings, 1 reply; 83+ messages in thread
From: Christoph Lameter @ 2006-11-03 18:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mel Gorman, Andy Whitcroft, Andrew Morton, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List
On Fri, 3 Nov 2006, Peter Zijlstra wrote:
> > I think talking about reclaim here is not what you want.
>
> I think it is; all of this only matters at the moment you want to
> allocate a large page, at that time you need to reclaim memory to
> satisfy the request. (There is some hysteresis between alloc and
> reclaim; but lets ignore that for a moment.)
That is wrong. Dropping pages that will later have to be reread is not
good. It is better to defrag by moving pages.
> So, the basic operation is reclaim, make it succeed in freeing up the
> requested order page (with the least possible disturbance to the rest).
It may lead to rereading of the page.
> Moving memory about is not the point; although it might come in handy;
> its freeing linear chunks of memory without disturbing too much.
Freeing pages that we have to reread?
> Sure, defrag or rather move_pages() could be rather useful.
We have migrate_pages() for exactly that purpose.
> The ability to move pages about that are otherwise unreclaimable does
> indeed open up a new class of pages. But moving pages about is not the
> main purpose; attaining linear free pages with the least amount of
> collateral damage is.
IMHO Moving pages creates less collateral damage than evicting
random pages.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 18:15 ` Christoph Lameter
@ 2006-11-03 18:53 ` Peter Zijlstra
2006-11-03 19:23 ` Christoph Lameter
0 siblings, 1 reply; 83+ messages in thread
From: Peter Zijlstra @ 2006-11-03 18:53 UTC (permalink / raw)
To: Christoph Lameter
Cc: Mel Gorman, Andy Whitcroft, Andrew Morton, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List
On Fri, 2006-11-03 at 10:15 -0800, Christoph Lameter wrote:
> On Fri, 3 Nov 2006, Peter Zijlstra wrote:
>
> > > I think talking about reclaim here is not what you want.
> >
> > I think it is; all of this only matters at the moment you want to
> > allocate a large page, at that time you need to reclaim memory to
> > satisfy the request. (There is some hysteresis between alloc and
> > reclaim; but lets ignore that for a moment.)
>
> That is wrong. Dropping pages that will later have to be reread is not
> good. It is better to defrag by moving pages.
> > The ability to move pages about that are otherwise unreclaimable does
> > indeed open up a new class of pages. But moving pages about is not the
> > main purpose; attaining linear free pages with the least amount of
> > collateral damage is.
>
> IMHO Moving pages creates less collateral damage than evicting
> random pages.
Move them where?, you have to drop pages anyway, the only thing
migrate_pages() (my bad for calling it move_pages()) might help with is
preserving LRU order better and the possibility to move otherwise
unreclaimable pages to a more favourable position (page-tables,
mlock'ed).
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-03 18:53 ` Peter Zijlstra
@ 2006-11-03 19:23 ` Christoph Lameter
0 siblings, 0 replies; 83+ messages in thread
From: Christoph Lameter @ 2006-11-03 19:23 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mel Gorman, Andy Whitcroft, Andrew Morton, Nick Piggin,
KAMEZAWA Hiroyuki, Linux Memory Management List
On Fri, 3 Nov 2006, Peter Zijlstra wrote:
> > IMHO Moving pages creates less collateral damage than evicting
> > random pages.
>
> Move them where?, you have to drop pages anyway, the only thing
> migrate_pages() (my bad for calling it move_pages()) might help with is
> preserving LRU order better and the possibility to move otherwise
> unreclaimable pages to a more favourable position (page-tables,
> mlock'ed).
If you do not have any memory left then we run reclaim to free pages.
We are dealing here with the issue that we have enough memory free but
it is not contiguous. So we have other places where we could put these
pages.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-01 22:10 ` Mel Gorman
2006-11-02 17:37 ` Andy Whitcroft
@ 2006-11-02 18:52 ` Andrew Morton
2006-11-02 21:51 ` Mel Gorman
2006-11-02 22:03 ` Andy Whitcroft
1 sibling, 2 replies; 83+ messages in thread
From: Andrew Morton @ 2006-11-02 18:52 UTC (permalink / raw)
To: Mel Gorman
Cc: Nick Piggin, Christoph Lameter, KAMEZAWA Hiroyuki,
Andy Whitcroft, Linux Memory Management List
On Wed, 1 Nov 2006 22:10:02 +0000 (GMT)
Mel Gorman <mel@csn.ul.ie> wrote:
> On Wed, 1 Nov 2006, Andrew Morton wrote:
>
> > On Wed, 1 Nov 2006 18:26:05 +0000
> > mel@skynet.ie (Mel Gorman) wrote:
> >
> >> I never really got this objection. With list-based anti-frag, the
> >> zone-balancing logic remains the same. There are patches from Andy
> >> Whitcroft that reclaims pages in contiguous blocks, but still with the same
> >> zone-ordering. It doesn't affect load balancing between zones as such.
> >
> > I do believe that lumpy-reclaim (initiated by Andy, redone and prototyped
> > by Peter, cruelly abandoned) is a preferable approach to solving the
> > fragmentation approach.
> >
>
> On their own, lumpy-reclaim or linear-reclaim were not enough to get
> MAX_ORDER_NR_PAGES blocks of contiguous pages and these were of interest
> for huge pages although not necessarily of much use to memory hot-unplug.
I'm interested in lumpy-reclaim as a simple solution to the
e1000-cant-allocate-an-order-2-page problem, rather than for hugepages.
ie: a bugfix, not a feature..
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 18:52 ` Andrew Morton
@ 2006-11-02 21:51 ` Mel Gorman
2006-11-02 22:03 ` Andy Whitcroft
1 sibling, 0 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-02 21:51 UTC (permalink / raw)
To: Andrew Morton
Cc: Nick Piggin, Christoph Lameter, KAMEZAWA Hiroyuki,
Andy Whitcroft, Linux Memory Management List
On Thu, 2 Nov 2006, Andrew Morton wrote:
> On Wed, 1 Nov 2006 22:10:02 +0000 (GMT)
> Mel Gorman <mel@csn.ul.ie> wrote:
>
>> On Wed, 1 Nov 2006, Andrew Morton wrote:
>>
>>> On Wed, 1 Nov 2006 18:26:05 +0000
>>> mel@skynet.ie (Mel Gorman) wrote:
>>>
>>>> I never really got this objection. With list-based anti-frag, the
>>>> zone-balancing logic remains the same. There are patches from Andy
>>>> Whitcroft that reclaims pages in contiguous blocks, but still with the same
>>>> zone-ordering. It doesn't affect load balancing between zones as such.
>>>
>>> I do believe that lumpy-reclaim (initiated by Andy, redone and prototyped
>>> by Peter, cruelly abandoned) is a preferable approach to solving the
>>> fragmentation problem.
>>>
>>
>> On its own lumpy-reclaim or linear-reclaim were not enough to get
>> MAX_ORDER_NR_PAGES blocks of contiguous pages and these were of interest
>> for huge pages although not necessarily of much use to memory hot-unplug.
>
> I'm interested in lumpy-reclaim as a simple solution to the
> e1000-cant-allocate-an-order-2-page problem, rather than for hugepages.
>
> ie: a bugfix, not a feature..
>
Ah... right.
Well, lumpy reclaim is still taking pages from the LRU so they are
EasyReclaimable pages. You want some sort of chance that other
EasyReclaimable pages are adjacent to it and anti-frag should increase
your chances significantly.
I already hear "ah, but you start fragmenting then". This is true, but the
assumption is that these high-order allocations are relatively
short-lived. An order-2 allocation for network buffers will fall back into
an EasyReclaimable area but in the longer term, it shouldn't matter.
Additionally, with anti-frag, it *might* make sense to start refilling the
per-cpu caches with high-order allocations that are split up instead of
pcp->batch order-0 allocations. This should increase the chances of
adjacent pages being freed up within the KernelReclaimable areas as well
as the EasyReclaimable areas which should help the e1000 problem.
It is hard to prove definitively but here is the reasoning. Assuming
network buffers and cache allocations are marked KernelReclaimable, they
will tend to cluster together in MAX_ORDER_NR_PAGES blocks with anti-frag.
Cache allocations tend to be order-0 so are satisfied from the per-cpu
caches so they tend to be adjacent if the per-cpu caches are filled in
batch. High-order allocations will also tend to be clustered together
because of the way buddy splitting works. So, if the network is busy and
allocating buffers, there would have to be really significant memory
pressure before other allocations start using up the high-order free
blocks in the KernelReclaimable MAX_ORDER_NR_PAGES blocks.
This needs a workload that hits that e1000 problem reliably to figure out.
Do you have any suggestion?
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 18:52 ` Andrew Morton
2006-11-02 21:51 ` Mel Gorman
@ 2006-11-02 22:03 ` Andy Whitcroft
2006-11-02 22:11 ` Andrew Morton
1 sibling, 1 reply; 83+ messages in thread
From: Andy Whitcroft @ 2006-11-02 22:03 UTC (permalink / raw)
To: Andrew Morton
Cc: Mel Gorman, Nick Piggin, Christoph Lameter, KAMEZAWA Hiroyuki,
Linux Memory Management List
Andrew Morton wrote:
> On Wed, 1 Nov 2006 22:10:02 +0000 (GMT)
> Mel Gorman <mel@csn.ul.ie> wrote:
>
>> On Wed, 1 Nov 2006, Andrew Morton wrote:
>>
>>> On Wed, 1 Nov 2006 18:26:05 +0000
>>> mel@skynet.ie (Mel Gorman) wrote:
>>>
>>>> I never really got this objection. With list-based anti-frag, the
>>>> zone-balancing logic remains the same. There are patches from Andy
>>>> Whitcroft that reclaims pages in contiguous blocks, but still with the same
>>>> zone-ordering. It doesn't affect load balancing between zones as such.
>>> I do believe that lumpy-reclaim (initiated by Andy, redone and prototyped
>>> by Peter, cruelly abandoned) is a preferable approach to solving the
>>> fragmentation problem.
>>>
>> On its own lumpy-reclaim or linear-reclaim were not enough to get
>> MAX_ORDER_NR_PAGES blocks of contiguous pages and these were of interest
>> for huge pages although not necessarily of much use to memory hot-unplug.
>
> I'm interested in lumpy-reclaim as a simple solution to the
> e1000-cant-allocate-an-order-2-page problem, rather than for hugepages.
>
> ie: a bugfix, not a feature..
Is there a description of the problem and particularly of the
allocation patterns here? Particularly key is the level
of memory pressure when we are allocating these higher orders.
Lumpy reclaim and, to a lesser extent, linear reclaim are less effective
when memory pressure is severe, so we may not see the hoped-for benefit.
Most of the benchmarking we have done is for higher order pages
and this effect may well be less at lower order.
-apw
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-11-02 22:03 ` Andy Whitcroft
@ 2006-11-02 22:11 ` Andrew Morton
0 siblings, 0 replies; 83+ messages in thread
From: Andrew Morton @ 2006-11-02 22:11 UTC (permalink / raw)
To: Andy Whitcroft
Cc: Mel Gorman, Nick Piggin, Christoph Lameter, KAMEZAWA Hiroyuki,
Linux Memory Management List
On Thu, 02 Nov 2006 22:03:30 +0000
Andy Whitcroft <apw@shadowen.org> wrote:
> >> On its own lumpy-reclaim or linear-reclaim were not enough to get
> >> MAX_ORDER_NR_PAGES blocks of contiguous pages and these were of interest
> >> for huge pages although not necessarily of much use to memory hot-unplug.
> >
> > I'm interested in lumpy-reclaim as a simple solution to the
> > e1000-cant-allocate-an-order-2-page problem, rather than for hugepages.
> >
> > ie: a bugfix, not a feature..
>
>
> Is there a description of the problem and particularly of the
> allocation patterns here?
I guess we see maybe a couple of reports a month. The driver tries to
allocate an order-2 page from atomic context and there aren't any so a
warning gets spat out and people complain. The usual "fix" is to increase
min_free_kbytes. Try executing google(e1000 min_free_kbytes);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-28 2:31 ` Christoph Lameter
2006-10-28 4:43 ` Andrew Morton
@ 2006-11-01 18:13 ` Mel Gorman
1 sibling, 0 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-01 18:13 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andrew Morton, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On (27/10/06 19:31), Christoph Lameter didst pronounce:
> On Fri, 27 Oct 2006, Andrew Morton wrote:
>
> > We need some way of preventing unreclaimable kernel memory allocations from
> > using certain physical pages. That means zones.
>
> Well then we may need zones for defragmentation and zeroed pages as well
> etc etc. The problem is that such things make the VM much more
> complex and not simpler and faster.
>
You don't need new zones for defragmentation and pre-zeroed pages. I reposted
the anti-fragmentation patches which create sub-zone-freelists for pages
of each type of reclaimability. Previously, an additional list existed for
pre-zeroed pages but I don't think I ever showed a performance improvement
with them so I dropped them after a while.
> > > Memory hot unplug
> > > seems to have been dropped in favor of balloons.
> >
> > Has it? I don't recall seeing a vague proposal, let alone an implementation?
>
> That is the impression that I got at the OLS. There were lots of talks
> about balloon approaches.
>
Memory hot-unplug is not quite dead but there not everything existed that
was required to really make it work. The most obvious problem was that kernel
allocations were in the middle of the region you were trying to unplug. The
anti-fragmentation patches introduce a __GFP_EASYRCLM flag that can be used
to flag allocations that can be really reclaimed.
Patches also exist to create a zone for hot-unplug but sizing it at boot
time was a total mess. This is a lot easier with architecture-independent
zone-sizing and I can bring forward some patches if people want to take a
look. However, no infrastructure exists for moving memory between zones or
choosing what zone to hot-add memory to.
Power at least is able to hot-remove a MAX_ORDER_NR_PAGES block of pages and
give it back to the hypervisor (AFAIK, could be wrong) but fragmentation was
a problem. List-based anti-fragmentation was shown a long time ago to improve
the success rates of a memory-unplug but I haven't tried in a long time.
> > Userspace allocations are reclaimable: pagecache, anonymous memory. These
> > happen to be allocated with __GFP_HIGHMEM set.
>
> On certain platforms yes.
>
The list-based anti-fragmentation patches flag the really-reclaimable
allocations as __GFP_EASYRCLM regardless of what zone they are allocated
from. mlock() is a problem but page migration could address it.
> > So right now __GFP_HIGHMEM is an excellent hint telling the page allocator
> > that it is safe to satisfy this request from removeable memory.
>
> OK this works on i386 but most other platforms wont have a highmem
> zone.
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: Page allocator: Single Zone optimizations
2006-10-26 22:09 ` Andrew Morton
2006-10-26 22:28 ` Christoph Lameter
2006-10-28 1:00 ` Christoph Lameter
@ 2006-11-01 17:39 ` Mel Gorman
2 siblings, 0 replies; 83+ messages in thread
From: Mel Gorman @ 2006-11-01 17:39 UTC (permalink / raw)
To: Andrew Morton; +Cc: Christoph Lameter, Nick Piggin, KAMEZAWA Hiroyuki, linux-mm
On (26/10/06 15:09), Andrew Morton didst pronounce:
> On Mon, 23 Oct 2006 16:08:20 -0700 (PDT)
> Christoph Lameter <clameter@sgi.com> wrote:
>
> > Single Zone Optimizations V2
> >
> > V1->V2 Use a config variable set up in mm/Kconfig
> >
> > If we only have a single zone then various macros can be optimized.
> > We do not need to protect higher zones, we know that zones are
> > always present, can remove useless data from /proc etc etc. Various
> > code paths become unnecessary with a single zone setup.
>
> I don't know about all of this. It's making core mm increasingly revolting
> and increases dissimilarities between different kernel builds and generally
> makes it harder for us to remotely diagnose and solve people's bug reports.
> Harder to understand architecture A's behaviour based upon one's knowledge
> of architecture B, etc.
>
> I really really want to drop all those patches[1] and rethink it all.
>
> Like... would it make sense to eliminate the hard-coded concepts of DMA,
> DMA32, NORMAL and HIGHMEM and simply say "we support 1 to N zones" per
> node? Obviously we'd need to keep the DMA/NORMAL/HIGHMEM nomenclature in
> the interfaces so the rest of the kernel builds and works, but the core mm
> just shouldn't need to care: all it cares about is one or more zones.
>
This feels vaguely similar to http://lkml.org/lkml/2001/6/7/117 . The
basic idea is that zones would be dynamically created at runtime and the
allowable GFP flags would be registered for a zone and zonelists built
based on that. I'm not saying this is the right thing to do, but it's not
the first time this has come up for one reason or another.
>
>
> Or something like that. Something which makes the mm easier to understand,
> easier to maintain and faster. Rather than harder to understand, harder to
> maintain and faster.
>
>
>
>
>
>
> [1] These:
>
> get-rid-of-zone_table.patch
> deal-with-cases-of-zone_dma-meaning-the-first-zone.patch
> get-rid-of-zone_table-fix-3.patch
> introduce-config_zone_dma.patch
> optional-zone_dma-in-the-vm.patch
> optional-zone_dma-in-the-vm-no-gfp_dma-check-in-the-slab-if-no-config_zone_dma-is-set.patch
> optional-zone_dma-for-ia64.patch
> remove-zone_dma-remains-from-parisc.patch
> remove-zone_dma-remains-from-sh-sh64.patch
> set-config_zone_dma-for-arches-with-generic_isa_dma.patch
> zoneid-fix-up-calculations-for-zoneid_pgshift.patch
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 83+ messages in thread
end of thread, other threads:[~2006-11-13 21:08 UTC | newest]
Thread overview: 83+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-10-17 0:50 Page allocator: Single Zone optimizations Christoph Lameter
2006-10-17 1:10 ` Andrew Morton
2006-10-17 1:13 ` Christoph Lameter
2006-10-17 1:27 ` KAMEZAWA Hiroyuki
2006-10-17 1:25 ` Christoph Lameter
2006-10-17 6:04 ` Nick Piggin
2006-10-17 17:54 ` Christoph Lameter
2006-10-18 11:15 ` Nick Piggin
2006-10-18 19:38 ` Andrew Morton
2006-10-23 23:08 ` Christoph Lameter
2006-10-24 1:07 ` Christoph Lameter
2006-10-26 22:09 ` Andrew Morton
2006-10-26 22:28 ` Christoph Lameter
2006-10-28 1:00 ` Christoph Lameter
2006-10-28 2:04 ` Andrew Morton
2006-10-28 2:12 ` Christoph Lameter
2006-10-28 2:24 ` Andrew Morton
2006-10-28 2:31 ` Christoph Lameter
2006-10-28 4:43 ` Andrew Morton
2006-10-28 7:47 ` KAMEZAWA Hiroyuki
2006-10-28 16:12 ` Andi Kleen
2006-10-29 0:48 ` Christoph Lameter
2006-10-29 1:04 ` Andrew Morton
2006-10-29 1:29 ` Christoph Lameter
2006-10-29 11:32 ` Nick Piggin
2006-10-30 16:41 ` Christoph Lameter
2006-11-01 18:26 ` Mel Gorman
2006-11-01 20:34 ` Andrew Morton
2006-11-01 21:00 ` Christoph Lameter
2006-11-01 21:46 ` Andrew Morton
2006-11-01 21:50 ` Christoph Lameter
2006-11-01 22:13 ` Mel Gorman
2006-11-01 23:29 ` Christoph Lameter
2006-11-02 0:22 ` Andrew Morton
2006-11-02 0:27 ` Christoph Lameter
2006-11-02 12:45 ` Mel Gorman
2006-11-01 22:10 ` Mel Gorman
2006-11-02 17:37 ` Andy Whitcroft
2006-11-02 18:08 ` Christoph Lameter
2006-11-02 20:58 ` Mel Gorman
2006-11-02 21:04 ` Christoph Lameter
2006-11-02 21:16 ` Mel Gorman
2006-11-02 21:52 ` Christoph Lameter
2006-11-02 22:37 ` Mel Gorman
2006-11-02 22:50 ` Christoph Lameter
2006-11-03 9:14 ` Mel Gorman
2006-11-03 13:17 ` Andy Whitcroft
2006-11-03 18:11 ` Christoph Lameter
2006-11-03 19:06 ` Mel Gorman
2006-11-03 19:44 ` Christoph Lameter
2006-11-03 21:11 ` Mel Gorman
2006-11-03 21:42 ` Christoph Lameter
2006-11-03 21:50 ` Andrew Morton
2006-11-03 21:53 ` Christoph Lameter
2006-11-03 22:12 ` Andrew Morton
2006-11-03 22:15 ` Christoph Lameter
2006-11-03 22:19 ` Andi Kleen
2006-11-04 0:37 ` Christoph Lameter
2006-11-04 1:32 ` Andi Kleen
2006-11-06 16:40 ` Christoph Lameter
2006-11-06 16:56 ` Andi Kleen
2006-11-06 17:00 ` Christoph Lameter
2006-11-06 17:07 ` Andi Kleen
2006-11-06 17:12 ` Hugh Dickins
2006-11-06 17:15 ` Christoph Lameter
2006-11-06 17:20 ` Andi Kleen
2006-11-06 17:26 ` Christoph Lameter
2006-11-07 16:30 ` Mel Gorman
2006-11-07 17:54 ` Christoph Lameter
2006-11-07 18:14 ` Mel Gorman
2006-11-08 0:29 ` KAMEZAWA Hiroyuki
2006-11-08 2:08 ` Christoph Lameter
2006-11-13 21:08 ` Mel Gorman
2006-11-03 12:48 ` Peter Zijlstra
2006-11-03 18:15 ` Christoph Lameter
2006-11-03 18:53 ` Peter Zijlstra
2006-11-03 19:23 ` Christoph Lameter
2006-11-02 18:52 ` Andrew Morton
2006-11-02 21:51 ` Mel Gorman
2006-11-02 22:03 ` Andy Whitcroft
2006-11-02 22:11 ` Andrew Morton
2006-11-01 18:13 ` Mel Gorman
2006-11-01 17:39 ` Mel Gorman
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox