* [RFC][PATCH] syctl for selecting global zonelist[] order
@ 2007-04-25 3:19 KAMEZAWA Hiroyuki
2007-04-25 7:42 ` Andrew Morton
2007-04-25 19:17 ` Christoph Lameter
0 siblings, 2 replies; 7+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-25 3:19 UTC (permalink / raw)
To: LKML; +Cc: Linux-MM, GOTO
Make zonelist policy selectable from sysctl.
Assume 2 node NUMA, only node(0) has ZONE_DMA (ZONE_DMA32).
In this case, default (node0's) zonelist order is
Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)'s NORMAL.
This means Node(0)'s DMA is used before Node(1)'s NORMAL.
On some servers, an application makes large memory allocations.
These exhaust memory in the above order.
Then OOM_KILL sometimes occurs when a 32-bit device requires memory.
This patch adds sysctl for rebuilding zonelist after boot and doesn't change
default zonelist order.
command:
%echo 0 > /proc/sys/vm/better_locality
This rebuilds the zonelists in the following order:
Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.
If better_locality == 1 (the default), the zonelist order is
Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)'s NORMAL.
May be useful for some users with heavy memory pressure and mlocks.
Tested under ia64 2 node NUMA against 2.6.21-rc7.. works well.
Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Index: linux-2.6.21-rc7/kernel/sysctl.c
===================================================================
--- linux-2.6.21-rc7.orig/kernel/sysctl.c
+++ linux-2.6.21-rc7/kernel/sysctl.c
@@ -76,6 +76,9 @@ extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
+#ifdef CONFIG_NUMA
+extern int sysctl_better_locality;
+#endif
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -845,6 +848,15 @@ static ctl_table vm_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .ctl_name = VM_BETTER_LOCALITY,
+ .procname = "better_locality",
+ .data = &sysctl_better_locality,
+ .maxlen = sizeof(sysctl_better_locality),
+ .mode = 0644,
+ .proc_handler = &sysctl_better_locality_handler,
+ .strategy = &sysctl_intvec,
+ },
#endif
#if defined(CONFIG_X86_32) || \
(defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
Index: linux-2.6.21-rc7/mm/page_alloc.c
===================================================================
--- linux-2.6.21-rc7.orig/mm/page_alloc.c
+++ linux-2.6.21-rc7/mm/page_alloc.c
@@ -1670,7 +1670,7 @@ static int __meminit build_zonelists_nod
#ifdef CONFIG_NUMA
#define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
/**
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
@@ -1685,7 +1685,7 @@ static int __meminitdata node_load[MAX_N
* on them otherwise.
* It returns -1 if no node is found.
*/
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
int n, val;
int min_val = INT_MAX;
@@ -1731,7 +1731,10 @@ static int __meminit find_next_best_node
return best_node;
}
-static void __meminit build_zonelists(pg_data_t *pgdat)
+/*
+ * Build zonelists based on node locality.
+ */
+static void build_zonelists_locality_aware(pg_data_t *pgdat)
{
int j, node, local_node;
enum zone_type i;
@@ -1780,6 +1783,78 @@ static void __meminit build_zonelists(pg
}
}
+/*
+ * Build zonelist based on zone priority.
+ */
+static int node_order[MAX_NUMNODES];
+static void build_zonelists_zone_aware(pg_data_t *pgdat)
+{
+ int i, j, pos, zone_type, node, load;
+ nodemask_t used_mask;
+ int local_node, prev_node;
+ struct zone *z;
+ struct zonelist *zonelist;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ zonelist->zones[0] = NULL;
+ }
+ memset(node_order, 0, sizeof(node_order));
+ local_node = pgdat->node_id;
+ load = num_online_nodes();
+ prev_node = local_node;
+ nodes_clear(used_mask);
+ j = 0;
+ while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ int distance = node_distance(local_node, node);
+ if (distance > RECLAIM_DISTANCE)
+ zone_reclaim_mode = 1;
+ if (distance != node_distance(local_node, prev_node))
+ node_load[node] = load;
+ node_order[j++] = node;
+ prev_node = node;
+ load--;
+ }
+ /* calculate node order */
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ zonelist = pgdat->node_zonelists + i;
+ pos = 0;
+ for (zone_type = i; zone_type >= 0; zone_type--) {
+ for (j = 0; j < num_online_nodes(); j++) {
+ node = node_order[j];
+ z = &NODE_DATA(node)->node_zones[zone_type];
+ if (populated_zone(z))
+ zonelist->zones[pos++] = z;
+ }
+ }
+ zonelist->zones[pos] = NULL;
+ }
+}
+
+int sysctl_better_locality = 1;
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+ if (sysctl_better_locality) {
+ build_zonelists_locality_aware(pgdat);
+ } else {
+ build_zonelists_zone_aware(pgdat);
+ }
+}
+
+int sysctl_better_locality_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length,
+ loff_t *ppos)
+{
+ int oldval = sysctl_better_locality;
+ proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ if (write) {
+ if (oldval != sysctl_better_locality)
+ build_all_zonelists();
+ }
+ return 0;
+}
+
/* Construct the zonelist performance cache - see further mmzone.h */
static void __meminit build_zonelist_cache(pg_data_t *pgdat)
{
@@ -1847,7 +1922,7 @@ static void __meminit build_zonelist_cac
#endif /* CONFIG_NUMA */
/* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
{
int nid;
@@ -1858,12 +1933,13 @@ static int __meminit __build_all_zonelis
return 0;
}
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
{
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
cpuset_init_current_mems_allowed();
} else {
+ memset(node_load, 0, sizeof(node_load));
/* we have to stop all cpus to guaranntee there is no user
of zonelist */
stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
Index: linux-2.6.21-rc7/include/linux/mmzone.h
===================================================================
--- linux-2.6.21-rc7.orig/include/linux/mmzone.h
+++ linux-2.6.21-rc7/include/linux/mmzone.h
@@ -563,6 +563,9 @@ int sysctl_min_unmapped_ratio_sysctl_han
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
struct file *, void __user *, size_t *, loff_t *);
+extern int sysctl_better_locality_handler(struct ctl_table *, int,
+ struct file *, void __user *, size_t *, loff_t *);
+
#include <linux/topology.h>
/* Returns the number of the current Node. */
#ifndef numa_node_id
Index: linux-2.6.21-rc7/include/linux/sysctl.h
===================================================================
--- linux-2.6.21-rc7.orig/include/linux/sysctl.h
+++ linux-2.6.21-rc7/include/linux/sysctl.h
@@ -207,6 +207,7 @@ enum
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
+ VM_BETTER_LOCALITY=36, /* create locality-preference zonelist */
/* s390 vm cmm sysctls */
VM_CMM_PAGES=1111,
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [RFC][PATCH] syctl for selecting global zonelist[] order
2007-04-25 3:19 [RFC][PATCH] syctl for selecting global zonelist[] order KAMEZAWA Hiroyuki
@ 2007-04-25 7:42 ` Andrew Morton
2007-04-25 7:55 ` KAMEZAWA Hiroyuki
2007-04-25 9:31 ` Andi Kleen
2007-04-25 19:17 ` Christoph Lameter
1 sibling, 2 replies; 7+ messages in thread
From: Andrew Morton @ 2007-04-25 7:42 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: LKML, Linux-MM, GOTO
On Wed, 25 Apr 2007 12:19:46 +0900 KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> Make zonelist policy selectable from sysctl.
>
> Assume 2 node NUMA, only node(0) has ZONE_DMA (ZONE_DMA32).
>
> In this case, default (node0's) zonelist order is
>
> Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)"s NORMAL.
>
> This means Node(0)'s DMA is used before Node(1)'s NORMAL.
>
> In some server, some application uses large memory allcation.
> This exhaust memory in the above order.
> Then....sometimes OOM_KILL will occur when 32bit device requires memory.
>
> This patch adds sysctl for rebuilding zonelist after boot and doesn't change
> default zonelist order.
hm. Why don't we use that ordering all the time? Does the present ordering have
any advantage?
> command:
> %echo 0 > /proc/sys/vm/better_locality
Who could resist having better locality? ;)
> Will rebuild zonelist in following order.
>
> Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.
>
> if set better_locality == 1 (default), zonelist is
> Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)'s NORMAL.
>
> Maybe useful in some users with heavy memory pressure and mlocks.
>
> ...
>
> extern int percpu_pagelist_fraction;
> extern int compat_log;
> +#ifdef CONFIG_NUMA
> +extern int sysctl_better_locality;
> +#endif
The ifdef isn't needed here. If something went wrong, we'll find out at
link-time.
> /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
> static int maxolduid = 65535;
> @@ -845,6 +848,15 @@ static ctl_table vm_table[] = {
> .extra1 = &zero,
> .extra2 = &one_hundred,
> },
> + {
> + .ctl_name = VM_BETTER_LOCALITY,
Please don't add new sysctls: use CTL_UNNUMBERED here.
> + .procname = "better_locality",
> + .data = &sysctl_better_locality,
> + .maxlen = sizeof(sysctl_better_locality),
> + .mode = 0644,
> + .proc_handler = &sysctl_better_locality_handler,
> + .strategy = &sysctl_intvec,
> + },
>
> ..
>
> +static void build_zonelists(pg_data_t *pgdat)
> +{
> + if (sysctl_better_locality) {
> + build_zonelists_locality_aware(pgdat);
> + } else {
> + build_zonelists_zone_aware(pgdat);
> + }
Remove all the braces please.
> @@ -207,6 +207,7 @@ enum
> VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
> VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
> VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
> + VM_BETTER_LOCALITY=36, /* create locality-preference zonelist */
This can go away.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [RFC][PATCH] syctl for selecting global zonelist[] order
2007-04-25 7:42 ` Andrew Morton
@ 2007-04-25 7:55 ` KAMEZAWA Hiroyuki
2007-04-25 9:31 ` Andi Kleen
1 sibling, 0 replies; 7+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-25 7:55 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, linux-mm, y-goto
On Wed, 25 Apr 2007 00:42:14 -0700
Andrew Morton <akpm@linux-foundation.org> wrote:
> On Wed, 25 Apr 2007 12:19:46 +0900 KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> > Make zonelist policy selectable from sysctl.
> >
> > Assume 2 node NUMA, only node(0) has ZONE_DMA (ZONE_DMA32).
> >
> > In this case, default (node0's) zonelist order is
> >
> > Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)"s NORMAL.
> >
> > This means Node(0)'s DMA is used before Node(1)'s NORMAL.
> >
> > In some server, some application uses large memory allcation.
> > This exhaust memory in the above order.
> > Then....sometimes OOM_KILL will occur when 32bit device requires memory.
> >
> > This patch adds sysctl for rebuilding zonelist after boot and doesn't change
> > default zonelist order.
>
> hm. Why don't we use that ordering all the time? Does the present ordering have
> any advantage?
>
I don't know ;) maybe some high-end NUMA hardware has IOMMU and
zoning by memory address has no meaning.
> > command:
> > %echo 0 > /proc/sys/vm/better_locality
>
> Who could resist having better locality? ;)
>
how about changing this name to strict_zone_order and
if strict_zone_order = 1
Node(0)'NORMAL -> Node(1)'Normal -> Node(0)'DMA
if strict_zone_order = 0
Node(0)'NORMAL -> Node(0)'DMA -> Node(1)'NORMAL
If someone can think of a better name, please let me know.
> > extern int percpu_pagelist_fraction;
> > extern int compat_log;
> > +#ifdef CONFIG_NUMA
> > +extern int sysctl_better_locality;
> > +#endif
>
> The ifdef isn't needed here. If something went wrong, we'll find out at
> link-time.
>
Okay.
> > /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
> > static int maxolduid = 65535;
> > @@ -845,6 +848,15 @@ static ctl_table vm_table[] = {
> > .extra1 = &zero,
> > .extra2 = &one_hundred,
> > },
> > + {
> > + .ctl_name = VM_BETTER_LOCALITY,
>
> Please don't add new sysctls: use CTL_UNNUMBERED here.
>
Oh, I didn't know about CTL_UNNUMBERED. looks useful. I'll try.
> > +static void build_zonelists(pg_data_t *pgdat)
> > +{
> > + if (sysctl_better_locality) {
> > + build_zonelists_locality_aware(pgdat);
> > + } else {
> > + build_zonelists_zone_aware(pgdat);
> > + }
>
> Remove all the braces please.
Okay.
>
> > @@ -207,6 +207,7 @@ enum
> > VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
> > VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
> > VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
> > + VM_BETTER_LOCALITY=36, /* create locality-preference zonelist */
>
> This can go away.
>
Okay.
I'll wait for other replies and post updated one tomorrow.
Thank you,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [RFC][PATCH] syctl for selecting global zonelist[] order
2007-04-25 7:42 ` Andrew Morton
2007-04-25 7:55 ` KAMEZAWA Hiroyuki
@ 2007-04-25 9:31 ` Andi Kleen
1 sibling, 0 replies; 7+ messages in thread
From: Andi Kleen @ 2007-04-25 9:31 UTC (permalink / raw)
To: Andrew Morton; +Cc: KAMEZAWA Hiroyuki, LKML, Linux-MM, GOTO
> hm. Why don't we use that ordering all the time? Does the present ordering have
> any advantage?
At least on x86-64 it would make sense to change this always
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC][PATCH] syctl for selecting global zonelist[] order
2007-04-25 3:19 [RFC][PATCH] syctl for selecting global zonelist[] order KAMEZAWA Hiroyuki
2007-04-25 7:42 ` Andrew Morton
@ 2007-04-25 19:17 ` Christoph Lameter
2007-04-26 0:31 ` KAMEZAWA Hiroyuki
1 sibling, 1 reply; 7+ messages in thread
From: Christoph Lameter @ 2007-04-25 19:17 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: LKML, Linux-MM, GOTO
On Wed, 25 Apr 2007, KAMEZAWA Hiroyuki wrote:
> Make zonelist policy selectable from sysctl.
>
> Assume 2 node NUMA, only node(0) has ZONE_DMA (ZONE_DMA32).
>
> In this case, default (node0's) zonelist order is
>
> Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)"s NORMAL.
>
> This means Node(0)'s DMA is used before Node(1)'s NORMAL.
So a IA64 platform with i386 sicknesses? And pretty bad case of it since I
assume that the memory sizes per node are equal. Your solution of taking
4G off node 0 and then going to node 1 first must hurt some
processes running on node 0. But there is no easy solution since
the hardware is badly screwed up with 32 bit I/O. Whatever you do the
memory balance between the two nodes is making the system behave in
an unsymmetric way.
> In some server, some application uses large memory allcation.
> This exhaust memory in the above order.
Could we add a boot time option instead that changes the zonelist build
behavior? Maybe an arch hook that can deal with it?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC][PATCH] syctl for selecting global zonelist[] order
2007-04-25 19:17 ` Christoph Lameter
@ 2007-04-26 0:31 ` KAMEZAWA Hiroyuki
2007-04-26 0:40 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 7+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-26 0:31 UTC (permalink / raw)
To: Christoph Lameter; +Cc: linux-kernel, linux-mm, y-goto
On Wed, 25 Apr 2007 12:17:15 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> On Wed, 25 Apr 2007, KAMEZAWA Hiroyuki wrote:
>
> > Make zonelist policy selectable from sysctl.
> >
> > Assume 2 node NUMA, only node(0) has ZONE_DMA (ZONE_DMA32).
> >
> > In this case, default (node0's) zonelist order is
> >
> > Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)"s NORMAL.
> >
> > This means Node(0)'s DMA is used before Node(1)'s NORMAL.
>
> So a IA64 platform with i386 sicknesses? And pretty bad case of it since I
> assume that the memory sizes per node are equal. Your solution of taking
> 4G off node 0 and then going to node 1 first must hurt some
> processes running on node 0.
I think so, too. It is because I made this as selectable option.
> Whatever you do the memory balance between the two nodes is making
> the system behave in an unsymmetric way.
> > In some server, some application uses large memory allcation.
> > This exhaust memory in the above order.
>
> Could we add a boot time option instead that changes the zonelist build
> behavior? Maybe an arch hook that can deal with it?
>
Yes, it's in my plan. I'll add boot option support.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [RFC][PATCH] syctl for selecting global zonelist[] order
2007-04-26 0:31 ` KAMEZAWA Hiroyuki
@ 2007-04-26 0:40 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 7+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-04-26 0:40 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: clameter, linux-kernel, linux-mm, y-goto
On Thu, 26 Apr 2007 09:31:12 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> >
> > So a IA64 platform with i386 sicknesses? And pretty bad case of it since I
> > assume that the memory sizes per node are equal. Your solution of taking
> > 4G off node 0 and then going to node 1 first must hurt some
> > processes running on node 0.
> I think so, too. It is because I made this as selectable option.
^^^^^^^^^
why...
sorry.
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2007-04-26 0:40 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-04-25 3:19 [RFC][PATCH] syctl for selecting global zonelist[] order KAMEZAWA Hiroyuki
2007-04-25 7:42 ` Andrew Morton
2007-04-25 7:55 ` KAMEZAWA Hiroyuki
2007-04-25 9:31 ` Andi Kleen
2007-04-25 19:17 ` Christoph Lameter
2007-04-26 0:31 ` KAMEZAWA Hiroyuki
2007-04-26 0:40 ` KAMEZAWA Hiroyuki
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox