[PATCH] Add populated_map to account for memoryless nodes

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] Add populated_map to account for memoryless nodes
@ 2007-06-11 20:27 Nishanth Aravamudan, Lee Schermerhorn
  2007-06-11 21:25 ` Christoph Lameter
  2007-06-12  2:27 ` KAMEZAWA Hiroyuki
  0 siblings, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan, Lee Schermerhorn @ 2007-06-11 20:27 UTC (permalink / raw)
  To: clameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

Split up Lee and Anton's original patch
(http://marc.info/?l=linux-mm&m=118133042025995&w=2), to allow for the
populated_map changes to go in on their own.

Add a populated_map nodemask to indicate whether a node has memory. We
have run into a number of issues (in practice and in code) with
assumptions about every node having memory. Having this nodemask allows
us to fix these issues; in particular, THISNODE allocations will come
from the node specified, only, and the INTERLEAVE policy will be able to
do the right thing with memoryless nodes.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

---

Andrew, sorry, typo'd your address in the previous series. Will be
sending them out again, anyways...

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 52c54a5..751d3d7 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -64,12 +64,16 @@
  *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
+ * int node_populated(node)		Is some node populated [at policy_zone]
  *
  * int any_online_node(mask)		First online node in mask
  *
  * node_set_online(node)		set bit 'node' in node_online_map
  * node_set_offline(node)		clear bit 'node' in node_online_map
  *
+ * node_set_populated(node)		set bit 'node' in node_populated_map
+ * node_not_populated(node)		clear bit 'node' in node_populated_map
+ *
  * for_each_node(node)			for-loop node over node_possible_map
  * for_each_online_node(node)		for-loop node over node_online_map
  *
@@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
 
 extern nodemask_t node_online_map;
 extern nodemask_t node_possible_map;
+extern nodemask_t node_populated_map;
 
 #if MAX_NUMNODES > 1
 #define num_online_nodes()	nodes_weight(node_online_map)
 #define num_possible_nodes()	nodes_weight(node_possible_map)
 #define node_online(node)	node_isset((node), node_online_map)
 #define node_possible(node)	node_isset((node), node_possible_map)
+#define node_populated(node)	node_isset((node), node_populated_map)
 #define first_online_node	first_node(node_online_map)
 #define next_online_node(nid)	next_node((nid), node_online_map)
 extern int nr_node_ids;
@@ -358,6 +364,7 @@ extern int nr_node_ids;
 #define num_possible_nodes()	1
 #define node_online(node)	((node) == 0)
 #define node_possible(node)	((node) == 0)
+#define node_populated(node)	((node) == 0)
 #define first_online_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
 #define nr_node_ids		1
@@ -375,6 +382,9 @@ extern int nr_node_ids;
 #define node_set_online(node)	   set_bit((node), node_online_map.bits)
 #define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
 
+#define node_set_populated(node)   set_bit((node), node_populated_map.bits)
+#define node_not_populated(node)   clear_bit((node), node_populated_map.bits)
+
 #define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
 #define for_each_online_node(node) for_each_node_mask((node), node_online_map)
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07cd5ae..8eea4ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_populated_map __read_mostly = NODE_MASK_NONE;
+EXPORT_SYMBOL(node_populated_map);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
@@ -2161,7 +2164,7 @@ static int node_order[MAX_NUMNODES];
 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
 {
 	enum zone_type i;
-	int pos, j, node;
+	int pos, j;
 	int zone_type;		/* needs to be signed */
 	struct zone *z;
 	struct zonelist *zonelist;
@@ -2171,7 +2174,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
 		pos = 0;
 		for (zone_type = i; zone_type >= 0; zone_type--) {
 			for (j = 0; j < nr_nodes; j++) {
-				node = node_order[j];
+				int node = node_order[j];
 				z = &NODE_DATA(node)->node_zones[zone_type];
 				if (populated_zone(z)) {
 					zonelist->zones[pos++] = z;
@@ -2244,6 +2247,22 @@ static void set_zonelist_order(void)
 		current_zonelist_order = user_zonelist_order;
 }
 
+/*
+ * setup_populated_map() - record nodes whose "policy_zone" is "on-node".
+ */
+static void setup_populated_map(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	struct zonelist *zl = pgdat->node_zonelists + policy_zone;
+	struct zone *z = zl->zones[0];
+
+	VM_BUG_ON(!z);
+	if (z->zone_pgdat == pgdat)
+		node_set_populated(nid);
+	else
+		node_not_populated(nid);
+}
+
 static void build_zonelists(pg_data_t *pgdat)
 {
 	int j, node, load;
@@ -2327,6 +2346,15 @@ static void set_zonelist_order(void)
 	current_zonelist_order = ZONELIST_ORDER_ZONE;
 }
 
+/*
+ * setup_populated_map - non-NUMA case
+ * Only node 0 should be on-line, and it MUST be populated!
+ */
+static void setup_populated_map(int nid)
+{
+	node_set_populated(nid);
+}
+
 static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
@@ -2381,6 +2409,7 @@ static int __build_all_zonelists(void *dummy)
 	for_each_online_node(nid) {
 		build_zonelists(NODE_DATA(nid));
 		build_zonelist_cache(NODE_DATA(nid));
+		setup_populated_map(nid);
 	}
 	return 0;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-11 20:27 [PATCH] Add populated_map to account for memoryless nodes Nishanth Aravamudan, Lee Schermerhorn
@ 2007-06-11 21:25 ` Christoph Lameter
  2007-06-11 22:10   ` [PATCH v2] " Nishanth Aravamudan
  2007-06-12 14:10   ` [PATCH] " Lee Schermerhorn
  2007-06-12  2:27 ` KAMEZAWA Hiroyuki
  1 sibling, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 21:25 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> @@ -2161,7 +2164,7 @@ static int node_order[MAX_NUMNODES];
>  static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
>  {
>  	enum zone_type i;
> -	int pos, j, node;
> +	int pos, j;
>  	int zone_type;		/* needs to be signed */
>  	struct zone *z;
>  	struct zonelist *zonelist;
> @@ -2171,7 +2174,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
>  		pos = 0;
>  		for (zone_type = i; zone_type >= 0; zone_type--) {
>  			for (j = 0; j < nr_nodes; j++) {
> -				node = node_order[j];
> +				int node = node_order[j];
>  				z = &NODE_DATA(node)->node_zones[zone_type];
>  				if (populated_zone(z)) {
>  					zonelist->zones[pos++] = z;

Unrelated modifications.

> @@ -2244,6 +2247,22 @@ static void set_zonelist_order(void)
>  		current_zonelist_order = user_zonelist_order;
>  }
>  
> +/*
> + * setup_populate_map() - record nodes whose "policy_zone" is "on-node".
> + */
> +static void setup_populated_map(int nid)
> +{
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +	struct zonelist *zl = pgdat->node_zonelists + policy_zone;
> +	struct zone *z = zl->zones[0];
> +
> +	VM_BUG_ON(!z);
> +	if (z->zone_pgdat == pgdat)
> +		node_set_populated(nid);
> +	else
> +		node_not_populated(nid);
> +}


A node is only populated if it has memory in the policy zone? I would say 
a node is populated if it has any memory in any zone.

The above check may fail on x86_64 where only some nodes may have 
ZONE_NORMAL. Others only have ZONE_DMA32. Policy zone will be set to 
ZONE_NORMAL.


> +
>  static void build_zonelists(pg_data_t *pgdat)
>  {
>  	int j, node, load;
> @@ -2327,6 +2346,15 @@ static void set_zonelist_order(void)
>  	current_zonelist_order = ZONELIST_ORDER_ZONE;
>  }
>  
> +/*
> + * setup_populated_map - non-NUMA case
> + * Only node 0 should be on-line, and it MUST be populated!
> + */
> +static void setup_populated_map(int nid)
> +{
> +	node_set_populated(nid);
> +}

I'd say provide fallback functions so that node_populated() always returns 
true for !NUMA. That way it can be optimized out at compile time.

>  static void build_zonelists(pg_data_t *pgdat)
>  {
>  	int node, local_node;
> @@ -2381,6 +2409,7 @@ static int __build_all_zonelists(void *dummy)
>  	for_each_online_node(nid) {
>  		build_zonelists(NODE_DATA(nid));
>  		build_zonelist_cache(NODE_DATA(nid));
> +		setup_populated_map(nid);
>  	}

Is it possible to move the set_populated_node into build_zonelists 
somehow?

F.e. In build_zonelists_node you can check if nr_zones > 0 and then set it 
up?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-11 21:25 ` Christoph Lameter
@ 2007-06-11 22:10   ` Nishanth Aravamudan
  2007-06-11 22:42     ` Christoph Lameter
  2007-06-12 14:10   ` [PATCH] " Lee Schermerhorn
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-11 22:10 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [14:25:38 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > @@ -2161,7 +2164,7 @@ static int node_order[MAX_NUMNODES];
> >  static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
> >  {
> >  	enum zone_type i;
> > -	int pos, j, node;
> > +	int pos, j;
> >  	int zone_type;		/* needs to be signed */
> >  	struct zone *z;
> >  	struct zonelist *zonelist;
> > @@ -2171,7 +2174,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
> >  		pos = 0;
> >  		for (zone_type = i; zone_type >= 0; zone_type--) {
> >  			for (j = 0; j < nr_nodes; j++) {
> > -				node = node_order[j];
> > +				int node = node_order[j];
> >  				z = &NODE_DATA(node)->node_zones[zone_type];
> >  				if (populated_zone(z)) {
> >  					zonelist->zones[pos++] = z;
> 
> Unrelated modifications.

Agreed -- sorry, I have just been refreshing/testing Lee and Anton's
original fixes.

> > @@ -2244,6 +2247,22 @@ static void set_zonelist_order(void)
> >  		current_zonelist_order = user_zonelist_order;
> >  }
> >  
> > +/*
> > + * setup_populate_map() - record nodes whose "policy_zone" is "on-node".
> > + */
> > +static void setup_populated_map(int nid)
> > +{
> > +	pg_data_t *pgdat = NODE_DATA(nid);
> > +	struct zonelist *zl = pgdat->node_zonelists + policy_zone;
> > +	struct zone *z = zl->zones[0];
> > +
> > +	VM_BUG_ON(!z);
> > +	if (z->zone_pgdat == pgdat)
> > +		node_set_populated(nid);
> > +	else
> > +		node_not_populated(nid);
> > +}
> 
> 
> A node is only populated if it has memory in the policy zone? I would
> say a node is populated if it has any memory in any zone.
> 
> The above check may fail on x86_64 where only some nodes may have 
> ZONE_NORMAL. Others only have ZONE_DMA32. Policy zone will be set to 
> ZONE_NORMAL.

I agree here as well, updated below.

> >  static void build_zonelists(pg_data_t *pgdat)
> >  {
> >  	int j, node, load;
> > @@ -2327,6 +2346,15 @@ static void set_zonelist_order(void)
> >  	current_zonelist_order = ZONELIST_ORDER_ZONE;
> >  }
> >  
> > +/*
> > + * setup_populated_map - non-NUMA case
> > + * Only node 0 should be on-line, and it MUST be populated!
> > + */
> > +static void setup_populated_map(int nid)
> > +{
> > +	node_set_populated(nid);
> > +}
> 
> I'd say provide fallback functions so that node_populated() always
> returns true for !NUMA. That way it can be optimized out at compile
> time.

Already done in the original patch (node_populated() returns (node == 0)
if MAX_NUMNODES <= 1), I think.

> >  static void build_zonelists(pg_data_t *pgdat)
> >  {
> >  	int node, local_node;
> > @@ -2381,6 +2409,7 @@ static int __build_all_zonelists(void *dummy)
> >  	for_each_online_node(nid) {
> >  		build_zonelists(NODE_DATA(nid));
> >  		build_zonelist_cache(NODE_DATA(nid));
> > +		setup_populated_map(nid);
> >  	}
> 
> Is it possible to move the set_populated_node into build_zonelists 
> somehow?
> 
> F.e. In build_zonelists_node you can check if nr_zones > 0 and then
> set it up?

I've tried to do this as well, please see below.

Split up Lee and Anton's original patch
(http://marc.info/?l=linux-mm&m=118133042025995&w=2), to allow for the
populated_map changes to go in on their own.

Add a populated_map nodemask to indicate whether a node has memory.  We
have run into a number of issues (in practice and in code) with
assumptions about every node having memory. Having this nodemask allows
us to fix these issues; in particular, THISNODE allocations will come
from the node specified, only, and the INTERLEAVE policy will be able to
do the right thing with memoryless nodes.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 52c54a5..751d3d7 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -64,12 +64,16 @@
  *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
+ * int node_populated(node)		Is some node populated [at policy_zone]
  *
  * int any_online_node(mask)		First online node in mask
  *
  * node_set_online(node)		set bit 'node' in node_online_map
  * node_set_offline(node)		clear bit 'node' in node_online_map
  *
+ * node_set_populated(node)		set bit 'node' in node_populated_map
+ * node_not_populated(node)		clear bit 'node' in node_populated_map
+ *
  * for_each_node(node)			for-loop node over node_possible_map
  * for_each_online_node(node)		for-loop node over node_online_map
  *
@@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
 
 extern nodemask_t node_online_map;
 extern nodemask_t node_possible_map;
+extern nodemask_t node_populated_map;
 
 #if MAX_NUMNODES > 1
 #define num_online_nodes()	nodes_weight(node_online_map)
 #define num_possible_nodes()	nodes_weight(node_possible_map)
 #define node_online(node)	node_isset((node), node_online_map)
 #define node_possible(node)	node_isset((node), node_possible_map)
+#define node_populated(node)	node_isset((node), node_populated_map)
 #define first_online_node	first_node(node_online_map)
 #define next_online_node(nid)	next_node((nid), node_online_map)
 extern int nr_node_ids;
@@ -358,6 +364,7 @@ extern int nr_node_ids;
 #define num_possible_nodes()	1
 #define node_online(node)	((node) == 0)
 #define node_possible(node)	((node) == 0)
+#define node_populated(node)	((node) == 0)
 #define first_online_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
 #define nr_node_ids		1
@@ -375,6 +382,9 @@ extern int nr_node_ids;
 #define node_set_online(node)	   set_bit((node), node_online_map.bits)
 #define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
 
+#define node_set_populated(node)   set_bit((node), node_populated_map.bits)
+#define node_not_populated(node)   clear_bit((node), node_populated_map.bits)
+
 #define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
 #define for_each_online_node(node) for_each_node_mask((node), node_online_map)
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07cd5ae..1d20f8f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_populated_map __read_mostly = NODE_MASK_NONE;
+EXPORT_SYMBOL(node_populated_map);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
@@ -2251,6 +2254,7 @@ static void build_zonelists(pg_data_t *pgdat)
 	nodemask_t used_mask;
 	int local_node, prev_node;
 	struct zonelist *zonelist;
+	struct zone *z;
 	int order = current_zonelist_order;
 
 	/* initialize zonelists */
@@ -2299,6 +2303,18 @@ static void build_zonelists(pg_data_t *pgdat)
 		/* calculate node order -- i.e., DMA last! */
 		build_zonelists_in_zone_order(pgdat, j);
 	}
+
+	/*
+	 * record nodes whose first fallback zone is "on-node" as
+	 * populated
+	 */
+	z = pgdat->node_zonelists->zones[0];
+
+	VM_BUG_ON(!z);
+	if (z->zone_pgdat == pgdat)
+		node_set_populated(local_node);
+	else
+		node_not_populated(local_node);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-11 22:10   ` [PATCH v2] " Nishanth Aravamudan
@ 2007-06-11 22:42     ` Christoph Lameter
  2007-06-11 22:52       ` [PATCH v3] " Nishanth Aravamudan
  2007-06-12 14:19       ` [PATCH v2] Add populated_map to account for " Lee Schermerhorn
  0 siblings, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 22:42 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Already done in the original patch (node_populated() returns (node == 0)
> if MAX_NUMNODES <= 1), I think.

Ah good.

> @@ -2299,6 +2303,18 @@ static void build_zonelists(pg_data_t *pgdat)
>  		/* calculate node order -- i.e., DMA last! */
>  		build_zonelists_in_zone_order(pgdat, j);
>  	}
> +
> +	/*
> +	 * record nodes whose first fallback zone is "on-node" as
> +	 * populated
> +	 */
> +	z = pgdat->node_zonelists->zones[0];
> +
> +	VM_BUG_ON(!z);
> +	if (z->zone_pgdat == pgdat)
> +		node_set_populated(local_node);
> +	else
> +		node_not_populated(local_node);
>  }
>  
>  /* Construct the zonelist performance cache - see further mmzone.h */
> 

Could be much simpler:

if (pgdat->node_present_pages)
	node_set_populated(local_node);


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v3] Add populated_map to account for memoryless nodes
  2007-06-11 22:42     ` Christoph Lameter
@ 2007-06-11 22:52       ` Nishanth Aravamudan
  2007-06-11 23:00         ` Christoph Lameter
  2007-06-11 23:08         ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Nishanth Aravamudan
  2007-06-12 14:19       ` [PATCH v2] Add populated_map to account for " Lee Schermerhorn
  1 sibling, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-11 22:52 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [15:42:37 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Already done in the original patch (node_populated() returns (node == 0)
> > if MAX_NUMNODES <= 1), I think.
> 
> Ah good.
> 
> > @@ -2299,6 +2303,18 @@ static void build_zonelists(pg_data_t *pgdat)
> >  		/* calculate node order -- i.e., DMA last! */
> >  		build_zonelists_in_zone_order(pgdat, j);
> >  	}
> > +
> > +	/*
> > +	 * record nodes whose first fallback zone is "on-node" as
> > +	 * populated
> > +	 */
> > +	z = pgdat->node_zonelists->zones[0];
> > +
> > +	VM_BUG_ON(!z);
> > +	if (z->zone_pgdat == pgdat)
> > +		node_set_populated(local_node);
> > +	else
> > +		node_not_populated(local_node);
> >  }
> >  
> >  /* Construct the zonelist performance cache - see further mmzone.h */
> > 
> 
> Could be much simpler:
> 
> if (pgdat->node_present_pages)
> 	node_set_populated(local_node);

Err, duh -- I was thinking of making this change, but then forgot.

Thanks for the reviews, Christoph!

Split up Lee and Anton's original patch
(http://marc.info/?l=linux-mm&m=118133042025995&w=2), to allow for the
populated_map changes to go in on their own.

Add a populated_map nodemask to indicate whether a node has memory. We
have run into a number of issues (in practice and in code) with
assumptions about every node having memory. Having this nodemask allows
us to fix these issues; in particular, THISNODE allocations will come
from the node specified, only, and the INTERLEAVE policy will be able to
do the right thing with memoryless nodes.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 52c54a5..c00a249 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -64,12 +64,16 @@
  *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
+ * int node_populated(node)		Is some node populated?
  *
  * int any_online_node(mask)		First online node in mask
  *
  * node_set_online(node)		set bit 'node' in node_online_map
  * node_set_offline(node)		clear bit 'node' in node_online_map
  *
+ * node_set_populated(node)		set bit 'node' in node_populated_map
+ * node_not_populated(node)		clear bit 'node' in node_populated_map
+ *
  * for_each_node(node)			for-loop node over node_possible_map
  * for_each_online_node(node)		for-loop node over node_online_map
  *
@@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
 
 extern nodemask_t node_online_map;
 extern nodemask_t node_possible_map;
+extern nodemask_t node_populated_map;
 
 #if MAX_NUMNODES > 1
 #define num_online_nodes()	nodes_weight(node_online_map)
 #define num_possible_nodes()	nodes_weight(node_possible_map)
 #define node_online(node)	node_isset((node), node_online_map)
 #define node_possible(node)	node_isset((node), node_possible_map)
+#define node_populated(node)	node_isset((node), node_populated_map)
 #define first_online_node	first_node(node_online_map)
 #define next_online_node(nid)	next_node((nid), node_online_map)
 extern int nr_node_ids;
@@ -358,6 +364,7 @@ extern int nr_node_ids;
 #define num_possible_nodes()	1
 #define node_online(node)	((node) == 0)
 #define node_possible(node)	((node) == 0)
+#define node_populated(node)	((node) == 0)
 #define first_online_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
 #define nr_node_ids		1
@@ -375,6 +382,9 @@ extern int nr_node_ids;
 #define node_set_online(node)	   set_bit((node), node_online_map.bits)
 #define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
 
+#define node_set_populated(node)   set_bit((node), node_populated_map.bits)
+#define node_not_populated(node)   clear_bit((node), node_populated_map.bits)
+
 #define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
 #define for_each_online_node(node) for_each_node_mask((node), node_online_map)
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07cd5ae..456f2f6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_populated_map __read_mostly = NODE_MASK_NONE;
+EXPORT_SYMBOL(node_populated_map);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
@@ -2299,6 +2302,13 @@ static void build_zonelists(pg_data_t *pgdat)
 		/* calculate node order -- i.e., DMA last! */
 		build_zonelists_in_zone_order(pgdat, j);
 	}
+
+	/*
+	 * record populated nodes for use when INTERLEAVE'ing or using
+	 * GFP_THISNODE
+	 */
+	if (pgdat->node_present_pages)
+		node_set_populated(local_node);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v3] Add populated_map to account for memoryless nodes
  2007-06-11 22:52       ` [PATCH v3] " Nishanth Aravamudan
@ 2007-06-11 23:00         ` Christoph Lameter
  2007-06-11 23:41           ` [PATCH v4] " Nishanth Aravamudan
  2007-06-11 23:08         ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 23:00 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Err, duh -- I was thinking of making this change, but then forgot.

Now it's fine.

Andrew please apply.

Acked-by: Christoph Lameter <clameter@sgi.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-11 22:52       ` [PATCH v3] " Nishanth Aravamudan
  2007-06-11 23:00         ` Christoph Lameter
@ 2007-06-11 23:08         ` Nishanth Aravamudan
  2007-06-11 23:10           ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
  2007-06-11 23:15           ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-11 23:08 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

Applies to 2.6.22-rc4-mm2 with
add-populated_map-to-account-for-memoryless-nodes
applied.

Based on ideas from Christoph Lameter, add checks in the INTERLEAVE
paths for memoryless nodes. We do not want to try interleaving onto
those nodes.

Christoph said:
"This does not work for the address based interleaving for anonymous
vmas.  I am not sure what to do there. We could change the calculation
of the node to be based only on nodes with memory and then skip the
memoryless ones. I have only added a comment to describe its brokenness
for now."

I have copied his draft comment.

Change alloc_pages_node() to fail __GFP_THISNODE allocations if the node
is not populated.

Again, Christoph said:
"This will fix the alloc_pages_node case but not the alloc_pages() case.
In the alloc_pages() case we do not specify a node. Implicitly it is
understood that we (in the case of no memory policy / cpuset options)
allocate from the nearest node. So it may be argued there that the
GFP_THISNODE behavior of taking the first node from the zonelist is
okay."

Christoph was also worried about the performance impact on these paths,
so, as he suggested, uninline alloc_pages_node() and move it to
mempolicy.c.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 49dcc2f..c83e56a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -165,19 +165,7 @@ static inline void arch_alloc_page(struct page *page, int order) { }
 extern struct page *
 FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
 
-static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
-						unsigned int order)
-{
-	if (unlikely(order >= MAX_ORDER))
-		return NULL;
-
-	/* Unknown node is current node */
-	if (nid < 0)
-		nid = numa_node_id();
-
-	return __alloc_pages(gfp_mask, order,
-		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
-}
+extern struct page *alloc_pages_node(int, gfp_t, unsigned int);
 
 #ifdef CONFIG_NUMA
 extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 144805c..4e7c48c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -174,6 +174,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
+	unsigned nid;
 
 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 	if (mode == MPOL_DEFAULT)
@@ -184,6 +185,16 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	atomic_set(&policy->refcnt, 1);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
+		/*
+		 * Clear any memoryless nodes here so that v.nodes can be used
+		 * without extra checks
+		 */
+		nid = first_node(*nodes);
+		while (nid < MAX_NUMNODES) {
+			if (!node_populated(nid))
+				node_clear(nid, *nodes);
+			nid = next_node(nid, *nodes);
+		}
 		policy->v.nodes = *nodes;
 		if (nodes_weight(*nodes) == 0) {
 			kmem_cache_free(policy_cache, policy);
@@ -578,6 +589,22 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }
 
+struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+{
+	if (unlikely(order >= MAX_ORDER))
+		return NULL;
+
+	/* Unknown node is current node */
+	if (nid < 0)
+		nid = numa_node_id();
+
+	if ((gfp_mask & __GFP_THISNODE) && !node_populated(nid))
+		return NULL;
+
+	return __alloc_pages(gfp_mask, order,
+		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
+}
+
 #ifdef CONFIG_MIGRATION
 /*
  * page migration
@@ -1126,9 +1153,11 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
-	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
+	do {
+		next = next_node(nid, policy->v.nodes);
+		if (next >= MAX_NUMNODES)
+			next = first_node(policy->v.nodes);
+	} while (!node_populated(next));
 	me->il_next = next;
 	return nid;
 }
@@ -1192,6 +1221,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		 * for huge pages, since vm_pgoff is in units of small
 		 * pages, we need to shift off the always 0 bits to get
 		 * a useful offset.
+		 *
+		 * NOTE: For configurations with memoryless nodes this
+		 * is broken since the allocation attempts on that node
+		 * will fall back to other nodes and thus one
+		 * neighboring node will be overallocated from.
 		 */
 		BUG_ON(shift < PAGE_SHIFT);
 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-11 23:08         ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Nishanth Aravamudan
@ 2007-06-11 23:10           ` Nishanth Aravamudan
  2007-06-11 23:11             ` [PATCH][RFC] hugetlb: numafy several functions Nishanth Aravamudan
  2007-06-11 23:17             ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Christoph Lameter
  2007-06-11 23:15           ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-11 23:10 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

Applies to 2.6.22-rc4-mm2 with
add-populated_map-to-account-for-memoryless-nodes
fix-interleave-with-memoryless-nodes
applied.

Split Lee and Anton's patch
(http://marc.info/?l=linux-mm&m=118133042025995&w=2) into two parts.

Only attempt to allocate huge pages on nodes that contain memory, as
specified by node_populated_map.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 858c0b3..97ae1a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -105,13 +105,22 @@ static void free_huge_page(struct page *page)
 
 static int alloc_fresh_huge_page(void)
 {
-	static int nid = 0;
+	static int nid = -1;
 	struct page *page;
-	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
-					HUGETLB_PAGE_ORDER);
-	nid = next_node(nid, node_online_map);
-	if (nid == MAX_NUMNODES)
-		nid = first_node(node_online_map);
+	int start_nid;
+
+	if (nid < 0)
+		nid = first_node(node_populated_map);
+	start_nid = nid;
+
+	do {
+		page = alloc_pages_node(nid,
+				GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
+				HUGETLB_PAGE_ORDER);
+		nid = next_node(nid, node_populated_map);
+		if (nid >= nr_node_ids)
+			nid = first_node(node_populated_map);
+	} while (!page && nid != start_nid);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH][RFC] hugetlb: numafy several functions
  2007-06-11 23:10           ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
@ 2007-06-11 23:11             ` Nishanth Aravamudan
  2007-06-11 23:13               ` [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
  2007-06-11 23:38               ` [PATCH][RFC] hugetlb: numafy several functions Christoph Lameter
  2007-06-11 23:17             ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-11 23:11 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

Applies to 2.6.22-rc4-mm2 with
add-populated_map-to-account-for-memoryless-nodes
fix-interleave-with-memoryless-nodes
fix-hugetlb-pool-allocation-with-empty-nodes
applied.

Add node-parameterized helpers for dequeue_huge_page,
alloc_fresh_huge_page and try_to_free_low. Also have
update_and_free_page() take a nid parameter. This is necessary to add a
per-node sysfs attribute to specify the number of hugepages on that
node.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97ae1a3..d1e1063 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,11 +66,22 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(int nid)
+{
+	struct page *page;
+
+	page = list_entry(hugepage_freelists[nid].next,
+					  struct page, lru);
+	list_del(&page->lru);
+	free_huge_pages--;
+	free_huge_pages_node[nid]--;
+	return page;
+}
+
 static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 				unsigned long address)
 {
 	int nid;
-	struct page *page = NULL;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
 						htlb_alloc_mask);
 	struct zone **z;
@@ -82,14 +93,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 			break;
 	}
 
-	if (*z) {
-		page = list_entry(hugepage_freelists[nid].next,
-				  struct page, lru);
-		list_del(&page->lru);
-		free_huge_pages--;
-		free_huge_pages_node[nid]--;
-	}
-	return page;
+	if (*z)
+		return dequeue_huge_page_node(nid);
+	return NULL;
 }
 
 static void free_huge_page(struct page *page)
@@ -103,6 +109,25 @@ static void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
+static struct page *alloc_fresh_huge_page_node(int nid)
+{
+	struct page *page;
+
+	page = alloc_pages_node(nid,
+			GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
+			HUGETLB_PAGE_ORDER);
+	if (page) {
+		set_compound_page_dtor(page, free_huge_page);
+		spin_lock(&hugetlb_lock);
+		nr_huge_pages++;
+		nr_huge_pages_node[nid]++;
+		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+	}
+
+	return page;
+}
+
 static int alloc_fresh_huge_page(void)
 {
 	static int nid = -1;
@@ -114,22 +139,14 @@ static int alloc_fresh_huge_page(void)
 	start_nid = nid;
 
 	do {
-		page = alloc_pages_node(nid,
-				GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
-				HUGETLB_PAGE_ORDER);
+		page = alloc_fresh_huge_page_node(nid);
 		nid = next_node(nid, node_populated_map);
 		if (nid >= nr_node_ids)
 			nid = first_node(node_populated_map);
 	} while (!page && nid != start_nid);
-	if (page) {
-		set_compound_page_dtor(page, free_huge_page);
-		spin_lock(&hugetlb_lock);
-		nr_huge_pages++;
-		nr_huge_pages_node[page_to_nid(page)]++;
-		spin_unlock(&hugetlb_lock);
-		put_page(page); /* free it into the hugepage allocator */
+
+	if (page)
 		return 1;
-	}
 	return 0;
 }
 
@@ -199,11 +216,11 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 }
 
 #ifdef CONFIG_SYSCTL
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(int nid, struct page *page)
 {
 	int i;
 	nr_huge_pages--;
-	nr_huge_pages_node[page_to_nid(page)]--;
+	nr_huge_pages_node[nid]--;
 	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -215,25 +232,37 @@ static void update_and_free_page(struct page *page)
 }
 
 #ifdef CONFIG_HIGHMEM
+static void try_to_free_low_node(int nid, unsigned long count)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next,
+				&hugepage_freelists[nid], lru) {
+		if (PageHighMem(page))
+			continue;
+		list_del(&page->lru);
+		update_and_free_page(nid, page);
+		free_huge_pages--;
+		free_huge_pages_node[nid]--;
+		if (count >= nr_huge_pages_node[nid])
+			return;
+	}
+}
+
 static void try_to_free_low(unsigned long count)
 {
 	int i;
 
 	for (i = 0; i < MAX_NUMNODES; ++i) {
-		struct page *page, *next;
-		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
-			if (PageHighMem(page))
-				continue;
-			list_del(&page->lru);
-			update_and_free_page(page);
-			free_huge_pages--;
-			free_huge_pages_node[page_to_nid(page)]--;
-			if (count >= nr_huge_pages)
-				return;
-		}
+		try_to_free_low_node(i, count);
+		if (count >= nr_huge_pages)
+			break;
 	}
 }
 #else
+static inline void try_to_free_low_node(int nid, unsigned long count)
+{
+}
 static inline void try_to_free_low(unsigned long count)
 {
 }
@@ -255,7 +284,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
-		update_and_free_page(page);
+		update_and_free_page(page_to_nid(page), page);
 	}
 	spin_unlock(&hugetlb_lock);
 	return nr_huge_pages;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-11 23:11             ` [PATCH][RFC] hugetlb: numafy several functions Nishanth Aravamudan
@ 2007-06-11 23:13               ` Nishanth Aravamudan
  2007-06-11 23:40                 ` Christoph Lameter
  2007-06-11 23:42                 ` Christoph Lameter
  2007-06-11 23:38               ` [PATCH][RFC] hugetlb: numafy several functions Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-11 23:13 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

Applies to 2.6.22-rc4-mm2 with
add-populated_map-to-account-for-memoryless-nodes
fix-interleave-with-memoryless-nodes
fix-hugetlb-pool-allocation-with-empty-nodes
hugetlb-numafy-several-functions
applied.

Allow specifying the number of hugepages to allocate on a particular
node. Our current global sysctl will try its best to put hugepages
equally on each node, but that may not always be desired. This allows
the admin to control the layout of hugepage allocation at a finer level
(while not breaking the existing interface).

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

---

I would have liked to have avoided the #ifdef's in node.c, but I
couldn't figure out a simple way to conditionalize the
create_file/remove_file calls.

diff --git a/drivers/base/node.c b/drivers/base/node.c
index cae346e..fc0b4a1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -131,6 +131,11 @@ static ssize_t node_read_distance(struct sys_device * dev, char * buf)
 }
 static SYSDEV_ATTR(distance, S_IRUGO, node_read_distance, NULL);
 
+#ifdef CONFIG_HUGETLB_PAGE
+static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
+				hugetlb_read_nr_hugepages_node,
+				hugetlb_write_nr_hugepages_node);
+#endif
 
 /*
  * register_node - Setup a sysfs device for a node.
@@ -151,6 +156,9 @@ int register_node(struct node *node, int num, struct node *parent)
 		sysdev_create_file(&node->sysdev, &attr_meminfo);
 		sysdev_create_file(&node->sysdev, &attr_numastat);
 		sysdev_create_file(&node->sysdev, &attr_distance);
+#ifdef CONFIG_HUGETLB_PAGE
+		sysdev_create_file(&node->sysdev, &attr_nr_hugepages);
+#endif
 	}
 	return error;
 }
@@ -168,6 +176,9 @@ void unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_meminfo);
 	sysdev_remove_file(&node->sysdev, &attr_numastat);
 	sysdev_remove_file(&node->sysdev, &attr_distance);
+#ifdef CONFIG_HUGETLB_PAGE
+	sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
+#endif
 
 	sysdev_unregister(&node->sysdev);
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index aa0dc9b..7df75c1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -5,6 +5,7 @@
 
 #include <linux/mempolicy.h>
 #include <linux/shm.h>
+#include <linux/sysdev.h>
 #include <asm/tlbflush.h>
 
 struct ctl_table;
@@ -23,6 +24,9 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
+ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *, char *);
+ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *, const char *,
+					 size_t);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d1e1063..9f1cb16 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -215,7 +215,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 	return nr;
 }
 
-#ifdef CONFIG_SYSCTL
 static void update_and_free_page(int nid, struct page *page)
 {
 	int i;
@@ -268,6 +267,7 @@ static inline void try_to_free_low(unsigned long count)
 }
 #endif
 
+#ifdef CONFIG_SYSCTL
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	while (count > nr_huge_pages) {
@@ -335,6 +335,58 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, free_huge_pages_node[nid]);
 }
 
+ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev,
+							char *buf)
+{
+	return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]);
+}
+
+ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
+					const char *buf, size_t count)
+{
+	int nid = dev->id;
+	unsigned long target;
+	unsigned long free_on_other_nodes;
+	unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
+
+ 	/*
+	 * unpopulated nodes can return pages from other nodes for
+	 * THISNODE requests, so do a populated check and avoid
+	 * double-checking in the sysctl path
+ 	 */
+ 	if (!node_populated(nid))
+ 		return count;
+ 
+	while (nr_huge_pages_req > nr_huge_pages_node[nid]) {
+		if (!alloc_fresh_huge_page_node(nid))
+			return count;
+	}
+	if (nr_huge_pages_req >= nr_huge_pages_node[nid])
+		return count;
+
+	/* need to ensure that our counts are accurate */
+	spin_lock(&hugetlb_lock);
+	free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
+	if (free_on_other_nodes >= resv_huge_pages) {
+		/* other nodes can satisfy reserve */
+		target = nr_huge_pages_req;
+	} else {
+		/* this node needs some free to satisfy reserve */
+		target = max((resv_huge_pages - free_on_other_nodes),
+						nr_huge_pages_req);
+	}
+	try_to_free_low_node(nid, target);
+	while (target < nr_huge_pages_node[nid]) {
+		struct page *page = dequeue_huge_page_node(nid);
+		if (!page)
+			break;
+		update_and_free_page(nid, page);
+	}
+	spin_unlock(&hugetlb_lock);
+
+	return count;
+}
+
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-11 23:08         ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Nishanth Aravamudan
  2007-06-11 23:10           ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
@ 2007-06-11 23:15           ` Christoph Lameter
  2007-06-12  0:14             ` [PATCH v2][RFC] " Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 23:15 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Christoph was also worried about the performance impact on these paths,
> so, as he suggested, uninline alloc_pages_node() and move it to
> mempolicy.c.

uninlining does not address performance issues.

> @@ -184,6 +185,16 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
>  	atomic_set(&policy->refcnt, 1);
>  	switch (mode) {
>  	case MPOL_INTERLEAVE:
> +		/*
> +		 * Clear any memoryless nodes here so that v.nodes can be used
> +		 * without extra checks
> +		 */
> +		nid = first_node(*nodes);
> +		while (nid < MAX_NUMNODES) {
> +			if (!node_populated(nid))
> +				node_clear(nid, *nodes);
> +			nid = next_node(nid, *nodes);
> +		}

There is a "nodes_and" function for this.

> @@ -1126,9 +1153,11 @@ static unsigned interleave_nodes(struct mempolicy *policy)
>  	struct task_struct *me = current;
>  
>  	nid = me->il_next;
> -	next = next_node(nid, policy->v.nodes);
> -	if (next >= MAX_NUMNODES)
> -		next = first_node(policy->v.nodes);
> +	do {
> +		next = next_node(nid, policy->v.nodes);
> +		if (next >= MAX_NUMNODES)
> +			next = first_node(policy->v.nodes);
> +	} while (!node_populated(next));

Is there a case where nodes has no node set?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-11 23:10           ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
  2007-06-11 23:11             ` [PATCH][RFC] hugetlb: numafy several functions Nishanth Aravamudan
@ 2007-06-11 23:17             ` Christoph Lameter
  2007-06-12  0:15               ` Nishanth Aravamudan
  2007-06-12 14:28               ` [PATCH v6][RFC] " Lee Schermerhorn
  1 sibling, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 23:17 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> +	if (nid < 0)
> +		nid = first_node(node_populated_map);

nid == 1 means local node? Or why do we check for nid < 0?

	if (nid == 1)
		 nid = numa_node_id();

?
> +	do {
> +		page = alloc_pages_node(nid,
> +				GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
> +				HUGETLB_PAGE_ORDER);
> +		nid = next_node(nid, node_populated_map);
> +		if (nid >= nr_node_ids)
> +			nid = first_node(node_populated_map);
> +	} while (!page && nid != start_nid);

Looks good.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] hugetlb: numafy several functions
  2007-06-11 23:11             ` [PATCH][RFC] hugetlb: numafy several functions Nishanth Aravamudan
  2007-06-11 23:13               ` [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
@ 2007-06-11 23:38               ` Christoph Lameter
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 23:38 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

> Add node-parameterized helpers for dequeue_huge_page,
> alloc_fresh_huge_page and try_to_free_low. Also have
> update_and_free_page() take a nid parameter. This is necessary to add a
> per-node sysfs attribute to specify the number of hugepages on that
> node.

Acked-by: Christoph Lameter <clameter@sgi.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-11 23:13               ` [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
@ 2007-06-11 23:40                 ` Christoph Lameter
  2007-06-11 23:42                 ` Christoph Lameter
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 23:40 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> + 	/*
> +	 * unpopulated nodes can return pages from other nodes for
> +	 * THISNODE requests, so do a populated check and avoid
> +	 * double-checking in the sysctl path
> + 	 */
> + 	if (!node_populated(nid))
> + 		return count;

THISNODE fixes address this issue right?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v4] Add populated_map to account for memoryless nodes
  2007-06-11 23:00         ` Christoph Lameter
@ 2007-06-11 23:41           ` Nishanth Aravamudan
  2007-06-11 23:45             ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-11 23:41 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [16:00:39 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Err, duh -- I was thinking of making this change, but then forgot.
> 
> Now it's fine.

Eep, except that we don't initialize node_populated_map unless we're
NUMA. Also, do you think it's worth adding the comment in mmzone.h that
NUMA policies now depend on present_pages?

Split up Lee and Anton's original patch
(http://marc.info/?l=linux-mm&m=118133042025995&w=2), to allow for the
populated_map changes to go in on their own.

Add a populated_map nodemask to indicate a node has memory or not. We
have run into a number of issues (in practice and in code) with
assumptions about every node having memory. Having this nodemask allows
us to fix these issues; in particular, THISNODE allocations will come
from the node specified, only, and the INTERLEAVE policy will be able to
do the right thing with memoryless nodes.

In the !NUMA case, we assume the pgdat represents a populated node.
This is needed for follow-on patches which assume the mask works in NUMA
and !NUMA.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f964af6..8e3f43e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -337,6 +337,10 @@ struct zone {
 	 * give them a chance of being in the same cacheline.
 	 */
 	unsigned long		spanned_pages;	/* total size, including holes */
+	/*
+	 * present_pages is assumed to indicate if a NUMA node is
+	 * populated or not
+	 */
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 
 	/*
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 52c54a5..c00a249 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -64,12 +64,16 @@
  *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
+ * int node_populated(node)		Is some node populated?
  *
  * int any_online_node(mask)		First online node in mask
  *
  * node_set_online(node)		set bit 'node' in node_online_map
  * node_set_offline(node)		clear bit 'node' in node_online_map
  *
+ * node_set_populated(node)		set bit 'node' in node_populated_map
+ * node_not_populated(node)		clear bit 'node' in node_populated_map
+ *
  * for_each_node(node)			for-loop node over node_possible_map
  * for_each_online_node(node)		for-loop node over node_online_map
  *
@@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
 
 extern nodemask_t node_online_map;
 extern nodemask_t node_possible_map;
+extern nodemask_t node_populated_map;
 
 #if MAX_NUMNODES > 1
 #define num_online_nodes()	nodes_weight(node_online_map)
 #define num_possible_nodes()	nodes_weight(node_possible_map)
 #define node_online(node)	node_isset((node), node_online_map)
 #define node_possible(node)	node_isset((node), node_possible_map)
+#define node_populated(node)	node_isset((node), node_populated_map)
 #define first_online_node	first_node(node_online_map)
 #define next_online_node(nid)	next_node((nid), node_online_map)
 extern int nr_node_ids;
@@ -358,6 +364,7 @@ extern int nr_node_ids;
 #define num_possible_nodes()	1
 #define node_online(node)	((node) == 0)
 #define node_possible(node)	((node) == 0)
+#define node_populated(node)	((node) == 0)
 #define first_online_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
 #define nr_node_ids		1
@@ -375,6 +382,9 @@ extern int nr_node_ids;
 #define node_set_online(node)	   set_bit((node), node_online_map.bits)
 #define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
 
+#define node_set_populated(node)   set_bit((node), node_populated_map.bits)
+#define node_not_populated(node)   clear_bit((node), node_populated_map.bits)
+
 #define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
 #define for_each_online_node(node) for_each_node_mask((node), node_online_map)
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07cd5ae..526c126 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_populated_map __read_mostly = NODE_MASK_NONE;
+EXPORT_SYMBOL(node_populated_map);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
@@ -2299,6 +2302,13 @@ static void build_zonelists(pg_data_t *pgdat)
 		/* calculate node order -- i.e., DMA last! */
 		build_zonelists_in_zone_order(pgdat, j);
 	}
+
+	/*
+	 * record populated zones for use when INTERLEAVE'ing or using
+	 * GFP_THISNODE
+	 */
+	if (pgdat->node_present_pages)
+		node_set_populated(local_node);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */
@@ -2360,6 +2370,8 @@ static void build_zonelists(pg_data_t *pgdat)
 
 		zonelist->zones[j] = NULL;
 	}
+
+	node_set_populated(local_node);
 }
 
 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-11 23:13               ` [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
  2007-06-11 23:40                 ` Christoph Lameter
@ 2007-06-11 23:42                 ` Christoph Lameter
  2007-06-12  0:19                   ` Nishanth Aravamudan
  2007-06-12  2:19                   ` Nishanth Aravamudan
  1 sibling, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 23:42 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

>  }
>  static SYSDEV_ATTR(distance, S_IRUGO, node_read_distance, NULL);
>  
> +#ifdef CONFIG_HUGETLB_PAGE
> +static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
> +				hugetlb_read_nr_hugepages_node,
> +				hugetlb_write_nr_hugepages_node);
> +#endif

Move the above to hugetlb.c?

Also so far there is nothing in the nodes directories that can be 
modified. This is the first one. Is that really the right location?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v4] Add populated_map to account for memoryless nodes
  2007-06-11 23:41           ` [PATCH v4] " Nishanth Aravamudan
@ 2007-06-11 23:45             ` Christoph Lameter
  2007-06-12  0:07               ` [PATCH] populated_map: fix !NUMA case, remove comment Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-11 23:45 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Eep, except that we don't initialize node_populated_map unless we're
> NUMA. Also, do you think it's worth adding the comment in mmzone.h that
> NUMA policies now depend on present_pages?

No need to initialize if we do not use it. You may want to #ifdef it out
by moving the definition. Please send a diff against the earlier patch 
since Andrew already merged it.

present_pages just indicates that there is memory on the node. So I am not 
sure that this will help.

> +
> +	/*
> +	 * record populated zones for use when INTERLEAVE'ing or using
> +	 * GFP_THISNODE
> +	 */

There may be other purposes as well. No need to enumerate those here.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-11 23:45             ` Christoph Lameter
@ 2007-06-12  0:07               ` Nishanth Aravamudan
  2007-06-12  0:41                 ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  0:07 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [16:45:30 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Eep, except that we don't initialize node_populated_map unless we're
> > NUMA. Also, do you think it's worth adding the comment in mmzone.h that
> > NUMA policies now depend on present_pages?
> 
> No need to initialize if we do not use it. You may want to #ifdef it out
> by moving the definition. Please send a diff against the earlier patch 
> since Andrew already merged it.

We will be using it (it == node_populated_map) later in my sysfs patch
and in the fix hugepage allocation patch.

Sorry, sent the updated patch before I got Andrew's mail.

> present_pages just indicates that there is memory on the node. So I am not 
> sure that this will help.

Ok.

> > +
> > +	/*
> > +	 * record populated zones for use when INTERLEAVE'ing or using
> > +	 * GFP_THISNODE
> > +	 */
> 
> There may be other purposes as well. No need to enumerate those here.

Ok.

Applies on top of add-populated_map-to-account-for-memoryless-nodes.patch.

populated_map needs to be consistent in both the NUMA and !NUMA cases to
fix hugepage allocation with empty nodes. Assume the one node in the
!NUMA case is populated.

Remove a comment that would only increase the maintenance burden.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 456f2f6..825d2df 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2303,10 +2303,6 @@ static void build_zonelists(pg_data_t *pgdat)
 		build_zonelists_in_zone_order(pgdat, j);
 	}
 
-	/*
-	 * record populated zones for use when INTERLEAVE'ing or using
-	 * GFP_THISNODE
-	 */
 	if (pgdat->node_present_pages)
 		node_set_populated(local_node);
 }
@@ -2370,6 +2366,8 @@ static void build_zonelists(pg_data_t *pgdat)
 
 		zonelist->zones[j] = NULL;
 	}
+
+	node_set_populated(local_node);
 }
 
 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v2][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-11 23:15           ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Christoph Lameter
@ 2007-06-12  0:14             ` Nishanth Aravamudan
  2007-06-12  0:42               ` Christoph Lameter
  2007-06-12  0:57               ` Andrew Morton
  0 siblings, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  0:14 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [16:15:15 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Christoph was also worried about the performance impact on these paths,
> > so, as he suggested, uninline alloc_pages_node() and move it to
> > mempolicy.c.
> 
> uninlining does not address performance issues.
> 
> > @@ -184,6 +185,16 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
> >  	atomic_set(&policy->refcnt, 1);
> >  	switch (mode) {
> >  	case MPOL_INTERLEAVE:
> > +		/*
> > +		 * Clear any memoryless nodes here so that v.nodes can be used
> > +		 * without extra checks
> > +		 */
> > +		nid = first_node(*nodes);
> > +		while (nid < MAX_NUMNODES) {
> > +			if (!node_populated(nid))
> > +				node_clear(nid, *nodes);
> > +			nid = next_node(nid, *nodes);
> > +		}
> 
> There is a "nodes_and" function for this.

Right, fixed.

> > @@ -1126,9 +1153,11 @@ static unsigned interleave_nodes(struct mempolicy *policy)
> >  	struct task_struct *me = current;
> >  
> >  	nid = me->il_next;
> > -	next = next_node(nid, policy->v.nodes);
> > -	if (next >= MAX_NUMNODES)
> > -		next = first_node(policy->v.nodes);
> > +	do {
> > +		next = next_node(nid, policy->v.nodes);
> > +		if (next >= MAX_NUMNODES)
> > +			next = first_node(policy->v.nodes);
> > +	} while (!node_populated(next));
> 
> Is there a case where nodes has no node set?

Well, if the only place to get a "new" policy is mpol_new(), no, as just
after the above nodes_and(), we check the weight of the nodemask. Is
that sufficient?

Based on ideas from Christoph Lameter, add checks in the INTERLEAVE
paths for memoryless nodes. We do not want to try interleaving onto
those nodes.

Christoph said:
"This does not work for the address based interleaving for anonymous
vmas.  I am not sure what to do there. We could change the calculation
of the node to be based only on nodes with memory and then skip the
memoryless ones. I have only added a comment to describe its brokenness
for now."

I have copied his draft's comment.

Change alloc_pages_node() to fail __GFP_THISNODE allocations if the node
is not populated.

Again, Christoph said:
"This will fix the alloc_pages_node case but not the alloc_pages() case.
In the alloc_pages() case we do not specify a node. Implicitly it is
understood that we (in the case of no memory policy / cpuset options)
allocate from the nearest node. So it may be argued there that the
GFP_THISNODE behavior of taking the first node from the zonelist is
okay."

Christoph was also worried about the performance impact on these paths,
as am I.

Finally, as he suggested, uninline alloc_pages_node() and move it to
mempolicy.c.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 49dcc2f..c83e56a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -165,19 +165,7 @@ static inline void arch_alloc_page(struct page *page, int order) { }
 extern struct page *
 FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
 
-static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
-						unsigned int order)
-{
-	if (unlikely(order >= MAX_ORDER))
-		return NULL;
-
-	/* Unknown node is current node */
-	if (nid < 0)
-		nid = numa_node_id();
-
-	return __alloc_pages(gfp_mask, order,
-		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
-}
+extern struct page * alloc_pages_node(int, gfp_t, unsigned int);
 
 #ifdef CONFIG_NUMA
 extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 144805c..abadbf4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -174,6 +174,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
+	unsigned nid;
 
 	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 	if (mode == MPOL_DEFAULT)
@@ -184,8 +185,12 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	atomic_set(&policy->refcnt, 1);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
-		policy->v.nodes = *nodes;
-		if (nodes_weight(*nodes) == 0) {
+		/*
+		 * Clear any memoryless nodes here so that v.nodes can be used
+		 * without extra checks
+		 */
+		nodes_and(policy->v.nodes, *nodes, node_populated_mask);
+		if (nodes_weight(policy->v.nodes) == 0) {
 			kmem_cache_free(policy_cache, policy);
 			return ERR_PTR(-EINVAL);
 		}
@@ -578,6 +583,22 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }
 
+struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+{
+	if (unlikely(order >= MAX_ORDER))
+		return NULL;
+
+	/* Unknown node is current node */
+	if (nid < 0)
+		nid = numa_node_id();
+
+	if ((gfp_mask & __GFP_THISNODE) && !node_populated(nid))
+		return NULL;
+
+	return __alloc_pages(gfp_mask, order,
+		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
+}
+
 #ifdef CONFIG_MIGRATION
 /*
  * page migration
@@ -1126,9 +1147,11 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
-	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
+	do {
+		next = next_node(nid, policy->v.nodes);
+		if (next >= MAX_NUMNODES)
+			next = first_node(policy->v.nodes);
+	} while (!node_populated(next));
 	me->il_next = next;
 	return nid;
 }
@@ -1192,6 +1215,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		 * for huge pages, since vm_pgoff is in units of small
 		 * pages, we need to shift off the always 0 bits to get
 		 * a useful offset.
+		 *
+		 * NOTE: For configurations with memoryless nodes this
+		 * is broken since the allocation attempts on that node
+		 * will fall back to other nodes and thus one
+		 * neighboring node will be overallocated from.
 		 */
 		BUG_ON(shift < PAGE_SHIFT);
 		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-11 23:17             ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Christoph Lameter
@ 2007-06-12  0:15               ` Nishanth Aravamudan
  2007-06-12  0:47                 ` Christoph Lameter
  2007-06-12  3:44                 ` William Lee Irwin III
  2007-06-12 14:28               ` [PATCH v6][RFC] " Lee Schermerhorn
  1 sibling, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  0:15 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On 11.06.2007 [16:17:47 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > +	if (nid < 0)
> > +		nid = first_node(node_populated_map);
> 
> nid == -1 means local node? Or why do we check for nid < 0?
> 
> 	if (nid == -1)
> 		 nid = numa_node_id();
> 
> ?

No, nid is a static variable. So we initialize it to -1 to catch the
first time we go through the loop.

IIRC, we can't just set it to first_node(node_populated_map), because
it's a non-constant or something?

> > +	do {
> > +		page = alloc_pages_node(nid,
> > +				GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
> > +				HUGETLB_PAGE_ORDER);
> > +		nid = next_node(nid, node_populated_map);
> > +		if (nid >= nr_node_ids)
> > +			nid = first_node(node_populated_map);
> > +	} while (!page && nid != start_nid);
> 
> Looks good.

Thanks,
Nish

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-11 23:42                 ` Christoph Lameter
@ 2007-06-12  0:19                   ` Nishanth Aravamudan
  2007-06-12  0:43                     ` Christoph Lameter
  2007-06-12  2:19                   ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  0:19 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On 11.06.2007 [16:42:33 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> >  }
> >  static SYSDEV_ATTR(distance, S_IRUGO, node_read_distance, NULL);
> >  
> > +#ifdef CONFIG_HUGETLB_PAGE
> > +static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
> > +				hugetlb_read_nr_hugepages_node,
> > +				hugetlb_write_nr_hugepages_node);
> > +#endif
> 
> Move the above to hugetlb.c?
> 
> Also so far there is nothing in the nodes directories that can be 
> modified. This is the first one. Is that really the right location?

I will rework this taking into account this comment and the other
(sorry, forgot to refresh the patch and the commentary after the other
changes).

As far as moving it, that might be the best option, but I'm not sure how
to make sure that everything builds. Will ponder.

Also, I don't really see where else a per-node attribute should go other
than /sys/devices/system/node/nodeX... Did you have a better location in
mind?

Thanks,
Nish

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  0:07               ` [PATCH] populated_map: fix !NUMA case, remove comment Nishanth Aravamudan
@ 2007-06-12  0:41                 ` Christoph Lameter
  2007-06-12  1:43                   ` Nishanth Aravamudan
  2007-06-12  2:02                   ` Nishanth Aravamudan
  0 siblings, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  0:41 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> > No need to initialize if we do not use it. You may want to #ifdef it out
> > by moving the definition. Please send a diff against the earlier patch 
> > since Andrew already merged it.
> 
> We will be using it (it == node_populated_mask) later in my sysfs patch
> and in the fix hugepage allocation patch.

But not in the !NUMA case. So the definition of the node_populated_mask 
can be moved into an #ifdef CONFIG_NUMA chunk in page_alloc.c and we can 
have fallback functions.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-12  0:14             ` [PATCH v2][RFC] " Nishanth Aravamudan
@ 2007-06-12  0:42               ` Christoph Lameter
  2007-06-12  0:57               ` Andrew Morton
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  0:42 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Well, if the only place to get a "new" policy is mpol_new(), no, as just
> after the above nodes_and(), we check the weight of the nodemask. Is
> that sufficient?

Yes. That is good.

Acked-by: Christoph Lameter <clameter@sgi.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-12  0:19                   ` Nishanth Aravamudan
@ 2007-06-12  0:43                     ` Christoph Lameter
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  0:43 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Also, I don't really see where else a per-node attribute should go other
> than /sys/devices/system/node/nodeX... Did you have a better location in
> mind?

No I think this is okay but doing so sets a precedent. If you do this then 
others will add more config stuff to that directory. So we need the 
feedback from others.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  0:15               ` Nishanth Aravamudan
@ 2007-06-12  0:47                 ` Christoph Lameter
  2007-06-12  2:12                   ` Nishanth Aravamudan
  2007-06-12  3:44                 ` William Lee Irwin III
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  0:47 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> On 11.06.2007 [16:17:47 -0700], Christoph Lameter wrote:
> > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > 
> > > +	if (nid < 0)
> > > +		nid = first_node(node_populated_map);
> > 
> > > nid == -1 means local node? Or why do we check for nid < 0?
> > > 
> > > 	if (nid == -1)
> > > 		 nid = numa_node_id();
> > 
> > ?
> 
> No, nid is a static variable. So we initialize it to -1 to catch the
> first time we go through the loop.
> 
> IIRC, we can't just set it to first_node(node_populated_map), because
> it's a non-constant or something?

Sure, you can initialize a C variable from another. So drop the -1.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-12  0:14             ` [PATCH v2][RFC] " Nishanth Aravamudan
  2007-06-12  0:42               ` Christoph Lameter
@ 2007-06-12  0:57               ` Andrew Morton
  2007-06-12  1:12                 ` Christoph Lameter
  2007-06-12  1:41                 ` Nishanth Aravamudan
  1 sibling, 2 replies; 140+ messages in thread
From: Andrew Morton @ 2007-06-12  0:57 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Christoph Lameter, lee.schermerhorn, anton, linux-mm

On Mon, 11 Jun 2007 17:14:36 -0700 Nishanth Aravamudan <nacc@us.ibm.com> wrote:

> 
> Christoph said:
> "This does not work for the address based interleaving for anonymous
> vmas.  I am not sure what to do there. We could change the calculation
> of the node to be based only on nodes with memory and then skip the
> memoryless ones. I have only added a comment to describe its brokenness
> for now."
> 
> I have copied his draft's comment.
> 
> Change alloc_pages_node() to fail __GFP_THISNODE allocations if the node
> is not populated.
> 
> Again, Christoph said:
> "This will fix the alloc_pages_node case but not the alloc_pages() case.
> In the alloc_pages() case we do not specify a node. Implicitly it is
> understood that we (in the case of no memory policy / cpuset options)
> allocate from the nearest node. So it may be argued there that the
> GFP_THISNODE behavior of taking the first node from the zonelist is
> okay."
> 
> Christoph was also worried about the performance impact on these paths,
> as am I.
> 
> Finally, as he suggested, uninline alloc_pages_node() and move it to
> mempolicy.c.
> 

All confused.

> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 49dcc2f..c83e56a 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -165,19 +165,7 @@ static inline void arch_alloc_page(struct page *page, int order) { }
>  extern struct page *
>  FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
>  
> -static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
> -						unsigned int order)
> -{
> -	if (unlikely(order >= MAX_ORDER))
> -		return NULL;
> -
> -	/* Unknown node is current node */
> -	if (nid < 0)
> -		nid = numa_node_id();
> -
> -	return __alloc_pages(gfp_mask, order,
> -		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
> -}
> +extern struct page * alloc_pages_node(int, gfp_t, unsigned int);
>  
>  #ifdef CONFIG_NUMA
>  extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 144805c..abadbf4 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -174,6 +174,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
>  static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
>  {
>  	struct mempolicy *policy;
> +	unsigned nid;

This variable appears to be unneeded.

>  	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
>  	if (mode == MPOL_DEFAULT)
> @@ -184,8 +185,12 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
>  	atomic_set(&policy->refcnt, 1);
>  	switch (mode) {
>  	case MPOL_INTERLEAVE:
> -		policy->v.nodes = *nodes;
> -		if (nodes_weight(*nodes) == 0) {
> +		/*
> +		 * Clear any memoryless nodes here so that v.nodes can be used
> +		 * without extra checks
> +		 */
> +		nodes_and(policy->v.nodes, *nodes, node_populated_mask);
> +		if (nodes_weight(policy->v.nodes) == 0) {
>  			kmem_cache_free(policy_cache, policy);
>  			return ERR_PTR(-EINVAL);
>  		}

I have no node_populated_mask.

The below improves the situation, but I wonder about, ahem, the maturity of
this code.



From: Andrew Morton <akpm@linux-foundation.org>

- Fix checkpatch.pl warning

- Fix build

- Fix unused var warning

Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/gfp.h |    2 +-
 mm/mempolicy.c      |    3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff -puN include/linux/gfp.h~fix-interleave-with-memoryless-nodes-fix include/linux/gfp.h
--- a/include/linux/gfp.h~fix-interleave-with-memoryless-nodes-fix
+++ a/include/linux/gfp.h
@@ -130,7 +130,7 @@ static inline void arch_alloc_page(struc
 extern struct page *
 FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
 
-extern struct page * alloc_pages_node(int, gfp_t, unsigned int);
+extern struct page *alloc_pages_node(int, gfp_t, unsigned int);
 
 #ifdef CONFIG_NUMA
 extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
diff -puN mm/mempolicy.c~fix-interleave-with-memoryless-nodes-fix mm/mempolicy.c
--- a/mm/mempolicy.c~fix-interleave-with-memoryless-nodes-fix
+++ a/mm/mempolicy.c
@@ -172,7 +172,6 @@ static struct zonelist *bind_zonelist(no
 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
-	unsigned nid;
 
 	pr_debug("setting mode %d nodes[0] %lx\n",
 		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -189,7 +188,7 @@ static struct mempolicy *mpol_new(int mo
 		 * Clear any memoryless nodes here so that v.nodes can be used
 		 * without extra checks
 		 */
-		nodes_and(policy->v.nodes, *nodes, node_populated_mask);
+		nodes_and(policy->v.nodes, *nodes, node_populated_map);
 		if (nodes_weight(policy->v.nodes) == 0) {
 			kmem_cache_free(policy_cache, policy);
 			return ERR_PTR(-EINVAL);
_



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-12  0:57               ` Andrew Morton
@ 2007-06-12  1:12                 ` Christoph Lameter
  2007-06-12  1:41                 ` Nishanth Aravamudan
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  1:12 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Nishanth Aravamudan, lee.schermerhorn, anton, linux-mm

On Mon, 11 Jun 2007, Andrew Morton wrote:

> > +	unsigned nid;
> 
> This variable appears to be unneeded.
> 
> >  	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
> >  	if (mode == MPOL_DEFAULT)
> > @@ -184,8 +185,12 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
> >  	atomic_set(&policy->refcnt, 1);
> >  	switch (mode) {
> >  	case MPOL_INTERLEAVE:
> > -		policy->v.nodes = *nodes;
> > -		if (nodes_weight(*nodes) == 0) {
> > +		/*
> > +		 * Clear any memoryless nodes here so that v.nodes can be used
> > +		 * without extra checks
> > +		 */
> > +		nodes_and(policy->v.nodes, *nodes, node_populated_mask);
> > +		if (nodes_weight(policy->v.nodes) == 0) {
> >  			kmem_cache_free(policy_cache, policy);
> >  			return ERR_PTR(-EINVAL);
> >  		}
> 
> I have no node_populated_mask.
> 
> The below improves the situation, but I wonder about, ahem, the maturity of
> this code.

Yeah. No one compiled it. But I think we have the general outline how this 
could be done.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-12  0:57               ` Andrew Morton
  2007-06-12  1:12                 ` Christoph Lameter
@ 2007-06-12  1:41                 ` Nishanth Aravamudan
  2007-06-12  1:52                   ` Andrew Morton
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  1:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Christoph Lameter, lee.schermerhorn, anton, linux-mm

On 11.06.2007 [17:57:00 -0700], Andrew Morton wrote:
> On Mon, 11 Jun 2007 17:14:36 -0700 Nishanth Aravamudan <nacc@us.ibm.com> wrote:
> 
> > 
> > Christoph said:
> > "This does not work for the address based interleaving for anonymous
> > vmas.  I am not sure what to do there. We could change the calculation
> > of the node to be based only on nodes with memory and then skip the
> > memoryless ones. I have only added a comment to describe its brokenness
> > for now."
> > 
> > I have copied his draft's comment.
> > 
> > Change alloc_pages_node() to fail __GFP_THISNODE allocations if the node
> > is not populated.
> > 
> > Again, Christoph said:
> > "This will fix the alloc_pages_node case but not the alloc_pages() case.
> > In the alloc_pages() case we do not specify a node. Implicitly it is
> > understood that we (in the case of no memory policy / cpuset options)
> > allocate from the nearest node. So it may be argued there that the
> > GFP_THISNODE behavior of taking the first node from the zonelist is
> > okay."
> > 
> > Christoph was also worried about the performance impact on these paths,
> > as am I.
> > 
> > Finally, as he suggested, uninline alloc_pages_node() and move it to
> > mempolicy.c.
> > 
> 
> All confused.

<snip>

> I have no node_populated_mask.
> 
> The below improves the situation, but I wonder about, ahem, the maturity of
> this code.

Sorry, Andrew :(

I didn't expect you to pull all these patches so quickly. No one gave me
much feedback the last few times I posted the series, so I wasn't
expecting any this time either...that's what I get for piquing
Christoph's interest :) We went through several revisions today alone...

If you would prefer dropping the series, I will clean them up and get
them ready for you tomorrow.

The previous series were well-tested, but this one was more of a RFD/RFC
with an emphasis on the D/C. Sorry for that and not making it more
explicit.

How would you like me to proceed?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  0:41                 ` Christoph Lameter
@ 2007-06-12  1:43                   ` Nishanth Aravamudan
  2007-06-12  1:45                     ` Christoph Lameter
  2007-06-12  2:02                   ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  1:43 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [17:41:15 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > No need to initialize if we do not use it. You may want to #ifdef it out
> > > by moving the definition. Please send a diff against the earlier patch 
> > > since Andrew already merged it.
> > 
> > We will be using it (it == node_populated_mask) later in my sysfs patch
> > and in the fix hugepage allocation patch.
> 
> But not in the !NUMA case. So the definition of the node_populated_mask 
> can be moved into an #ifdef CONFIG_NUMA chunk in page_alloc.c and we can 
> have fallback functions.

Ah, but we'll use it in mpol_new via nodes_and() regardless of
NUMA/!NUMA, right?

I see no reason not to make sure the node_populated_mask is sensible
whenever it can be.

If you really feel that only CONFIG_NUMA code should use
node_populated_mask, then I'll make that change and use node_populated()
in the callers.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  1:43                   ` Nishanth Aravamudan
@ 2007-06-12  1:45                     ` Christoph Lameter
  2007-06-12  1:52                       ` Nishanth Aravamudan
  2007-06-12  2:39                       ` Nishanth Aravamudan
  0 siblings, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  1:45 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Ah, but we'll use it in mpol_new via nodes_and() regardless of
> NUMA/!NUMA, right?

mempolicy.c will only be compiled for the NUMA case.

> If you really feel that only CONFIG_NUMA code should use
> node_populated_mask, then I'll make that change and use node_populated()
> in the callers.

What point would there be of !NUMA configurations using 
node_populated_mask()?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  1:45                     ` Christoph Lameter
@ 2007-06-12  1:52                       ` Nishanth Aravamudan
  2007-06-12  2:39                       ` Nishanth Aravamudan
  1 sibling, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  1:52 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [18:45:55 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Ah, but we'll use it in mpol_new via nodes_and() regardless of
> > NUMA/!NUMA, right?
> 
> mempolicy.c will only be compiled for the NUMA case.

Ah, I did not realize that, sorry.

> > If you really feel that only CONFIG_NUMA code should use
> > node_populated_mask, then I'll make that change and use
> > node_populated() in the callers.
> 
> What point would there be of !NUMA configurations using
> node_populated_mask()?

Well, I'm just trying to cover all my bases. I will rework the stack to
be better and closer to what you'd like.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-12  1:41                 ` Nishanth Aravamudan
@ 2007-06-12  1:52                   ` Andrew Morton
  2007-06-12  2:03                     ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Andrew Morton @ 2007-06-12  1:52 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Christoph Lameter, lee.schermerhorn, anton, linux-mm

On Mon, 11 Jun 2007 18:41:42 -0700 Nishanth Aravamudan <nacc@us.ibm.com> wrote:

> How would you like me to proceed?

I shall now go into hiding.  Send 'em when they're ready, OK?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  0:41                 ` Christoph Lameter
  2007-06-12  1:43                   ` Nishanth Aravamudan
@ 2007-06-12  2:02                   ` Nishanth Aravamudan
  2007-06-12  2:20                     ` Christoph Lameter
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:02 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [17:41:15 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > No need to initialize if we do not use it. You may want to #ifdef it out
> > > by moving the definition. Please send a diff against the earlier patch 
> > > since Andrew already merged it.
> > 
> > We will be using it (it == node_populated_mask) later in my sysfs patch
> > and in the fix hugepage allocation patch.
> 
> But not in the !NUMA case. So the definition of the node_populated_mask 
> can be moved into an #ifdef CONFIG_NUMA chunk in page_alloc.c and we can 
> have fallback functions.

No, see:

[PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 858c0b3..97ae1a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -105,13 +105,22 @@ static void free_huge_page(struct page *page)

 static int alloc_fresh_huge_page(void)
 {
-       static int nid = 0;
+       static int nid = -1;
        struct page *page;
-       page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
-                                       HUGETLB_PAGE_ORDER);
-       nid = next_node(nid, node_online_map);
-       if (nid == MAX_NUMNODES)
-               nid = first_node(node_online_map);
+       int start_nid;
+
+       if (nid < 0)
+               nid = first_node(node_populated_map);
+       start_nid = nid;
+
+       do {
+               page = alloc_pages_node(nid,
+                               GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
+                               HUGETLB_PAGE_ORDER);
+               nid = next_node(nid, node_populated_map);
+               if (nid >= nr_node_ids)
+                       nid = first_node(node_populated_map);
+       } while (!page && nid != start_nid);
        if (page) {
                set_compound_page_dtor(page, free_huge_page);
                spin_lock(&hugetlb_lock);

wherein alloc_huge_page() checks node_populated_map for each invocation of
alloc_huge_page_node(). And alloc_huge_page() does not depend on CONFIG_NUMA in
any way.

If you would prefer, I could make it use node_online_map like before and check
if the node is populated every time, but that seems silly if it's one line to
make the node_populated_map sensible in both NUMA and !NUMA cases, similar to
node_online_map.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2][RFC] Fix INTERLEAVE with memoryless nodes
  2007-06-12  1:52                   ` Andrew Morton
@ 2007-06-12  2:03                     ` Nishanth Aravamudan
  0 siblings, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Christoph Lameter, lee.schermerhorn, anton, linux-mm

On 11.06.2007 [18:52:26 -0700], Andrew Morton wrote:
> On Mon, 11 Jun 2007 18:41:42 -0700 Nishanth Aravamudan <nacc@us.ibm.com> wrote:
> 
> > How would you like me to proceed?
> 
> I shall now go into hiding.  Send 'em when they're ready, OK?

Thanks, will do. Sorry again.

-Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  0:47                 ` Christoph Lameter
@ 2007-06-12  2:12                   ` Nishanth Aravamudan
  2007-06-12  2:21                     ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:12 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On 11.06.2007 [17:47:41 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > On 11.06.2007 [16:17:47 -0700], Christoph Lameter wrote:
> > > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > > 
> > > > +	if (nid < 0)
> > > > +		nid = first_node(node_populated_map);
> > > 
> > > nid == 1 means local node? Or why do we check for nid < 0?
> > > 
> > > 	if (nid == 1)
> > > 		 nid = numa_node_id();
> > > 
> > > ?
> > 
> > No, nid is a static variable. So we initialize it to -1 to catch the
> > first time we go through the loop.
> > 
> > IIRC, we can't just set it to first_node(node_populated_map), because
> > it's a non-constant or something?
> 
> Sure, you can initialize a c variable from another. So drop the -1.

If I do:

static int nid = first_node(node_populated_map), I get:

mm/hugetlb.c:108: error: initializer element is not constant

??

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-11 23:42                 ` Christoph Lameter
  2007-06-12  0:19                   ` Nishanth Aravamudan
@ 2007-06-12  2:19                   ` Nishanth Aravamudan
  2007-06-12  2:22                     ` Christoph Lameter
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:19 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On 11.06.2007 [16:42:33 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> >  }
> >  static SYSDEV_ATTR(distance, S_IRUGO, node_read_distance, NULL);
> >  
> > +#ifdef CONFIG_HUGETLB_PAGE
> > +static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
> > +				hugetlb_read_nr_hugepages_node,
> > +				hugetlb_write_nr_hugepages_node);
> > +#endif
> 
> Move the above to hugetlb.c?

Ok, if I do that, would you prefer I just add callbacks into hugetlb
code for register_node() and unregister_node() that are no-ops if
!CONFIG_HUGETLB_PAGE? That is, rather than

	sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);

just call something like

	hugetlb_unregister_node()

? And similar for register? Otherwise, there are still going to be
ifdefs for the remove and add calls.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  2:02                   ` Nishanth Aravamudan
@ 2007-06-12  2:20                     ` Christoph Lameter
  2007-06-12  2:32                       ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  2:20 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes

There is no point in compiling the interleave logic for !NUMA. There needs 
to be some sort of !NUMA fallback in hugetlb. It would be better to call a 
interleave function in mempolicy.c that provides an appropriate shim for 
!NUMA.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  2:12                   ` Nishanth Aravamudan
@ 2007-06-12  2:21                     ` Christoph Lameter
  2007-06-12  2:25                       ` Christoph Lameter
  2007-06-12  2:33                       ` Nishanth Aravamudan
  0 siblings, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  2:21 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> static int nid = first_node(node_populated_map), I get:
> 
> mm/hugetlb.c:108: error: initializer element is not constant

Remove the static.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-12  2:19                   ` Nishanth Aravamudan
@ 2007-06-12  2:22                     ` Christoph Lameter
  2007-06-12  2:34                       ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  2:22 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Ok, if I do that, would you prefer I just add callbacks into hugetlb
> code for register_node() and unregister_node() that are no-ops if
> !CONFIG_HUGETLB_PAGE? That is, rather than
> 
> 	sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
> 
> just call something like
> 
> 	hugetlb_unregister_node()
> 
> ? And similar for register? Otherwise, there are still going to be
> ifdefs for the remove and add calls.

Sounds good. Let's see the patch.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  2:21                     ` Christoph Lameter
@ 2007-06-12  2:25                       ` Christoph Lameter
  2007-06-12  2:34                         ` Nishanth Aravamudan
  2007-06-12  2:33                       ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  2:25 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: linux-mm

On Mon, 11 Jun 2007, Christoph Lameter wrote:

> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > static int nid = first_node(node_populated_map), I get:
> > 
> > mm/hugetlb.c:108: error: initializer element is not constant
> 
> Remove the static.

Cutting down the CCs.

Removing static won't help if the variable is still global. You need to 
define a local variable. Then it can be initialized with a variable 
expression.
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-11 20:27 [PATCH] Add populated_map to account for memoryless nodes Nishanth Aravamudan, Lee Schermerhorn
  2007-06-11 21:25 ` Christoph Lameter
@ 2007-06-12  2:27 ` KAMEZAWA Hiroyuki
  2007-06-12  2:46   ` Nishanth Aravamudan
  2007-06-12  2:53   ` Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-06-12  2:27 UTC (permalink / raw)
  To: Nishanth Aravamudan, Lee Schermerhorn; +Cc: clameter, anton, akpm, linux-mm

On Mon, 11 Jun 2007 13:27:28 -0700
Nishanth Aravamudan <nacc@us.ibm.com>, Lee Schermerhorn <lee.schermerhorn@hp.com> wrote:

> Split up Lee and Anton's original patch
> (http://marc.info/?l=linux-mm&m=118133042025995&w=2), to allow for the
> populated_map changes to go in on their own.
> 
> Add a populated_map nodemask to indicate a node has memory or not. We
> have run into a number of issues (in practice and in code) with
> assumptions about every node having memory. Having this nodemask allows
> us to fix these issues; in particular, THISNODE allocations will come
> from the node specified, only, and the INTERLEAVE policy will be able to
> do the right thing with memoryless nodes.
> 
Thank you, I like this work.

> +extern nodemask_t node_populated_map;
please add /* node has memory */ here.

I don't think "populated node" means "node-with-memory" if there are no comments.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  2:20                     ` Christoph Lameter
@ 2007-06-12  2:32                       ` Nishanth Aravamudan
  2007-06-12  2:54                         ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:32 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [19:20:58 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
> 
> There is no point in compiling the interleave logic for !NUMA. There
> needs to be some sort of !NUMA fallback in hugetlb. It would be better
> to call a interleave function in mempolicy.c that provides an
> appropriate shim for !NUMA.

Hrm, if !NUMA, is the nid of the only node guaranteed to be 0? If so, I
can just

Make alloc_fresh_huge_page() and other generic variants call into the
_node() versions with nid=0, if !NUMA.

Would that be ok?

I'm not sure what kind of interleave function you're thinking of that
could be in mempolicy.c? Note, this code used node_online_map before,
which was also overkill in !NUMA.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  2:21                     ` Christoph Lameter
  2007-06-12  2:25                       ` Christoph Lameter
@ 2007-06-12  2:33                       ` Nishanth Aravamudan
  1 sibling, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:33 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On 11.06.2007 [19:21:48 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > static int nid = first_node(node_populated_map), I get:
> > 
> > mm/hugetlb.c:108: error: initializer element is not constant
> 
> Remove the static.

Then every hugepage will be allocated on the first populated node!

The whole point of alloc_fresh_huge_page is that it round-robins
hugepages allocations among all populated nodes. Or it will once we fix
it :)

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  2:25                       ` Christoph Lameter
@ 2007-06-12  2:34                         ` Nishanth Aravamudan
  2007-06-12  2:55                           ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:34 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On 11.06.2007 [19:25:08 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Christoph Lameter wrote:
> 
> > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > 
> > > static int nid = first_node(node_populated_map), I get:
> > > 
> > > mm/hugetlb.c:108: error: initializer element is not constant
> > 
> > Remove the static.
> 
> Cutting down the CCs.
> 
> Removing static wont help if the variable is still global. You need to 
> define a local variable. Then it can be initialized with a variable 
> expression.

What global?

nid is static to alloc_fresh_huge_page().

gcc says that the static variable (which *must* be static for the
current round-robin allocation method) cannot be initialized with a
non-constant (which first_node is).

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-12  2:22                     ` Christoph Lameter
@ 2007-06-12  2:34                       ` Nishanth Aravamudan
  0 siblings, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:34 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm, wli

On 11.06.2007 [19:22:43 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Ok, if I do that, would you prefer I just add callbacks into hugetlb
> > code for register_node() and unregister_node() that are no-ops if
> > !CONFIG_HUGETLB_PAGE? That is, rather than
> > 
> > 	sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
> > 
> > just call something like
> > 
> > 	hugetlb_unregister_node()
> > 
> > ? And similar for register? Otherwise, there are still going to be
> > ifdefs for the remove and add calls.
> 
> Sounds good. Lets see the patch.

Of course, just wanted to make sure we're on the same page.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  1:45                     ` Christoph Lameter
  2007-06-12  1:52                       ` Nishanth Aravamudan
@ 2007-06-12  2:39                       ` Nishanth Aravamudan
  1 sibling, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:39 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [18:45:55 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Ah, but we'll use it in mpol_new via nodes_and() regardless of
> > NUMA/!NUMA, right?
> 
> mempolicy.c will only be compiled for the NUMA case.
> 
> > If you really feel that only CONFIG_NUMA code should use
> > node_populated_mask, then I'll make that change and use
> > node_populated() in the callers.
> 
> What point would there be of !NUMA configurations using
> node_populated_mask()?

I really don't get this inconsistency...why do we export node_online_map
and node_possible_map in !NUMA configurations, then?

Note, node_online_map is currently used in hugetlb.c to achieve
"interleaving" of fresh hugepages, and that code has no dependency on
NUMA to be compiled or not.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-12  2:27 ` KAMEZAWA Hiroyuki
@ 2007-06-12  2:46   ` Nishanth Aravamudan
  2007-06-12  2:53   ` Christoph Lameter
  1 sibling, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  2:46 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Lee Schermerhorn, clameter, anton, akpm, linux-mm

On 12.06.2007 [11:27:57 +0900], KAMEZAWA Hiroyuki wrote:
> On Mon, 11 Jun 2007 13:27:28 -0700
> Nishanth Aravamudan <nacc@us.ibm.com>, Lee Schermerhorn <lee.schermerhorn@hp.com> wrote:
> 
> > Split up Lee and Anton's original patch
> > (http://marc.info/?l=linux-mm&m=118133042025995&w=2), to allow for the
> > populated_map changes to go in on their own.
> > 
> > Add a populated_map nodemask to indicate a node has memory or not. We
> > have run into a number of issues (in practice and in code) with
> > assumptions about every node having memory. Having this nodemask allows
> > us to fix these issues; in particular, THISNODE allocations will come
> > from the node specified, only, and the INTERLEAVE policy will be able to
> > do the right thing with memoryless nodes.
> > 
> Thank you, I like this work.

Thanks, I hope it is useful :)

> > +extern nodemask_t node_populated_map;
> please add /* node has memory */ here.
> 
> I don't think "populated node" means "node-with-memory" if there is no comments.

Good point, I'll send a small diff for Andrew to pick up when I refresh
the other patches tomorrow.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-12  2:27 ` KAMEZAWA Hiroyuki
  2007-06-12  2:46   ` Nishanth Aravamudan
@ 2007-06-12  2:53   ` Christoph Lameter
  2007-06-12  3:04     ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  2:53 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Nishanth Aravamudan, Lee Schermerhorn, anton, akpm, linux-mm

On Tue, 12 Jun 2007, KAMEZAWA Hiroyuki wrote:

> On Mon, 11 Jun 2007 13:27:28 -0700
> Nishanth Aravamudan <nacc@us.ibm.com>, Lee Schermerhorn <lee.schermerhorn@hp.com> wrote:
> 
> > Split up Lee and Anton's original patch
> > (http://marc.info/?l=linux-mm&m=118133042025995&w=2), to allow for the
> > populated_map changes to go in on their own.
> > 
> > Add a populated_map nodemask to indicate a node has memory or not. We
> > have run into a number of issues (in practice and in code) with
> > assumptions about every node having memory. Having this nodemask allows
> > us to fix these issues; in particular, THISNODE allocations will come
> > from the node specified, only, and the INTERLEAVE policy will be able to
> > do the right thing with memoryless nodes.
> > 
> Thank you, I like this work.
> 
> > +extern nodemask_t node_populated_map;
> please add /* node has memory */ here.
> 
> I don't think "populated node" means "node-with-memory" if there is no comments.

What else could it mean?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  2:32                       ` Nishanth Aravamudan
@ 2007-06-12  2:54                         ` Christoph Lameter
  2007-06-12  3:20                           ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  2:54 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> On 11.06.2007 [19:20:58 -0700], Christoph Lameter wrote:
> > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > 
> > > [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
> > 
> > There is no point in compiling the interleave logic for !NUMA. There
> > needs to be some sort of !NUMA fallback in hugetlb. It would be better
> > to call a interleave function in mempolicy.c that provides an
> > appropriate shim for !NUMA.
> 
> Hrm, if !NUMA, is the nid of the only node guaranteed to be 0? If so, I
> can just

Yes.

> Make alloc_fresh_huge_page() and other generic variants call into the
> _node() versions with nid=0, if !NUMA.
> 
> Would that be ok?

I am not sure what you are up to. Just make sure that the changes are 
minimal. Look in the source code for other examples on how !NUMA 
situations were handled.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  2:34                         ` Nishanth Aravamudan
@ 2007-06-12  2:55                           ` Christoph Lameter
  2007-06-12  3:17                             ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  2:55 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> nid is static to alloc_fresh_huge_page().

Ahh did not see that. Can you not call simply into interleave() from 
mempolicy.c? It will get you the counter that you need.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-12  2:53   ` Christoph Lameter
@ 2007-06-12  3:04     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 140+ messages in thread
From: KAMEZAWA Hiroyuki @ 2007-06-12  3:04 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nishanth Aravamudan, Lee Schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007 19:53:10 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:

> > Thank you, I like this work.
> > 
> > > +extern nodemask_t node_populated_map;
> > please add /* node has memory */ here.
> > 
> > I don't think "populated node" means "node-with-memory" if there is no comments.
> 
> What else could it mean?
> 
"a node has cpu(s) or device(s)" is not populated ?

-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  2:55                           ` Christoph Lameter
@ 2007-06-12  3:17                             ` Nishanth Aravamudan
  2007-06-12  3:19                               ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  3:17 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On 11.06.2007 [19:55:21 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > nid is static to alloc_fresh_huge_page().
> 
> Ahh did not see that. Can you not call simply into interleave() from 
> mempolicy.c? It will get you the counter that you need.

You just told me that mempolicy.c is built conditionally on NUMA.
alloc_fresh_huge_page() is not, it only depends on CONFIG_HUGETLB_PAGE!

The only interleave functions I see in mempolicy.c are:

interleave_nodes(), which takes a mempolicy, which I don't have in
hugetlb.c

interleave_nid(), which also takes a mempolicy

I guess I could try and use huge_zonelist(), but I don't see the point?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:17                             ` Nishanth Aravamudan
@ 2007-06-12  3:19                               ` Christoph Lameter
  2007-06-12  3:30                                 ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  3:19 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> > Ahh did not see that. Can you not call simply into interleave() from 
> > mempolicy.c? It will get you the counter that you need.
> 
> You just told me that mempolicy.c is built conditionally on NUMA.
> alloc_fresh_huge_page() is not, it only depends on CONFIG_HUGETLB_PAGE!

Well you just need to have the appropriate fallbacks defined in 
mempolicy.h

> The only interleave functions I see in mempolicy.c are:
> 
> interleave_nodes(), which takes a mempolicy, which I don't have in
> hugetlb.c
> 
> interleave_nid(), which also takes a mempolicy
> 
> I guess I could try and use huge_zonelist(), but I don't see the point?

Export a function for the interleave functionality so that we do not have 
to replicate the same thing in various locations in the kernel.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  2:54                         ` Christoph Lameter
@ 2007-06-12  3:20                           ` Nishanth Aravamudan
  2007-06-12  3:21                             ` Christoph Lameter
  2007-06-12 15:06                             ` Lee Schermerhorn
  0 siblings, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  3:20 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [19:54:13 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > On 11.06.2007 [19:20:58 -0700], Christoph Lameter wrote:
> > > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > > 
> > > > [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
> > > 
> > > There is no point in compiling the interleave logic for !NUMA.
> > > There needs to be some sort of !NUMA fallback in hugetlb. It would
> > > be better to call a interleave function in mempolicy.c that
> > > provides an appropriate shim for !NUMA.
> > 
> > Hrm, if !NUMA, is the nid of the only node guaranteed to be 0? If so, I
> > can just
> 
> Yes.
> 
> > Make alloc_fresh_huge_page() and other generic variants call into
> > the _node() versions with nid=0, if !NUMA.
> > 
> > Would that be ok?
> 
> I am not sure what you are up to. Just make sure that the changes are
> minimal. Look in the source code for other examples on how !NUMA
> situations were handled.

I swear I'm trying to make the code do the right thing, and understand
the NUMA intricacies better. Sorry for the flood of e-mails and such. I
asked about specific other cases because they are used in !NUMA
situations too and I wasn't sure why node_populated_map should be
different.

But ok, I will rely on the source to be correct and make my changelog
indicate where I got the ideas from.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  3:20                           ` Nishanth Aravamudan
@ 2007-06-12  3:21                             ` Christoph Lameter
  2007-06-12  3:31                               ` Nishanth Aravamudan
  2007-06-12 15:06                             ` Lee Schermerhorn
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  3:21 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> > I am not sure what you are up to. Just make sure that the changes are
> > minimal. Look in the source code for other examples on how !NUMA
> > situations were handled.
> 
> I swear I'm trying to make the code do the right thing, and understand
> the NUMA intricacies better. Sorry for the flood of e-mails and such. I
> asked about specific other cases because they are used in !NUMA
> situations too and I wasn't sure why node_populated_map should be
> different.
> 
> But ok, I will rely on the source to be correct and make my changelog
> indicate where I got the ideas from.

Ok. I just hope this crash course in Linux NUMA is useful and you keep on 
working on NUMA....

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:19                               ` Christoph Lameter
@ 2007-06-12  3:30                                 ` Nishanth Aravamudan
  2007-06-12  3:48                                   ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  3:30 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On 11.06.2007 [20:19:24 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > Ahh did not see that. Can you not call simply into interleave() from 
> > > mempolicy.c? It will get you the counter that you need.
> > 
> > You just told me that mempolicy.c is built conditionally on NUMA.
> > alloc_fresh_huge_page() is not, it only depends on CONFIG_HUGETLB_PAGE!
> 
> Well you just need to have the appropriate fallbacks defined in
> mempolicy.h

Ok, I understand that.

> > The only interleave functions I see in mempolicy.c are:
> > 
> > interleave_nodes(), which takes a mempolicy, which I don't have in
> > hugetlb.c
> > 
> > interleave_nid(), which also takes a mempolicy
> > 
> > I guess I could try and use huge_zonelist(), but I don't see the point?
> 
> Export a function for the interleave functionality so that we do not
> have to replicate the same thing in various locations in the kernel.

But I don't understand this at all.

This is *not* generically available, unless every caller has its own
private static variable. I don't know how to do that in C.

You're asking me to complicate patches that work just fine right now.
Well, excluding a hasty patch that I didn't compile-test. All I'm trying
to do is ask for some guidance.

What we have here is:

alloc_fresh_huge_page() should return 1 after a successful allocation of
a huge page on a different node, in a round-robin fashion, on every
invocation; or 0, if no huge page could be allocated.

I don't see how to make that generic in a simple way. It relies on an
iterator that is private to the function, not to any structure. And
this is really a one-time allocation right now (hugepages).

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  3:21                             ` Christoph Lameter
@ 2007-06-12  3:31                               ` Nishanth Aravamudan
  0 siblings, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  3:31 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [20:21:58 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > I am not sure what you are up to. Just make sure that the changes are
> > > minimal. Look in the source code for other examples on how !NUMA
> > > situations were handled.
> > 
> > I swear I'm trying to make the code do the right thing, and understand
> > the NUMA intricacies better. Sorry for the flood of e-mails and such. I
> > asked about specific other cases because they are used in !NUMA
> > situations too and I wasn't sure why node_populated_map should be
> > different.
> > 
> > But ok, I will rely on the source to be correct and make my changelog
> > indicate where I got the ideas from.
> 
> Ok. I just hope this crash course in Linux NUMA is useful and you keep on 
> working on NUMA....

I'll try, at least.

Thanks for the patience!

-Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  0:15               ` Nishanth Aravamudan
  2007-06-12  0:47                 ` Christoph Lameter
@ 2007-06-12  3:44                 ` William Lee Irwin III
  2007-06-12  3:50                   ` Christoph Lameter
  2007-06-12  5:09                   ` Nishanth Aravamudan
  1 sibling, 2 replies; 140+ messages in thread
From: William Lee Irwin III @ 2007-06-12  3:44 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [16:17:47 -0700], Christoph Lameter wrote:
>> nid == 1 means local node? Or why do we check for nid < 0?
>> 	if (nid == 1)
>> 		 nid = numa_node_id();
>> ?

On Mon, Jun 11, 2007 at 05:15:42PM -0700, Nishanth Aravamudan wrote:
> No, nid is a static variable. So we initialize it to -1 to catch the
> first time we go through the loop.
> IIRC, we can't just set it to first_node(node_populated_map), because
> it's a non-constant or something?

I wrote that, so I figure I should chime in. The static variable can
be killed off outright.

Initially filling the pool doesn't need the static affair. Refilling
the pool from the page allocator can refill the node with the least
memory first, and choose randomly otherwise. Using default mpolicies
or defaulting to node-local memory instead of round-robin allocation
will likely do for callers into the allocator.

It depends a bit on what SGI's app that originally wanted striping of
hugetlb does.

Also, if one has such a large number of nodes that exhaustive search
for the node with the least memory would be prohibitive, esp. when in
a loop, it's always possible to keep node ID's in an array heap-ordered
by the number of pages in the node's segment of the pool. In such a
manner the inner loop's search is limited to O(lg(nr_online_nodes())).

-- wli

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:30                                 ` Nishanth Aravamudan
@ 2007-06-12  3:48                                   ` Christoph Lameter
  2007-06-12  5:07                                     ` Nishanth Aravamudan
  2007-06-12 17:43                                     ` Nishanth Aravamudan
  0 siblings, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  3:48 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> > Export a function for the interleave functionality so that we do not
> > have to replicate the same thing in various locations in the kernel.
> 
> But I don't understand this at all.
> 
> This is *not* generically available, unless every caller has its own
> private static variable. I don't know how to do that in C.

It is already there. Each task has a il_next field in its task struct for 
that purpose.

> You're asking me to complicate patches that work just fine right now.

I am trying to simplify your work.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:44                 ` William Lee Irwin III
@ 2007-06-12  3:50                   ` Christoph Lameter
  2007-06-12  3:53                     ` William Lee Irwin III
  2007-06-12  5:09                   ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  3:50 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Nishanth Aravamudan, lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, William Lee Irwin III wrote:

> I wrote that, so I figure I should chime in. The static variable can
> be killed off outright.

I agree.

> Initially filling the pool doesn't need the static affair. Refilling
> the pool from the page allocator can refill the node with the least
> memory first, and choose randomly otherwise. Using default mpolicies
> or defaulting to node-local memory instead of round-robin allocation
> will likely do for callers into the allocator.

Each task already has a next node field. Just use that.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:50                   ` Christoph Lameter
@ 2007-06-12  3:53                     ` William Lee Irwin III
  2007-06-12  3:53                       ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: William Lee Irwin III @ 2007-06-12  3:53 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nishanth Aravamudan, lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, William Lee Irwin III wrote:
>> Initially filling the pool doesn't need the static affair. Refilling
>> the pool from the page allocator can refill the node with the least
>> memory first, and choose randomly otherwise. Using default mpolicies
>> or defaulting to node-local memory instead of round-robin allocation
>> will likely do for callers into the allocator.

On Mon, Jun 11, 2007 at 08:50:49PM -0700, Christoph Lameter wrote:
> Each task already has a next node field. Just use that.

That's new. It sounds convenient.


-- wli

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:53                     ` William Lee Irwin III
@ 2007-06-12  3:53                       ` Christoph Lameter
  2007-06-12  4:14                         ` William Lee Irwin III
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12  3:53 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Nishanth Aravamudan, lee.schermerhorn, anton, akpm, linux-mm

On Mon, 11 Jun 2007, William Lee Irwin III wrote:

> On Mon, Jun 11, 2007 at 08:50:49PM -0700, Christoph Lameter wrote:
> > Each task already has a next node field. Just use that.
> 
> That's new. It sounds convenient.

No, it's ancient.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:53                       ` Christoph Lameter
@ 2007-06-12  4:14                         ` William Lee Irwin III
  0 siblings, 0 replies; 140+ messages in thread
From: William Lee Irwin III @ 2007-06-12  4:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nishanth Aravamudan, lee.schermerhorn, anton, akpm, linux-mm

On Mon, Jun 11, 2007 at 08:50:49PM -0700, Christoph Lameter wrote:
>>> Each task already has a next node field. Just use that.

On Mon, 11 Jun 2007, William Lee Irwin III wrote:
>> That's new. It sounds convenient.

On Mon, Jun 11, 2007 at 08:53:31PM -0700, Christoph Lameter wrote:
> No, it's ancient.

Heh. It all depends on your view of time. One's point of view tends
toward geologic when 2.4.9 (not a typo) is still current for a number
of one's customers. Not to mention when one maintains code (or attempts
to, however poorly) with open bugs where the last known working
versions are in the 2.0.x and 2.2.x version spaces.

Shiny new code from 2005 can indeed be a breath of fresh air to some.


-- wli

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:48                                   ` Christoph Lameter
@ 2007-06-12  5:07                                     ` Nishanth Aravamudan
  2007-06-12 18:47                                       ` Christoph Lameter
  2007-06-12 17:43                                     ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  5:07 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On 11.06.2007 [20:48:08 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > Export a function for the interleave functionality so that we do not
> > > have to replicate the same thing in various locations in the kernel.
> > 
> > But I don't understand this at all.
> > 
> > This is *not* generically available, unless every caller has its own
> > private static variable. I don't know how to do that in C.
> 
> It is already there. Each task has a il_next field in its task struct
> for that purpose.

Hrm, maybe that will work -- but then it means that if one is
interleaving huge pages, it will interfere with the interleaving of
small pages. Given that right now, huge pages are a rather precious
commodity, do we want this?

> > You're asking me to complicate patches that work just fine right now.
> 
> I am trying to simplify your work.

Sorry, I wasn't trying to sound unappreciative. Your suggestions are
very valuable!

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:44                 ` William Lee Irwin III
  2007-06-12  3:50                   ` Christoph Lameter
@ 2007-06-12  5:09                   ` Nishanth Aravamudan
  2007-06-12  5:15                     ` William Lee Irwin III
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12  5:09 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [20:44:07 -0700], William Lee Irwin III wrote:
> On 11.06.2007 [16:17:47 -0700], Christoph Lameter wrote:
> >> nid == 1 means local node? Or why do we check for nid < 0?
> >> 	if (nid == 1)
> >> 		 nid = numa_node_id();
> >> ?
> 
> On Mon, Jun 11, 2007 at 05:15:42PM -0700, Nishanth Aravamudan wrote:
> > No, nid is a static variable. So we initialize it to -1 to catch the
> > first time we go through the loop.
> > IIRC, we can't just set it to first_node(node_populated_map), because
> > it's a non-constant or something?
> 
> I wrote that, so I figure I should chime in. The static variable can
> be killed off outright.
> 
> Initially filling the pool doesn't need the static affair. Refilling
> the pool from the page allocator can refill the node with the least
> memory first, and choose randomly otherwise. Using default mpolicies
> or defaulting to node-local memory instead of round-robin allocation
> will likely do for callers into the allocator.
> 
> It depends a bit on what SGI's app that originally wanted striping of
> hugetlb does.
> 
> Also, if one has such a large number of nodes that exhaustive search
> for the node with the least memory would be prohibitive, esp. when in
> a loop, it's always possible to keep node ID's in an array
> heap-ordered by the number of pages in the node's segment of the pool.
> In such a manner the inner loop's search is limited to
> O(lg(nr_online_nodes())).

Well, (presuming I understood everything you wrote :), don't we need the
static 'affair' to guarantee the initial allocations are approximately
round-robin? Or, if we aren't going to make that guarantee, then we
should only change that once my sysfs allocator (or its equivalent) is
available?

Just trying to get a handle on what you're suggesting without any
historical context.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  5:09                   ` Nishanth Aravamudan
@ 2007-06-12  5:15                     ` William Lee Irwin III
  2007-06-12 17:36                       ` Nishanth Aravamudan
  2007-06-12 17:45                       ` Nishanth Aravamudan
  0 siblings, 2 replies; 140+ messages in thread
From: William Lee Irwin III @ 2007-06-12  5:15 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On Mon, Jun 11, 2007 at 10:09:10PM -0700, Nishanth Aravamudan wrote:
> Well, (presuming I understood everything you wrote :), don't we need the
> static 'affair' to guarantee the initial allocations are approximately
> round-robin? Or, if we aren't going to make that guarantee, then we
> should only change that once my sysfs allocator (or its equivalent) is
> available?
> Just trying to get a handle on what you're suggesting without any
> historical context.

For initially filling the pool one can just loop over nid's modulo the
number of populated nodes and pass down a stack-allocated variable.


-- wli

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-11 21:25 ` Christoph Lameter
  2007-06-11 22:10   ` [PATCH v2] " Nishanth Aravamudan
@ 2007-06-12 14:10   ` Lee Schermerhorn
  2007-06-12 17:35     ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 14:10 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm, Andi Kleen

On Mon, 2007-06-11 at 14:25 -0700, Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > @@ -2161,7 +2164,7 @@ static int node_order[MAX_NUMNODES];
> >  static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
> >  {
> >  	enum zone_type i;
> > -	int pos, j, node;
> > +	int pos, j;
> >  	int zone_type;		/* needs to be signed */
> >  	struct zone *z;
> >  	struct zonelist *zonelist;
> > @@ -2171,7 +2174,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
> >  		pos = 0;
> >  		for (zone_type = i; zone_type >= 0; zone_type--) {
> >  			for (j = 0; j < nr_nodes; j++) {
> > -				node = node_order[j];
> > +				int node = node_order[j];
> >  				z = &NODE_DATA(node)->node_zones[zone_type];
> >  				if (populated_zone(z)) {
> >  					zonelist->zones[pos++] = z;
> 
> Unrelated modifications.
> 
> > @@ -2244,6 +2247,22 @@ static void set_zonelist_order(void)
> >  		current_zonelist_order = user_zonelist_order;
> >  }
> >  
> > +/*
> > + * setup_populated_map() - record nodes whose "policy_zone" is "on-node".
> > + */
> > +static void setup_populated_map(int nid)
> > +{
> > +	pg_data_t *pgdat = NODE_DATA(nid);
> > +	struct zonelist *zl = pgdat->node_zonelists + policy_zone;
> > +	struct zone *z = zl->zones[0];
> > +
> > +	VM_BUG_ON(!z);
> > +	if (z->zone_pgdat == pgdat)
> > +		node_set_populated(nid);
> > +	else
> > +		node_not_populated(nid);
> > +}
> 
> 
> A node is only populated if it has memory in the policy zone? I would say 
> a node is populated if it has any memory in any zone.

Mea culpa.  Our platforms have a [pseudo-]node with just O(1G) memory
all in zone DMA.  That node can't look populated for allocating huge
pages.

> 
> The above check may fail on x86_64 where only some nodes may have 
> ZONE_NORMAL. Others only have ZONE_DMA32. Policy zone will be set to 
> ZONE_NORMAL.

Yes.  I thought of this after I created the patch.  I've been looking
for a platform with exactly 4GB per node to test on.  I believe that, on
our platforms, all of node zero would be in zone DMA32 and all other
nodes would be > DMA32.  

Maybe we can just exclude zone DMA from the populated map?


> 
> 
> > +
> >  static void build_zonelists(pg_data_t *pgdat)
> >  {
> >  	int j, node, load;
> > @@ -2327,6 +2346,15 @@ static void set_zonelist_order(void)
> >  	current_zonelist_order = ZONELIST_ORDER_ZONE;
> >  }
> >  
> > +/*
> > + * setup_populated_map - non-NUMA case
> > + * Only node 0 should be on-line, and it MUST be populated!
> > + */
> > +static void setup_populated_map(int nid)
> > +{
> > +	node_set_populated(nid);
> > +}
> 
> I'd say provide fallback functions so that node_populated() always returns 
> true for !NUMA. That way it can be optimized out at compile time.
> 
> >  static void build_zonelists(pg_data_t *pgdat)
> >  {
> >  	int node, local_node;
> > @@ -2381,6 +2409,7 @@ static int __build_all_zonelists(void *dummy)
> >  	for_each_online_node(nid) {
> >  		build_zonelists(NODE_DATA(nid));
> >  		build_zonelist_cache(NODE_DATA(nid));
> > +		setup_populated_map(nid);
> >  	}
> 
> Is it possible to move the set_populated_node into build_zonelists 
> somehow?
> 
> F.e. In build_zonelists_node you can check if nr_zones > 0 and then set it 
> up?
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-11 22:42     ` Christoph Lameter
  2007-06-11 22:52       ` [PATCH v3] " Nishanth Aravamudan
@ 2007-06-12 14:19       ` Lee Schermerhorn
  2007-06-12 17:32         ` Nishanth Aravamudan
  2007-06-12 18:45         ` Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 14:19 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nishanth Aravamudan, anton, akpm, linux-mm, Kamezawa Hiroyuki

On Mon, 2007-06-11 at 15:42 -0700, Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Already done in the original patch (node_populated() returns (node == 0)
> > if MAX_NUMNODES <= 1), I think.
> 
> Ah good.
> 
> > @@ -2299,6 +2303,18 @@ static void build_zonelists(pg_data_t *pgdat)
> >  		/* calculate node order -- i.e., DMA last! */
> >  		build_zonelists_in_zone_order(pgdat, j);
> >  	}
> > +
> > +	/*
> > +	 * record nodes whose first fallback zone is "on-node" as
> > +	 * populated
> > +	 */
> > +	z = pgdat->node_zonelists->zones[0];
> > +
> > +	VM_BUG_ON(!z);
> > +	if (z->zone_pgdat == pgdat)
> > +		node_set_populated(local_node);
> > +	else
> > +		node_not_populated(local_node);
> >  }
> >  
> >  /* Construct the zonelist performance cache - see further mmzone.h */
> > 
> 
> Could be much simpler:
> 
> if (pgdat->node_present_pages)
> 	node_set_populated(local_node);

As a minimum, we need to exclude a node with only zone DMA memory for
this to work on our platforms.  For that, I think the current code is
the simplest because we still need to check if the first zone is
"on-node" and !DMA.

And, I think we need both cases--set and reset populated map bit--to
handle memory/node hotplug.  So something like:

	if (z->zone_pgdat == pgdat && !is_zone_dma(z))
		node_set_populated(local_node);
	else
		node_not_populated(local_node);

Need to define 'is_zone_dma()' to test the zone or unconditionally
return false depending on whether ZONE_DMA is configured.


I will repost Nish's repost to "fix" this.

Lee
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-11 23:17             ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Christoph Lameter
  2007-06-12  0:15               ` Nishanth Aravamudan
@ 2007-06-12 14:28               ` Lee Schermerhorn
  1 sibling, 0 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 14:28 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm, wli

On Mon, 2007-06-11 at 16:17 -0700, Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > +	if (nid < 0)
> > +		nid = first_node(node_populated_map);
> 
> nid == 1 means local node? Or why do we check for nid < 0?
> 
> 	if (nid == 1)
> 		 nid = numa_node_id();

That's not what it's doing.  alloc_fresh_huge_page() is an incremental
allocator.  Keeps track of where it left off using a static variable.
Because I changed it to scan a node map [the populated map], I needed to
fetch the "first_node()" the first time it's called.  Thus the initial
value of -1.  Thereafter, alloc_fresh_huge_page() just cycles around the
populated map.

> 
> ?
> > +	do {
> > +		page = alloc_pages_node(nid,
> > +				GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
> > +				HUGETLB_PAGE_ORDER);
> > +		nid = next_node(nid, node_populated_map);
> > +		if (nid >= nr_node_ids)
> > +			nid = first_node(node_populated_map);
> > +	} while (!page && nid != start_nid);
> 
> Looks good.
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12  3:20                           ` Nishanth Aravamudan
  2007-06-12  3:21                             ` Christoph Lameter
@ 2007-06-12 15:06                             ` Lee Schermerhorn
  2007-06-12 17:28                               ` Nishanth Aravamudan
  2007-06-12 18:41                               ` Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 15:06 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Christoph Lameter, anton, akpm, linux-mm

On Mon, 2007-06-11 at 20:20 -0700, Nishanth Aravamudan wrote:
> On 11.06.2007 [19:54:13 -0700], Christoph Lameter wrote:
> > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > 
> > > On 11.06.2007 [19:20:58 -0700], Christoph Lameter wrote:
> > > > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > > > 
> > > > > [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
> > > > 
> > > > There is no point in compiling the interleave logic for !NUMA.
> > > > There needs to be some sort of !NUMA fallback in hugetlb. It would
> > > > be better to call a interleave function in mempolicy.c that
> > > > provides an appropriate shim for !NUMA.
> > > 
> > > Hrm, if !NUMA, is the nid of the only node guaranteed to be 0? If so, I
> > > can just
> > 
> > Yes.
> > 
> > > Make alloc_fresh_huge_page() and other generic variants call into
> > > the _node() versions with nid=0, if !NUMA.
> > > 
> > > Would that be ok?
> > 
> > I am not sure what you are up to. Just make sure that the changes are
> > minimal. Look in the source code for other examples on how !NUMA
> > situations were handled.
> 
> I swear I'm trying to make the code do the right thing, and understand
> the NUMA intricacies better. Sorry for the flood of e-mails and such. I
> asked about specific other cases because they are used in !NUMA
> situations too and I wasn't sure why node_populated_map should be
> different.
> 
> But ok, I will rely on the source to be correct and make my changelog
> indicate where I got the ideas from.

Nish:  when this all settles down, I still need to make sure it works on
our platforms with the funny DMA-only node.  What that comes down to is
that when alloc_fresh_huge_page() calls:

		page = alloc_pages_node(nid,
                               GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
                               HUGETLB_PAGE_ORDER);

I need to get a page that is on nid.  On our platform, GFP_HIGHUSER is
going to specify the zonelist for ZONE_NORMAL.  The first zone on this
list needs to be on-node for nid.  With the changes you've made to the
definition of populated map, I think this won't be the case.  I need to
test your latest patches and fix that, if it's broken.

I still think using policy zone is the "right way" to go, here.  After
all, only pages in the policy zone are controlled by policy, and that's
the goal of spreading out the huge pages across nodes--to make them
available to satisfy memory policy at allocation time.  But that would
need some adjustments for x86_64 systems that have some nodes that are
all/mostly DMA32 and other nodes that are populated in zones > DMA32, if
we want to allocate huge pages out of the DMA32 zone.   

As far as the static variable, and round-robin allocation:  the current
method "works" both for huge pages allocated at boot time and for huge
pages allocated at run-time via the vm.nr_hugepages sysctl.  By "works",
I mean that it continues to spread the pages evenly across the
"populated" nodes.  If, however, you use the task local counter to
interleave fresh huge pages, each write to the nr_hugepages from a
different task ["echo NN >.../nr_hugepages"] will start at node zero or
the first populated node--assuming you're interleaving across populated
nodes and not on-line nodes.  That's probably OK if you always change
nr_hugepages by a multiple of the number of populated nodes.  And, if
things get out of balance, we'll have your per node attribute, I hope,
to adjust any individual node.

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 15:06                             ` Lee Schermerhorn
@ 2007-06-12 17:28                               ` Nishanth Aravamudan
  2007-06-12 18:43                                 ` Christoph Lameter
  2007-06-12 18:48                                 ` Lee Schermerhorn
  2007-06-12 18:41                               ` Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 17:28 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Christoph Lameter, anton, akpm, linux-mm

On 12.06.2007 [11:06:22 -0400], Lee Schermerhorn wrote:
> On Mon, 2007-06-11 at 20:20 -0700, Nishanth Aravamudan wrote:
> > On 11.06.2007 [19:54:13 -0700], Christoph Lameter wrote:
> > > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > > 
> > > > On 11.06.2007 [19:20:58 -0700], Christoph Lameter wrote:
> > > > > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > > > > 
> > > > > > [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
> > > > > 
> > > > > There is no point in compiling the interleave logic for !NUMA.
> > > > > There needs to be some sort of !NUMA fallback in hugetlb. It would
> > > > > be better to call a interleave function in mempolicy.c that
> > > > > provides an appropriate shim for !NUMA.
> > > > 
> > > > Hrm, if !NUMA, is the nid of the only node guaranteed to be 0? If so, I
> > > > can just
> > > 
> > > Yes.
> > > 
> > > > Make alloc_fresh_huge_page() and other generic variants call into
> > > > the _node() versions with nid=0, if !NUMA.
> > > > 
> > > > Would that be ok?
> > > 
> > > I am not sure what you are up to. Just make sure that the changes are
> > > minimal. Look in the source code for other examples on how !NUMA
> > > situations were handled.
> > 
> > I swear I'm trying to make the code do the right thing, and understand
> > the NUMA intricacies better. Sorry for the flood of e-mails and such. I
> > asked about specific other cases because they are used in !NUMA
> > situations too and I wasn't sure why node_populated_map should be
> > different.
> > 
> > But ok, I will rely on the source to be correct and make my changelog
> > indicate where I got the ideas from.
> 
> Nish:  when this all settles down, I still need to make sure it works
> on our platforms with the funny DMA-only node.  What that comes down
> to is that when alloc_fresh_huge_page() calls:

Ok, thanks for these details.

Would you be ok with stabilizing the generic definition of
node_populated_map as is (any present pages, regardless of location),
and then trying to figure out how to get your platform to work with
that?

> 		page = alloc_pages_node(nid,
>                                GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
>                                HUGETLB_PAGE_ORDER);
> 
> I need to get a page that is on nid.  On our platform, GFP_HIGHUSER is
> going to specify the zonelist for ZONE_NORMAL.  The first zone on this
> list needs to be on-node for nid.  With the changes you've made to the
> definition of populated map, I think this won't be the case.  I need
> to test your latest patches and fix that, if it's broken.

Ok. But that means your platform is broken now too, right? As in, it's
not a regression, per se?

I'm much more concerned in the short term about the whole
memoryless-node issue, which I think is more straight-forward, and
generic to fix.

> I still think using policy zone is the "right way" to go, here.  After
> all, only pages in the policy zone are controlled by policy, and
> that's the goal of spreading out the huge pages across nodes--to make
> them available to satisfy memory policy at allocation time.  But that
> would need some adjustments for x86_64 systems that have some nodes
> that are all/mostly DMA32 and other nodes that are populated in zones
> > DMA32, if we want to allocate huge pages out of the DMA32 zone.   

Well, as of right now, I'm *only* trying to deal with memoryless nodes.
So then this whole notion of policy_zone is relatively moot. It matters
for your platform, I understand, but I think the fix there is more
complex and probably should be stacked on the current set, once it is
stabilized.

> As far as the static variable, and round-robin allocation:  the current
> method "works" both for huge pages allocated at boot time and for huge
> pages allocated at run-time via the vm.nr_hugepages sysctl.  By "works",
> I mean that it continues to spread the pages evenly across the
> "populated" nodes.  If, however, you use the task local counter to
> interleave fresh huge pages, each write to the nr_hugepages from a
> different task ["echo NN >.../nr_hugepages"] will start at node zero or
> the first populated node--assuming you're interleaving across populated
> nodes and not on-line nodes.  That's probably OK if you always change
> nr_hugepages by a multiple of the number of populated nodes.  And, if
> things get out of balance, we'll have your per node attribute, I hope,
> to adjust any individual node.

Yes, I will reply about the il_next thing in a sec. Maybe Christoph has
some cleverness.

And yes, I think the per-node attribute will fix most of the interface
problems for 'odd' NUMA systems.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 14:19       ` [PATCH v2] Add populated_map to account for " Lee Schermerhorn
@ 2007-06-12 17:32         ` Nishanth Aravamudan
  2007-06-12 18:45         ` Christoph Lameter
  1 sibling, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 17:32 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Christoph Lameter, anton, akpm, linux-mm, Kamezawa Hiroyuki

On 12.06.2007 [10:19:00 -0400], Lee Schermerhorn wrote:
> On Mon, 2007-06-11 at 15:42 -0700, Christoph Lameter wrote:
> > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > 
> > > Already done in the original patch (node_populated() returns (node == 0)
> > > if MAX_NUMNODES <= 1), I think.
> > 
> > Ah good.
> > 
> > > @@ -2299,6 +2303,18 @@ static void build_zonelists(pg_data_t *pgdat)
> > >  		/* calculate node order -- i.e., DMA last! */
> > >  		build_zonelists_in_zone_order(pgdat, j);
> > >  	}
> > > +
> > > +	/*
> > > +	 * record nodes whose first fallback zone is "on-node" as
> > > +	 * populated
> > > +	 */
> > > +	z = pgdat->node_zonelists->zones[0];
> > > +
> > > +	VM_BUG_ON(!z);
> > > +	if (z->zone_pgdat == pgdat)
> > > +		node_set_populated(local_node);
> > > +	else
> > > +		node_not_populated(local_node);
> > >  }
> > >  
> > >  /* Construct the zonelist performance cache - see further mmzone.h */
> > > 
> > 
> > Could be much simpler:
> > 
> > if (pgdat->node_present_pages)
> > 	node_set_populated(local_node);
> 
> As a minimum, we need to exclude a node with only zone DMA memory for
> this to work on our platforms.  For that, I think the current code is
> the simplest because we still need to check if the first zone is
> "on-node" and !DMA.
> 
> And, I think we need both cases--set and reset populated map bit--to
> handle memory/node hotplug.  So something like:

That's a good point -- build_zonelists() will get called for the
rebuild, but won't remove nodes from the populated_map. Admittedly, only
hot-add is currently supported, right?

> 	if (z->zone_pgdat == pgdat && !is_zone_dma(z))
> 		node_set_populated(local_node);
> 	else
> 		node_not_populated(local_node);

Hrm, but then node_populated == node has non-DMA pages, which is
altogether unintuitive. Again, I think this obfuscates things -- perhaps
the map should be renamed to something closer to what you actually want
it to represent?

> Need to define 'is_zone_dma()' to test the zone or unconditionally
> return false depending on whether ZONE_DMA is configured.

@Andrew: would you be ok dropping the populated_map patches while I hammer
out whether it's what we want with Lee; and decide whether the fix-patch
on top is needed, as well, based on Christoph's feedback?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-12 14:10   ` [PATCH] " Lee Schermerhorn
@ 2007-06-12 17:35     ` Nishanth Aravamudan
  2007-06-12 18:39       ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 17:35 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Christoph Lameter, anton, akpm, linux-mm, Andi Kleen

On 12.06.2007 [10:10:33 -0400], Lee Schermerhorn wrote:
> On Mon, 2007-06-11 at 14:25 -0700, Christoph Lameter wrote:
> > On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> > 
> > > @@ -2161,7 +2164,7 @@ static int node_order[MAX_NUMNODES];
> > >  static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
> > >  {
> > >  	enum zone_type i;
> > > -	int pos, j, node;
> > > +	int pos, j;
> > >  	int zone_type;		/* needs to be signed */
> > >  	struct zone *z;
> > >  	struct zonelist *zonelist;
> > > @@ -2171,7 +2174,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
> > >  		pos = 0;
> > >  		for (zone_type = i; zone_type >= 0; zone_type--) {
> > >  			for (j = 0; j < nr_nodes; j++) {
> > > -				node = node_order[j];
> > > +				int node = node_order[j];
> > >  				z = &NODE_DATA(node)->node_zones[zone_type];
> > >  				if (populated_zone(z)) {
> > >  					zonelist->zones[pos++] = z;
> > 
> > Unrelated modifications.
> > 
> > > @@ -2244,6 +2247,22 @@ static void set_zonelist_order(void)
> > >  		current_zonelist_order = user_zonelist_order;
> > >  }
> > >  
> > > +/*
> > > + * setup_populate_map() - record nodes whose "policy_zone" is "on-node".
> > > + */
> > > +static void setup_populated_map(int nid)
> > > +{
> > > +	pg_data_t *pgdat = NODE_DATA(nid);
> > > +	struct zonelist *zl = pgdat->node_zonelists + policy_zone;
> > > +	struct zone *z = zl->zones[0];
> > > +
> > > +	VM_BUG_ON(!z);
> > > +	if (z->zone_pgdat == pgdat)
> > > +		node_set_populated(nid);
> > > +	else
> > > +		node_not_populated(nid);
> > > +}
> > 
> > 
> > A node is only populated if it has memory in the policy zone? I
> > would say a node is populated if it has any memory in any zone.
> 
> Mea culpa.  Our platforms have a [pseudo-]node with just O(1G) memory
> all in zone DMA.  That node can't look populated for allocating huge
> pages.

Because you don't want to use up any of the DMA pages, right? That seems
*very* platform specific. And it doesn't seem right to make common code
more complicated for one platform. Maybe there isn't a better solution,
but I'd like to mull it over.

> > The above check may fail on x86_64 where only some nodes may have 
> > ZONE_NORMAL. Others only have ZONE_DMA32. Policy zone will be set to 
> > ZONE_NORMAL.
> 
> Yes.  I thought of this after I created the patch.  I've been looking
> for a platform with exactly 4GB per node to test on.  I believe that,
> on our platforms, all of node zero would be in zone DMA32 and all
> other nodes would be > DMA32.  
> 
> Maybe we can just exclude zone DMA from the populated map?

Maybe I don't know enough about NUMA and such, but I'm not sure I
understand how this would make it a populated map anymore?

Maybe we need two maps, really?

One is for nodes that have memory, period (pages_present) ==
populated_map as currently implemented.

Another is for nodes that can satisfy hugepage allocations
(policy_zone?) (a subset of the populated nodes).

That may solve both the memoryless nodes problem and your platform's
problem?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  5:15                     ` William Lee Irwin III
@ 2007-06-12 17:36                       ` Nishanth Aravamudan
  2007-06-12 18:50                         ` Christoph Lameter
  2007-06-12 17:45                       ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 17:36 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [22:15:12 -0700], William Lee Irwin III wrote:
> On Mon, Jun 11, 2007 at 10:09:10PM -0700, Nishanth Aravamudan wrote:
> > Well, (presuming I understood everything you wrote :), don't we need the
> > static 'affair' to guarantee the initial allocations are approximately
> > round-robin? Or, if we aren't going to make that guarantee, then we
> > should only change that once my sysfs allocator (or its equivalent) is
> > available?
> > Just trying to get a handle on what you're suggesting without any
> > historical context.
> 
> For initially filling the pool one can just loop over nid's modulo the
> number of populated nodes and pass down a stack-allocated variable.

Ok, I'll play with that a bit.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  3:48                                   ` Christoph Lameter
  2007-06-12  5:07                                     ` Nishanth Aravamudan
@ 2007-06-12 17:43                                     ` Nishanth Aravamudan
  2007-06-12 18:49                                       ` Christoph Lameter
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 17:43 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-mm

On 11.06.2007 [20:48:08 -0700], Christoph Lameter wrote:
> On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > Export a function for the interleave functionality so that we do not
> > > have to replicate the same thing in various locations in the kernel.
> > 
> > But I don't understand this at all.
> > 
> > This is *not* generically available, unless every caller has its own
> > private static variable. I don't know how to do that in C.
> 
> It is already there. Each task has a il_next field in its task struct
> for that purpose.

Ok, I see that. And it represents the next node to use for an interleaved
allocation. Makes sense to me, and I see how it's used in mempolicy.c to
achieve that. But we're running at system boot time, or whenever someone
invokes the sysctl /proc/sys/vm/nr_hugepages. Do we really want to muck
with some arbitrary bash shell's il_next field to achieve interleaving?
What if it's a C process that is trying to achieve actual interleaving
for other purposes and also allocates some hugepages on the system? It
seems like il_next is very much a process-related field.

When I wrote "caller", I meant calling function, sorry, not calling
process.

I'm not entirely sure how il_next is useful here, sorry.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  5:15                     ` William Lee Irwin III
  2007-06-12 17:36                       ` Nishanth Aravamudan
@ 2007-06-12 17:45                       ` Nishanth Aravamudan
  2007-06-12 19:13                         ` William Lee Irwin III
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 17:45 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [22:15:12 -0700], William Lee Irwin III wrote:
> On Mon, Jun 11, 2007 at 10:09:10PM -0700, Nishanth Aravamudan wrote:
> > Well, (presuming I understood everything you wrote :), don't we need the
> > static 'affair' to guarantee the initial allocations are approximately
> > round-robin? Or, if we aren't going to make that guarantee, then we
> > should only change that once my sysfs allocator (or its equivalent) is
> > available?
> > Just trying to get a handle on what you're suggesting without any
> > historical context.
> 
> For initially filling the pool one can just loop over nid's modulo the
> number of populated nodes and pass down a stack-allocated variable.

But how does one differentiate between "initially filling" the pool and a
later attempt to add to the pool (or even just marginally later)?

I guess I don't see why folks are so against this static variable :) It
does the job and removing it seems like it could be an independent
cleanup?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-12 17:35     ` Nishanth Aravamudan
@ 2007-06-12 18:39       ` Christoph Lameter
  2007-06-12 18:54         ` Lee Schermerhorn
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 18:39 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Lee Schermerhorn, anton, akpm, linux-mm, Andi Kleen

On Tue, 12 Jun 2007, Nishanth Aravamudan wrote:

> > Mea culpa.  Our platforms have a [pseudo-]node with just O(1G) memory
> > all in zone DMA.  That node can't look populated for allocating huge
> > pages.
> 
> Because you don't want to use up any of the DMA pages, right? That seems
> *very* platform specific. And it doesn't seem right to make common code
> more complicated for one platform. Maybe there isn't a better solution,
> but I'd like to mull it over.

Right. Please Lee be generic and avoid the exceptional cases.

> > Maybe we can just exclude zone DMA from the populated map?
> 
> Maybe I don't know enough about NUMA and such, but I'm not sure I
> understand how this would make it a populated map anymore?
> 
> Maybe we need two maps, really?

No need. If you want to exclude a node from huge pages then you need 
to use the patch that allows per node huge page specifications and set 
the number of huge pages for that node to zero.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 15:06                             ` Lee Schermerhorn
  2007-06-12 17:28                               ` Nishanth Aravamudan
@ 2007-06-12 18:41                               ` Christoph Lameter
  2007-06-12 19:07                                 ` Lee Schermerhorn
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 18:41 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Tue, 12 Jun 2007, Lee Schermerhorn wrote:

> 		page = alloc_pages_node(nid,
>                                GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
>                                HUGETLB_PAGE_ORDER);
> 
> I need to get a page that is on nid.  On our platform, GFP_HIGHUSER is
> going to specify the zonelist for ZONE_NORMAL.  The first zone on this
> list needs to be on-node for nid.  With the changes you've made to the
> definition of populated map, I think this won't be the case.  I need to
> test your latest patches and fix that, if it's broken.

Yes, that is the intent of the fixes.

> I still think using policy zone is the "right way" to go, here.  After
> all, only pages in the policy zone are controlled by policy, and that's
> the goal of spreading out the huge pages across nodes--to make them
> available to satisfy memory policy at allocation time.  But that would
> need some adjustments for x86_64 systems that have some nodes that are
> all/mostly DMA32 and other nodes that are populated in zones > DMA32, if
> we want to allocate huge pages out of the DMA32 zone.   

GFP_THISNODE will work right for that case if we get the intended fix in.

> 
> As far as the static variable, and round-robin allocation:  the current
> method "works" both for huge pages allocated at boot time and for huge
> pages allocated at run-time via the vm.nr_hugepages sysctl.  By "works",
> I mean that it continues to spread the pages evenly across the
> "populated" nodes.  If, however, you use the task local counter to
> interleave fresh huge pages, each write to the nr_hugepages from a
> different task ["echo NN >.../nr_hugepages"] will start at node zero or
> the first populated node--assuming you're interleaving across populated
> nodes and not on-line nodes.  That's probably OK if you always change

We may want to change that behavior. Interleave should start at the local 
node and then proceed from there. If there are just a few pages needed 
then they would be better placed local to the process.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 17:28                               ` Nishanth Aravamudan
@ 2007-06-12 18:43                                 ` Christoph Lameter
  2007-06-12 18:48                                 ` Lee Schermerhorn
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 18:43 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On Tue, 12 Jun 2007, Nishanth Aravamudan wrote:

> And yes, I think the per-node attribute will fix most of the interface
> problems for 'odd' NUMA systems.

Right. The definition of populate_node needs to be clear. We agreed that 
it is a node with memory. If we want to add further restrictions on the 
type of memory being used on the node then we can add further maps if 
necessary or have the subsystem manage a nodelist of allowed nodes.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 14:19       ` [PATCH v2] Add populated_map to account for " Lee Schermerhorn
  2007-06-12 17:32         ` Nishanth Aravamudan
@ 2007-06-12 18:45         ` Christoph Lameter
  2007-06-12 19:17           ` Lee Schermerhorn
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 18:45 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Nishanth Aravamudan, anton, akpm, linux-mm, Kamezawa Hiroyuki

On Tue, 12 Jun 2007, Lee Schermerhorn wrote:

> > Could be much simpler:
> > 
> > if (pgdat->node_present_pages)
> > 	node_set_populated(local_node);
> 
> As a minimum, we need to exclude a node with only zone DMA memory for
> this to work on our platforms.  For that, I think the current code is
> the simplest because we still need to check if the first zone is
> "on-node" and !DMA.

You are changing the definition of populated node.

> And, I think we need both cases--set and reset populated map bit--to
> handle memory/node hotplug.  So something like:

Yes memory unplug will need to clear the bit if a complete node is
cleared. But we do not support node unplug yet. So it is okay for now and 
it is doubtful that the build_zonelist function is going to be called for 
the node that is being removed.

> Need to define 'is_zone_dma()' to test the zone or unconditionally
> return false depending on whether ZONE_DMA is configured.

CONFIG_ZONE_DMA already exists.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12  5:07                                     ` Nishanth Aravamudan
@ 2007-06-12 18:47                                       ` Christoph Lameter
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 18:47 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: linux-mm

On Mon, 11 Jun 2007, Nishanth Aravamudan wrote:

> Hrm, maybe that will work -- but then it means that if one is
> interleaving huge pages, it will interfere with the interleaving of
> small pages. Given that right now, huge pages are a rather precious
> commodity, do we want this?

The number of pages interleaved for small pages is quite high. So there
will not be a significant effect. If we use this counter then we can
fall back on existing functionality in the memory policy subsystem.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 17:28                               ` Nishanth Aravamudan
  2007-06-12 18:43                                 ` Christoph Lameter
@ 2007-06-12 18:48                                 ` Lee Schermerhorn
  2007-06-12 18:51                                   ` Christoph Lameter
  1 sibling, 1 reply; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 18:48 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Christoph Lameter, anton, akpm, linux-mm

On Tue, 2007-06-12 at 10:28 -0700, Nishanth Aravamudan wrote:
> On 12.06.2007 [11:06:22 -0400], Lee Schermerhorn wrote:
<snip>
> > 
> > Nish:  when this all settles down, I still need to make sure it works
> > on our platforms with the funny DMA-only node.  What that comes down
> > to is that when alloc_fresh_huge_page() calls:
> 
> Ok, thanks for these details.
> 
> Would you be ok with stabilizing the generic definition of
> node_populated_map as is (any present pages, regardless of location),
> and then trying to figure out how to get your platform to work with
> that?

Yeah, I think that's my only option now that node_populated_map is being
used for other things than huge page setup.

> 
> > 		page = alloc_pages_node(nid,
> >                                GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
> >                                HUGETLB_PAGE_ORDER);
> > 
> > I need to get a page that is on nid.  On our platform, GFP_HIGHUSER is
> > going to specify the zonelist for ZONE_NORMAL.  The first zone on this
> > list needs to be on-node for nid.  With the changes you've made to the
> > definition of populated map, I think this won't be the case.  I need
> > to test your latest patches and fix that, if it's broken.
> 
> Ok. But that means your platform is broken now too, right? As in, it's
> not a regression, per se?

Well, my patch [v4] fixed it on my platform.  So this is a regression
relative to my patch.  But, then, my patch had an issue with an x86_64
system where one node is all/mostly DMA32 and other nodes have memory in
higher zones.  Maybe that's OK [or not] for hugepage allocation, but
almost certainly not for regular page interleaving, ...

> 
> I'm much more concerned in the short term about the whole
> memoryless-node issue, which I think is more straight-forward, and
> generic to fix.

Perhaps, but I think we're still going to get off node allocations with
the revised definition of the populated map and the new zonelist
ordering.  I think we'll need to check for and reject off-node
allocations when '_THISNODE is specified.  We can't assume that the
first zone in a node's zonelist for a given gfp_zone is on-node.

[more in response to other mail...]

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12 17:43                                     ` Nishanth Aravamudan
@ 2007-06-12 18:49                                       ` Christoph Lameter
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 18:49 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: linux-mm

On Tue, 12 Jun 2007, Nishanth Aravamudan wrote:

> Ok, I see that. And it represents the next node to use for an interleaved
> allocation. Makes sense to me, and I see how it's used in mempolicy.c to
> achieve that. But we're running at system boot time, or whenever someone

At boot time the init_task is running and you can effectively use a global
variable like you have now.

> invokes the sysctl /proc/sys/vm/nr_hugepages. Do we really want to muck
> with some arbitrary bash shell's il_next field to achieve interleaving?
> What if it's a C process that is trying to achieve actual interleaving
> for other purposes and also allocates some hugepages on the system? It
> seems like il_next is very much a process-related field.

il_next is process related. Mucking around is what it was put there for.
The bash process won't be hurt by changing its il_next field.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12 17:36                       ` Nishanth Aravamudan
@ 2007-06-12 18:50                         ` Christoph Lameter
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 18:50 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: William Lee Irwin III, lee.schermerhorn, anton, akpm, linux-mm

On Tue, 12 Jun 2007, Nishanth Aravamudan wrote:

> > For initially filling the pool one can just loop over nid's modulo the
> > number of populated nodes and pass down a stack-allocated variable.
> 
> Ok, I'll play with that a bit.

That would work too but then you need to write your own interleave 
function.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 18:48                                 ` Lee Schermerhorn
@ 2007-06-12 18:51                                   ` Christoph Lameter
  2007-06-12 19:44                                     ` Lee Schermerhorn
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 18:51 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Tue, 12 Jun 2007, Lee Schermerhorn wrote:

> Well, my patch [v4] fixed it on my platform.  So this is a regression
> relative to my patch.  But, then, my patch had an issue with an x86_64
> system where one node is all/mostly DMA32 and other nodes have memory in
> higher zones.  Maybe that's OK [or not] for hugepage allocation, but
> almost certainly not for regular page interleaving, ...

Well this means your patch was arch specific.

> > I'm much more concerned in the short term about the whole
> > memoryless-node issue, which I think is more straight-forward, and
> > generic to fix.
> 
> Perhaps, but I think we're still going to get off node allocations with
> the revised definition of the populated map and the new zonelist
> ordering.  I think we'll need to check for and reject off-node
> allocations when '_THISNODE is specified.  We can't assume that the
> first zone in a node's zonelist for a given gfp_zone is on-node.

We do not do that anymore. GFP_THISNODE guarantees the allocation on 
the node with alloc_pages_node. Read on.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-12 18:39       ` Christoph Lameter
@ 2007-06-12 18:54         ` Lee Schermerhorn
  2007-06-12 19:00           ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 18:54 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm, Andi Kleen

On Tue, 2007-06-12 at 11:39 -0700, Christoph Lameter wrote:
> On Tue, 12 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > Mea culpa.  Our platforms have a [pseudo-]node with just O(1G) memory
> > > all in zone DMA.  That node can't look populated for allocating huge
> > > pages.
> > 
> > Because you don't want to use up any of the DMA pages, right? That seems
> > *very* platform specific. And it doesn't seem right to make common code
> > more complicated for one platform. Maybe there isn't a better solution,
> > but I'd like to mull it over.
> 
> Right. Please Lee be generic and avoid the exceptional cases.

I was trying to be generic.  But it broke for the exceptional case of an
x86_64 with all/mostly DMA32 in one node and higher zone memory in other
nodes.  

> 
> > > Maybe we can just exclude zone DMA from the populated map?
> > 
> > Maybe I don't know enough about NUMA and such, but I'm not sure I
> > understand how this would make it a populated map anymore?
> > 
> > Maybe we need two maps, really?
> 
> No need. If you want to exclude a node from huge pages then you need 
> to use the patch that allows per node huge page specifications and set 
> the number of huge pages for that node to zero.


Perhaps.  But, be aware that allocating pages via the 'hugepages' boot
parameter or the vm.nr_hugepages sysctl won't spread pages evenly--on
our platforms, anyway--if we don't get this right.  From what I've seen
in the mailing lists, this approach [fixing it up with the per node
attributes] runs counter to the general approach of having the kernel
figure it out.  

So, I'll wait for this to settle down.  Then I'll see how it works on
our platforms and propose whatever generic fixes I can to make it work.

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] Add populated_map to account for memoryless nodes
  2007-06-12 18:54         ` Lee Schermerhorn
@ 2007-06-12 19:00           ` Christoph Lameter
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 19:00 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm, Andi Kleen

On Tue, 12 Jun 2007, Lee Schermerhorn wrote:

> Perhaps.  But, be aware that allocating pages via the 'hugepages' boot
> parameter or the vm.nr_hugepages sysctl won't spread pages evenly--on
> our platforms, anyway--if we don't get this right.  From what I've seen
> in the mailing lists, this approach [fixing it up with the per node
> attributes] runs counter to the general approach of having the kernel
> figure it out.  

Hmm.. shmem does the same with the boot parameter there?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 18:41                               ` Christoph Lameter
@ 2007-06-12 19:07                                 ` Lee Schermerhorn
  2007-06-12 19:13                                   ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 19:07 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Tue, 2007-06-12 at 11:41 -0700, Christoph Lameter wrote:
> On Tue, 12 Jun 2007, Lee Schermerhorn wrote:
> 
> > 		page = alloc_pages_node(nid,
> >                                GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
> >                                HUGETLB_PAGE_ORDER);
> > 
> > I need to get a page that is on nid.  On our platform, GFP_HIGHUSER is
> > going to specify the zonelist for ZONE_NORMAL.  The first zone on this
> > list needs to be on-node for nid.  With the changes you've made to the
> > definition of populated map, I think this won't be the case.  I need to
> > test your latest patches and fix that, if it's broken.
> 
> Yes that is the intent of the fixes.
> 
> > I still think using policy zone is the "right way" to go, here.  After
> > all, only pages in the policy zone are controlled by policy, and that's
> > the goal of spreading out the huge pages across nodes--to make them
> > available to satisfy memory policy at allocation time.  But that would
> > need some adjustments for x86_64 systems that have some nodes that are
> > all/mostly DMA32 and other nodes that are populated in zones > DMA32, if
> > we want to allocate huge pages out of the DMA32 zone.   
> 
> GFP_THISNODE will work right for that case if we get the intended fix in.

OK.  So, allocations with 'THISNODE will ensure that we don't get an
off-node page if any zone in the zonelist indicated by the gfp zone
happens to point off-node?  That wasn't the case previously, and that's
why I created the populated map with the semantics I did.  I agree it's
better to have alloc_page_*() handle this.

> 
> > 
> > As far as the static variable, and round-robin allocation:  the current
> > method "works" both for huge pages allocated at boot time and for huge
> > pages allocated at run-time via the vm.nr_hugepages sysctl.  By "works",
> > I mean that it continues to spread the pages evenly across the
> > "populated" nodes.  If, however, you use the task local counter to
> > interleave fresh huge pages, each write to the nr_hugepages from a
> > different task ["echo NN >.../nr_hugepages"] will start at node zero or
> > the first populated node--assuming you're interleaving across populated
> > nodes and not on-line nodes.  That's probably OK if you always change
> 
> We may want to change that behavior. Interleave should start at the local 
> node and then proceed from there. If there are just a few pages needed 
> then they would be better placed local to the process.

For page interleaving for some memory object, I agree.  The usage here
was for the allocation of reserved huge pages--trying to spread those
evenly across nodes with appropriate [non-DMA] memory.   Then, normal
interleaving will work as huge pages are allocated from the per node
reserved lists until some node's huge pages are exhausted.   

I think that using a local "cursor", as you propose, will work,
tho'--even for spreading huge page allocations for the reserved lists.
We may tend to favor low order nodes if one incrementally increases
nr_hugepages via the sysctl.  But, I don't think that's too regular an
occurrence.  I'm not sure Nish can use the mempolicy huge page
interleaving allocator, tho'  That allocates FROM the per node reserved
lists, and alloc_fresh_huge_page[_node]() is used to fill those lists.  

Lee



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 19:07                                 ` Lee Schermerhorn
@ 2007-06-12 19:13                                   ` Christoph Lameter
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 19:13 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Tue, 12 Jun 2007, Lee Schermerhorn wrote:

> I think that using a local "cursor", as you propose, will work,
> tho'--even for spreading huge page allocations for the reserved lists.
> We may tend to favor low order nodes if one incrementally increases
> nr_hugepages via the sysctl.  But, I don't think that's too regular an
> occurrence.  I'm not sure Nish can use the mempolicy huge page
> interleaving allocator, tho'  That allocates FROM the per node reserved
> lists, and alloc_fresh_huge_page[_node]() is used to fill those lists.  

Yeah one would need to put some thought into it to have the logic in one 
place so that future maintenance will be easier.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12 17:45                       ` Nishanth Aravamudan
@ 2007-06-12 19:13                         ` William Lee Irwin III
  2007-06-13  0:04                           ` [PATCH v7][RFC] " Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: William Lee Irwin III @ 2007-06-12 19:13 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On 11.06.2007 [22:15:12 -0700], William Lee Irwin III wrote:
>> For initially filling the pool one can just loop over nid's modulo the
>> number of populated nodes and pass down a stack-allocated variable.

On Tue, Jun 12, 2007 at 10:45:03AM -0700, Nishanth Aravamudan wrote:
> But how does one differentiate between "initially filling" the pool and a
> later attempt to add to the pool (or even just marginally later).
> I guess I don't see why folks are so against this static variable :) It
> does the job and removing it seems like it could be an independent
> cleanup?

Well, another approach is to just statically initialize it to something
and then always check to make sure the node for the nid has memory, and
if not, find the next nid with a node with memory from the populated map.


-- wli

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 18:45         ` Christoph Lameter
@ 2007-06-12 19:17           ` Lee Schermerhorn
  2007-06-12 19:22             ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 19:17 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nishanth Aravamudan, anton, akpm, linux-mm, Kamezawa Hiroyuki

On Tue, 2007-06-12 at 11:45 -0700, Christoph Lameter wrote:
> On Tue, 12 Jun 2007, Lee Schermerhorn wrote:
> 
> > > Could be much simpler:
> > > 
> > > if (pgdat->node_present_pages)
> > > 	node_set_populated(local_node);
> > 
> > As a minimum, we need to exclude a node with only zone DMA memory for
> > this to work on our platforms.  For that, I think the current code is
> > the simplest because we still need to check if the first zone is
> > "on-node" and !DMA.
> 
> You are changing the definition of populated node.

Well, I initially created the populated node map to mean nodes that
contained memory at "policy zone"--specifically for use by the huge page
allocator.  I did this because you and others didn't want the hugetlb
code to know about the innards of zonelists, etc.  Made sense, so I came
up with a definition that worked for the platforms I tried it on.
However, as we've discussed here, it would prevent allocation of
hugepages on a DMA32-only x86_64 node if any other node had higher order
memory.  

Now, Nish is proposing to use the populated map to filter policy-based
interleaved allocations.  My definition of populated map won't work for
that.  So, YOU are the one changing the definition.  I'm OK with that if
it solves a more generic problem.  My patch hadn't gone in anyway.

> > And, I think we need both cases--set and reset populated map bit--to
> > handle memory/node hotplug.  So something like:
> 
> Yes memory unplug will need to clear the bit if a complete node is
> cleared. But we do not support node unplug yet. So it is okay for now and 
> it is doubtful that the build_zonelist function is going to be called for 
> the node that is being removed.
> 
> > Need to define 'is_zone_dma()' to test the zone or unconditionally
> > return false depending on whether ZONE_DMA is configured.
> 
> CONFIG_ZONE_DMA already exists.

Yes, but I didn't want to stick #ifdefs in the functions if I didn't
have to.  But, it's a moot point.  After looking at it more, I've
decided there may be no definition of populated map that works reliably
for huge page allocation on all of the platform configurations.
However, if GFP_THISNODE guarantees no off-node allocations, that may do
the trick.

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 19:17           ` Lee Schermerhorn
@ 2007-06-12 19:22             ` Christoph Lameter
  2007-06-12 19:49               ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 19:22 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Nishanth Aravamudan, anton, akpm, linux-mm, Kamezawa Hiroyuki

On Tue, 12 Jun 2007, Lee Schermerhorn wrote:

> On Tue, 2007-06-12 at 11:45 -0700, Christoph Lameter wrote:
> Now, Nish is proposing to use the populated map to filter policy-based
> interleaved allocations.  My definition of populated map won't work for
> that.  So, YOU are the one changing the definition.  I'm OK with that if
> it solves a more generic problem.  My patch hadn't gone in anyway.

Ok. So how about renaming the populated_map to

node_memory_map

so that it's clear that this is a map of nodes with memory?

GFP_THISNODE needs this map to fail on memoryless nodes.

> Yes, but I didn't want to stick #ifdefs in the functions if I didn't
> have to.  But, it's a moot point.  After looking at it more, I've
> decided there may be no definition of populated map that works reliably
> for huge page allocation on all of the platform configurations.
> However, if GFP_THISNODE guarantees no off-node allocations, that may do
> the trick.

It can do that if the populated map works the right way.... circle is 
closing ... I can send out a patchset in a few minutes that fixes the 
GFP_THISNODE issue and introduces node_memory_map.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 18:51                                   ` Christoph Lameter
@ 2007-06-12 19:44                                     ` Lee Schermerhorn
  2007-06-12 19:48                                       ` Christoph Lameter
  2007-06-12 19:55                                       ` Nishanth Aravamudan
  0 siblings, 2 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-12 19:44 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Tue, 2007-06-12 at 11:51 -0700, Christoph Lameter wrote:
> On Tue, 12 Jun 2007, Lee Schermerhorn wrote:
> 
> > Well, my patch [v4] fixed it on my platform.  So this is a regression
> > relative to my patch.  But, then, my patch had an issue with an x86_64
> > system where one node is all/mostly DMA32 and other nodes have memory in
> > higher zones.  Maybe that's OK [or not] for hugepage allocation, but
> > almost certainly not for regular page interleaving, ...
> 
> Well this means your patch was arch specific.

Worse than that--the problem is platform specific.  I thought the patch
was generic--that's what I was striving for.  I just hadn't thought
through the implications for x86_64 platforms with just the right amount
of memory to cause a problem.

I tested on a 2 socket, 4GB blade.  All memory, both nodes, was DMA32 or
lower, so policy_zone == ZONE_DMA32 and it worked fine.  I tested on a 4
socket, 32GB server--8GB per node.  Policy_zone was ZONE_NORMAL, but all
nodes had at least 4G of normal memory.  For the nr_hugepages that I
tried, I saw the pages allocated evenly across the nodes.  Guess I
didn't ask for enough pages to consume all of the normal memory on node
0 to see any imbalance thereafter.

> 
> > > I'm much more concerned in the short term about the whole
> > > memoryless-node issue, which I think is more straight-forward, and
> > > generic to fix.
> > 
> > Perhaps, but I think we're still going to get off node allocations with
> > the revised definition of the populated map and the new zonelist
> > ordering.  I think we'll need to check for and reject off-node
> > allocations when '_THISNODE is specified.  We can't assume that the
> > first zone in a node's zonelist for a given gfp_zone is on-node.
> 
> We do not do that anymore. GFP_THISNODE guarantees the allocation on 
> the node with alloc_pages_node. Read on.

I have been reading.  Might work as you say.  Not because you're testing
the populated map in alloc_pages_node().  That can still pass an
off-node zonelist to __alloc_pages().  However, I'm hoping that the test
of the zone_pgdat in get_page_from_freelist() will do the right thing.
I'm referring to:

	if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
	    zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
		break;

But, I'm not convinced that zonelist->zones[0]->zone_pgdat always refers
to the node specified by the 'nid' argument of alloc_pages_node().  It was
with my definition of the populated map, but I don't think so, now.

We'll see.

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 19:44                                     ` Lee Schermerhorn
@ 2007-06-12 19:48                                       ` Christoph Lameter
  2007-06-12 19:58                                         ` Christoph Lameter
  2007-06-12 19:55                                       ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 19:48 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Tue, 12 Jun 2007, Lee Schermerhorn wrote:

> I have been reading.  Might work as you say.  Not because you're testing
> the populated map in alloc_pages_node().  That can still pass an
> off-node zonelist to __alloc_pages().  However, I'm hoping that the test
> of the zone_pgdat in get_page_from_freelist() will do the right thing.
> I'm referring to:
> 
>                 
> 	if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
> 	    zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
> 		break;
> 
> But, I'm not convinced that zonelist->zones[0]->zone_pgdat always refers
> to the node specified the 'nid' argument of alloc_pages_node().  It was
> with my definition of the populated map, but I don't think so, now.

It does refer to the current node if the node has memory on its own. 
alloc_pages_node picks up the zonelist of the node. If the node has memory 
then the first zone will be one of the node's zones.

Uhhh... Right there is another special case. The recently 
introduced zonelist swizzle makes the DMA zone come last and if a 
node had only a DMA zone then it may become swizzled to the end of 
the zonelist.



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 19:22             ` Christoph Lameter
@ 2007-06-12 19:49               ` Nishanth Aravamudan
  2007-06-12 19:51                 ` Christoph Lameter
  2007-06-12 19:52                 ` Christoph Lameter
  0 siblings, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 19:49 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

On 12.06.2007 [12:22:37 -0700], Christoph Lameter wrote:
> On Tue, 12 Jun 2007, Lee Schermerhorn wrote:
> 
> > On Tue, 2007-06-12 at 11:45 -0700, Christoph Lameter wrote:
> > Now, Nish is proposing to use the populated map to filter policy-based
> > interleaved allocations.  My definition of populated map won't work for
> > that.  So, YOU are the one changing the definition.  I'm OK with that if
> > it solves a more generic problem.  My patch hadn't gone in anyway.
> 
> Ok. So how about renaming the populated_map to
> 
> node_memory_map
> 
> so that its clear that this is a map of node with memory?
> 
> GFP_THISNODE needs this map to fail on memoryless nodes.
> 
> > Yes, but I didn't want to stick #ifdefs in the functions if I didn't
> > have to.  But, it's a moot point.  After looking at it more, I've
> > decided there may be no definition of populated map that works reliably
> > for huge page allocation on all of the platform configurations.
> > However, if GFP_THISNODE guarantees no off-node allocations, that may do
> > the trick.
> 
> It can do that if the populated map works the right way.... circle is 
> closing ... I can sent out a patchset in a few minutes that fixes the 
> GFP_THISNODE issue and introduces node_memory_map.

Something like the following (need to s/populated/memory/ as
appropriate, still... so not s-o-b...):

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 49dcc2f..453cc32 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -175,6 +175,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 	if (nid < 0)
 		nid = numa_node_id();
 
+	if ((gfp_mask & __GFP_THISNODE) && !node_is_populated(nid))
+		return NULL;
+
 	return __alloc_pages(gfp_mask, order,
 		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
 }
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 52c54a5..4fb054a 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -70,6 +70,10 @@
  * node_set_online(node)		set bit 'node' in node_online_map
  * node_set_offline(node)		clear bit 'node' in node_online_map
  *
+ * node_set_populated(node)		set bit 'node' in node_populated_map
+ * node_set_unpopulated(node)		clear bit 'node' in node_populated_map
+ * int node_is_populated(node)		Does some node have pages_present != 0?
+ *
  * for_each_node(node)			for-loop node over node_possible_map
  * for_each_online_node(node)		for-loop node over node_online_map
  *
@@ -353,6 +357,10 @@ extern nodemask_t node_possible_map;
 #define first_online_node	first_node(node_online_map)
 #define next_online_node(nid)	next_node((nid), node_online_map)
 extern int nr_node_ids;
+extern nodemask_t node_populated_map;
+#define node_set_populated(node)	set_bit((node), node_populated_map.bits)
+#define node_set_unpopulated(node)	clear_bit((node), node_populated_map.bits)
+#define node_is_populated(node)		node_isset((node), node_populated_map)
 #else
 #define num_online_nodes()	1
 #define num_possible_nodes()	1
@@ -361,6 +369,9 @@ extern int nr_node_ids;
 #define first_online_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
 #define nr_node_ids		1
+#define node_set_populated(node)	do { } while (0)
+#define node_set_unpopulated(node)	do { } while (0)
+#define node_is_populated(node)	((node) == 0)
 #endif
 
 #define any_online_node(mask)			\
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07cd5ae..fab163d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,8 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_populated_map __read_mostly = { { [0] = 1UL } };
+EXPORT_SYMBOL(node_populated_map);
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
@@ -2299,6 +2301,13 @@ static void build_zonelists(pg_data_t *pgdat)
 		/* calculate node order -- i.e., DMA last! */
 		build_zonelists_in_zone_order(pgdat, j);
 	}
+
+	/*
+	 * Node and Memory Hot-Unplug will need to invoke
+	 * node_set_unpopulated if a node is made to be memory-less
+	 */
+	if (pgdat->node_present_pages)
+		node_set_populated(local_node);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 19:49               ` Nishanth Aravamudan
@ 2007-06-12 19:51                 ` Christoph Lameter
  2007-06-12 20:00                   ` Nishanth Aravamudan
  2007-06-12 19:52                 ` Christoph Lameter
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 19:51 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

I thought more along these lines?

NUMA: introduce node_memory_map

It is necessary to know if nodes have memory since we have recently
begun to add support for memoryless nodes. For that purpose we introduce
a new bitmap called

node_memory_map

A node has its bit in node_memory_map set if it has memory. If a node
has memory then it has at least one zone defined in its pgdat structure
that is located in the pgdat itself.

The node_memory_map can then be used in various places to ensure that we
do the right thing when we encounter a memoryless node.

Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: linux-2.6.22-rc4-mm2/include/linux/nodemask.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/nodemask.h	2007-06-12 12:07:29.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/nodemask.h	2007-06-12 12:09:35.000000000 -0700
@@ -64,12 +64,16 @@
  *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
+ * int node_memory(node)		Does a node have memory?
  *
  * int any_online_node(mask)		First online node in mask
  *
  * node_set_online(node)		set bit 'node' in node_online_map
  * node_set_offline(node)		clear bit 'node' in node_online_map
  *
+ * node_set_memory(node)		set bit 'node' in node_memory_map
+ * node_clear_memory(node)		clear bit 'node' in node_memory_map
+ *
  * for_each_node(node)			for-loop node over node_possible_map
  * for_each_online_node(node)		for-loop node over node_online_map
  *
@@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemas
 
 extern nodemask_t node_online_map;
 extern nodemask_t node_possible_map;
+extern nodemask_t node_memory_map;
 
 #if MAX_NUMNODES > 1
 #define num_online_nodes()	nodes_weight(node_online_map)
 #define num_possible_nodes()	nodes_weight(node_possible_map)
 #define node_online(node)	node_isset((node), node_online_map)
 #define node_possible(node)	node_isset((node), node_possible_map)
+#define node_memory(node)	node_isset((node), node_memory_map)
 #define first_online_node	first_node(node_online_map)
 #define next_online_node(nid)	next_node((nid), node_online_map)
 extern int nr_node_ids;
@@ -358,6 +364,7 @@ extern int nr_node_ids;
 #define num_possible_nodes()	1
 #define node_online(node)	((node) == 0)
 #define node_possible(node)	((node) == 0)
+#define node_memory(node)	((node) == 0)
 #define first_online_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
 #define nr_node_ids		1
@@ -375,6 +382,9 @@ extern int nr_node_ids;
 #define node_set_online(node)	   set_bit((node), node_online_map.bits)
 #define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
 
+#define node_set_memory(node)     set_bit((node), node_memory_map.bits)
+#define node_clear_memory(node)   clear_bit((node), node_memory_map.bits)
+
 #define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
 #define for_each_online_node(node) for_each_node_mask((node), node_online_map)
 
Index: linux-2.6.22-rc4-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/page_alloc.c	2007-06-12 12:07:29.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/page_alloc.c	2007-06-12 12:11:04.000000000 -0700
@@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_memory_map __read_mostly = NODE_MASK_NONE;
+EXPORT_SYMBOL(node_memory_map);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
@@ -2299,6 +2302,9 @@ static void build_zonelists(pg_data_t *p
 		/* calculate node order -- i.e., DMA last! */
 		build_zonelists_in_zone_order(pgdat, j);
 	}
+
+	if (pgdat->node_present_pages)
+		node_set_memory(local_node);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 19:49               ` Nishanth Aravamudan
  2007-06-12 19:51                 ` Christoph Lameter
@ 2007-06-12 19:52                 ` Christoph Lameter
  2007-06-12 19:58                   ` Christoph Lameter
  2007-06-12 20:00                   ` Nishanth Aravamudan
  1 sibling, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 19:52 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

Interleave fix patch:

Fix MPOL_INTERLEAVE behavior for memoryless nodes

MPOL_INTERLEAVE currently simply loops over all nodes. Allocations on
memoryless nodes will be redirected to nodes with memory. This results in
an imbalance because the neighboring nodes to memoryless nodes will get significantly
more interleave hits than the rest of the nodes on the system.

We can avoid this imbalance by clearing the nodes in the interleave node
set that have no memory.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/mm/mempolicy.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/mempolicy.c	2007-06-12 12:37:23.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/mempolicy.c	2007-06-12 12:39:16.000000000 -0700
@@ -185,6 +185,7 @@ static struct mempolicy *mpol_new(int mo
 	switch (mode) {
 	case MPOL_INTERLEAVE:
 		policy->v.nodes = *nodes;
+		nodemask_and(policy->v.nodes, policy->v.nodes, node_memory_map);
 		if (nodes_weight(*nodes) == 0) {
 			kmem_cache_free(policy_cache, policy);
 			return ERR_PTR(-EINVAL);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 19:44                                     ` Lee Schermerhorn
  2007-06-12 19:48                                       ` Christoph Lameter
@ 2007-06-12 19:55                                       ` Nishanth Aravamudan
  1 sibling, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 19:55 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Christoph Lameter, anton, akpm, linux-mm

On 12.06.2007 [15:44:33 -0400], Lee Schermerhorn wrote:
> On Tue, 2007-06-12 at 11:51 -0700, Christoph Lameter wrote:
> > On Tue, 12 Jun 2007, Lee Schermerhorn wrote:
> > 
> > > Well, my patch [v4] fixed it on my platform.  So this is a regression
> > > relative to my patch.  But, then, my patch had an issue with an x86_64
> > > system where one node is all/mostly DMA32 and other nodes have memory in
> > > higher zones.  Maybe that's OK [or not] for hugepage allocation, but
> > > almost certainly not for regular page interleaving, ...
> > 
> > Well this means your patch was arch specific.
> 
> Worse than that--the problem is platform specific.  I thought the
> patch was generic--that's what I was striving for.  I just hadn't
> thought through the implications for x86_64 platforms with just the
> right amount of memory to cause a problem.
> 
> I tested on a 2 socket, 4GB blade.  All memory, both nodes, was DMA32
> or lower, so policy_zone == ZONE_DMA32 and it worked fine.  I tested
> on a 4 socket, 32GB server--8GB per node.  Policy_zone was
> ZONE_NORMAL, but all nodes had at least 4G of normal memory.  For the
> nr_hugepages that I tried, I saw the pages allocated evenly across the
> nodes.  Guess I didn't ask for enough pages to consume all of the
> normal memory on node 0 to see any imbalance thereafter.

Yeah, it's tricky to get testing across all the corner cases, to say the
least. Although between the two of us, we might have enough h/w that
breaks assumptions :)

> > > > I'm much more concerned in the short term about the whole
> > > > memoryless-node issue, which I think is more straight-forward, and
> > > > generic to fix.
> > > 
> > > Perhaps, but I think we're still going to get off node allocations with
> > > the revised definition of the populated map and the new zonelist
> > > ordering.  I think we'll need to check for and reject off-node
> > > allocations when '_THISNODE is specified.  We can't assume that the
> > > first zone in a node's zonelist for a given gfp_zone is on-node.
> > 
> > We do not do that anymore. GFP_THISNODE guarantees the allocation on 
> > the node with alloc_pages_node. Read on.
> 
> I have been reading.  Might work as you say.  Not because you're
> testing the populated map in alloc_pages_node().  That can still pass
> an off-node zonelist to __alloc_pages().  However, I'm hoping that the
> test of the zone_pgdat in get_page_from_freelist() will do the right
> thing.  I'm referring to:
> 
>                 
> 	if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
> 	    zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
> 		break;
> 
> But, I'm not convinced that zonelist->zones[0]->zone_pgdat always
> refers to the node specified the 'nid' argument of alloc_pages_node().
> It was with my definition of the populated map, but I don't think so,
> now.

It doesn't, for sure. That is the problem for memoryless nodes. But, we
bail out of alloc_pages_node if GFP_THISNODE and !node_populated(nid)
now, so we shouldn't hit __alloc_pages (and thus not hit
get_page_from_freelist) in that path. Still trying to get a handle on
the other paths...

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 19:48                                       ` Christoph Lameter
@ 2007-06-12 19:58                                         ` Christoph Lameter
  2007-06-12 20:01                                           ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 19:58 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Tue, 12 Jun 2007, Christoph Lameter wrote:

> Uhhh... Right there is another special case. The recently 
> introduces zonelist swizzle makes the DMA zone come last and if a 
> node had only a DMA zone then it may become swizzled to the end of 
> the zonelist.

Maybe we can ignore that case for now:


Fix GFP_THISNODE behavior for memoryless nodes

GFP_THISNODE checks that the zone selected is within the pgdat (node) of the
first zone of a zonelist. That only works if the node has memory. A
memoryless node will have its first zone on another pgdat (node).

GFP_THISNODE currently will simply return memory from that first pgdat.
Thus it is returning memory on other nodes. GFP_THISNODE should fail
if there is no local memory on a node.

So we add a check to verify that the node specified has memory in
alloc_pages_node(). If the node has no memory then return NULL.

The case of alloc_pages(GFP_THISNODE) is not changed. alloc_pages() (with no memory
policies in effect)

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

Index: linux-2.6.22-rc4-mm2/include/linux/gfp.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/gfp.h	2007-06-12 12:33:37.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/gfp.h	2007-06-12 12:38:37.000000000 -0700
@@ -175,6 +175,13 @@ static inline struct page *alloc_pages_n
 	if (nid < 0)
 		nid = numa_node_id();
 
+	/*
+	 * Check for the special case that GFP_THISNODE is used on a
+	 * memoryless node
+	 */
+	if ((gfp_mask & __GFP_THISNODE) && !node_memory(nid))
+		return NULL;
+
 	return __alloc_pages(gfp_mask, order,
 		NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 19:52                 ` Christoph Lameter
@ 2007-06-12 19:58                   ` Christoph Lameter
  2007-06-12 20:00                   ` Nishanth Aravamudan
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 19:58 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

On Tue, 12 Jun 2007, Christoph Lameter wrote:

> +		nodemask_and(policy->v.nodes, policy->v.nodes, node_memory_map);
	^^^ has to be nodes_and

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 19:51                 ` Christoph Lameter
@ 2007-06-12 20:00                   ` Nishanth Aravamudan
  2007-06-12 20:03                     ` Christoph Lameter
  2007-06-12 20:10                     ` Christoph Lameter
  0 siblings, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 20:00 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

On 12.06.2007 [12:51:26 -0700], Christoph Lameter wrote:
> I thought more along these lines?
> 
> NUMA: introduce node_memory_map
> 
> It is necessary to know if nodes have memory since we have recently
> begun to add support for memoryless nodes. For that purpose we introduce
> a new bitmap called
> 
> node_memory_map
> 
> A node has its bit in node_memory_map set if it has memory. If a node
> has memory then it has at least one zone defined in its pgdat structure
> that is located in the pgdat itself.
> 
> The node_memory_map can then be used in various places to insure that we
> do the right thing when we encounter a memoryless node.
> 
> Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> 
> Index: linux-2.6.22-rc4-mm2/include/linux/nodemask.h
> ===================================================================
> --- linux-2.6.22-rc4-mm2.orig/include/linux/nodemask.h	2007-06-12 12:07:29.000000000 -0700
> +++ linux-2.6.22-rc4-mm2/include/linux/nodemask.h	2007-06-12 12:09:35.000000000 -0700
> @@ -64,12 +64,16 @@
>   *
>   * int node_online(node)		Is some node online?
>   * int node_possible(node)		Is some node possible?
> + * int node_memory(node)		Does a node have memory?
>   *
>   * int any_online_node(mask)		First online node in mask
>   *
>   * node_set_online(node)		set bit 'node' in node_online_map
>   * node_set_offline(node)		clear bit 'node' in node_online_map
>   *
> + * node_set_memory(node)		set bit 'node' in node_memory_map
> + * node_clear_memoryd(node)		clear bit 'node' in node_memory_map
> + *

These are terrible names :) Something more like node_set_has_memory(node)
and node_set_has_no_memory(node), maybe? [why be arbitrarily different
than node_set_{on,off}line?].

And there is a typo ;)

>   * for_each_node(node)			for-loop node over node_possible_map
>   * for_each_online_node(node)		for-loop node over node_online_map
>   *
> @@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemas
> 
>  extern nodemask_t node_online_map;
>  extern nodemask_t node_possible_map;
> +extern nodemask_t node_memory_map;
> 
>  #if MAX_NUMNODES > 1
>  #define num_online_nodes()	nodes_weight(node_online_map)
>  #define num_possible_nodes()	nodes_weight(node_possible_map)
>  #define node_online(node)	node_isset((node), node_online_map)
>  #define node_possible(node)	node_isset((node), node_possible_map)
> +#define node_memory(node)	node_isset((node), node_memory_map)
>  #define first_online_node	first_node(node_online_map)
>  #define next_online_node(nid)	next_node((nid), node_online_map)
>  extern int nr_node_ids;
> @@ -358,6 +364,7 @@ extern int nr_node_ids;
>  #define num_possible_nodes()	1
>  #define node_online(node)	((node) == 0)
>  #define node_possible(node)	((node) == 0)
> +#define node_populated(node)	((node) == 0)
>  #define first_online_node	0
>  #define next_online_node(nid)	(MAX_NUMNODES)
>  #define nr_node_ids		1
> @@ -375,6 +382,9 @@ extern int nr_node_ids;
>  #define node_set_online(node)	   set_bit((node), node_online_map.bits)
>  #define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
> 
> +#define node_set_memory(node)     set_bit((node), node_memory_map.bits)
> +#define node_clear_memory(node)   clear_bit((node), node_memory_map.bits)
> +
>  #define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
>  #define for_each_online_node(node) for_each_node_mask((node), node_online_map)
> 
> Index: linux-2.6.22-rc4-mm2/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.22-rc4-mm2.orig/mm/page_alloc.c	2007-06-12 12:07:29.000000000 -0700
> +++ linux-2.6.22-rc4-mm2/mm/page_alloc.c	2007-06-12 12:11:04.000000000 -0700
> @@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly
>  EXPORT_SYMBOL(node_online_map);
>  nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
>  EXPORT_SYMBOL(node_possible_map);
> +nodemask_t node_memory_map __read_mostly = NODE_MASK_NONE;
> +EXPORT_SYMBOL(node_memory_map);

SERIOUSLY!? After saying that node_populated_map should be NUMA-only
over and over, you made it global here :-P

>  unsigned long totalram_pages __read_mostly;
>  unsigned long totalreserve_pages __read_mostly;
>  long nr_swap_pages;
> @@ -2299,6 +2302,9 @@ static void build_zonelists(pg_data_t *p
>  		/* calculate node order -- i.e., DMA last! */
>  		build_zonelists_in_zone_order(pgdat, j);
>  	}
> +
> +	if (pgdat->node_present_pages)
> +		node_set_memory(local_node);
>  }

Other than the naming, looks sane.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 19:52                 ` Christoph Lameter
  2007-06-12 19:58                   ` Christoph Lameter
@ 2007-06-12 20:00                   ` Nishanth Aravamudan
  2007-06-12 20:06                     ` Christoph Lameter
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 20:00 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

On 12.06.2007 [12:52:38 -0700], Christoph Lameter wrote:
> Interleave fix patch:
> 
> Fix MPOL_INTERLEAVE behavior for memoryless nodes
> 
> MPOL_INTERLEAVE currently simply loops over all nodes. Allocations on
> memoryless nodes will be redirected to nodes with memory. This results in
> an imbalance because the neighboring nodes to memoryless nodes will get significantly
> more interleave hits that the rest of the nodes on the system.
> 
> We can avoid this imbalance by clearing the nodes in the interleave node
> set that have no memory.
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> 
> Index: linux-2.6.22-rc4-mm2/mm/mempolicy.c
> ===================================================================
> --- linux-2.6.22-rc4-mm2.orig/mm/mempolicy.c	2007-06-12 12:37:23.000000000 -0700
> +++ linux-2.6.22-rc4-mm2/mm/mempolicy.c	2007-06-12 12:39:16.000000000 -0700
> @@ -185,6 +185,7 @@ static struct mempolicy *mpol_new(int mo
>  	switch (mode) {
>  	case MPOL_INTERLEAVE:
>  		policy->v.nodes = *nodes;
> +		nodemask_and(policy->v.nodes, policy->v.nodes, node_memory_map);
>  		if (nodes_weight(*nodes) == 0) {

Shouldn't this be changed to

		if (nodes_weight(policy->v.nodes) == 0) {

??

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 19:58                                         ` Christoph Lameter
@ 2007-06-12 20:01                                           ` Nishanth Aravamudan
  2007-06-13 15:30                                             ` Lee Schermerhorn
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-12 20:01 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On 12.06.2007 [12:58:16 -0700], Christoph Lameter wrote:
> On Tue, 12 Jun 2007, Christoph Lameter wrote:
> 
> > Uhhh... Right there is another special case. The recently 
> > introduces zonelist swizzle makes the DMA zone come last and if a 
> > node had only a DMA zone then it may become swizzled to the end of 
> > the zonelist.
> 
> Maybe we can ignore that case for now:
> 
> 
> Fix GFP_THISNODE behavior for memoryless nodes
> 
> GFP_THISNODE checks that the zone selected is within the pgdat (node) of the
> first zone of a nodelist. That only works if the node has memory. A
> memoryless node will have its first node on another pgdat (node).
> 
> GFP_THISNODE currently will return simply memory on the first pgdat.
> Thus it is returning memory on other nodes. GFP_THISNODE should fail
> if there is no local memory on a node.
> 
> So we add a check to verify that the node specified has memory in
> alloc_pages_node(). If the node has no memory then return NULL.
> 
> The case of alloc_pages(GFP_THISNODE) is not changed. alloc_pages() (with no memory
> policies in effect)
> 
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> 
> Index: linux-2.6.22-rc4-mm2/include/linux/gfp.h
> ===================================================================
> --- linux-2.6.22-rc4-mm2.orig/include/linux/gfp.h	2007-06-12 12:33:37.000000000 -0700
> +++ linux-2.6.22-rc4-mm2/include/linux/gfp.h	2007-06-12 12:38:37.000000000 -0700
> @@ -175,6 +175,13 @@ static inline struct page *alloc_pages_n
>  	if (nid < 0)
>  		nid = numa_node_id();
> 
> +	/*
> +	 * Check for the special case that GFP_THISNODE is used on a
> +	 * memoryless node
> +	 */
> +	if ((gfp_mask & __GFP_THISNODE) && !node_memory(nid))
> +		return NULL;
> +

Yep, this seems to be the right thing to do, and was in my rolled-up
patch.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 20:00                   ` Nishanth Aravamudan
@ 2007-06-12 20:03                     ` Christoph Lameter
  2007-06-12 20:10                     ` Christoph Lameter
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 20:03 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

On Tue, 12 Jun 2007, Nishanth Aravamudan wrote:

> SERIOUSLY!? After saying that node_populated_map should be NUMA-only
> over and over, you made it global here :-P

Yeah, looking at it: the nodemask_t becomes very small, so it's not worth it.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 20:00                   ` Nishanth Aravamudan
@ 2007-06-12 20:06                     ` Christoph Lameter
  0 siblings, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 20:06 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

On Tue, 12 Jun 2007, Nishanth Aravamudan wrote:

> > ===================================================================
> > --- linux-2.6.22-rc4-mm2.orig/mm/mempolicy.c	2007-06-12 12:37:23.000000000 -0700
> > +++ linux-2.6.22-rc4-mm2/mm/mempolicy.c	2007-06-12 12:39:16.000000000 -0700
> > @@ -185,6 +185,7 @@ static struct mempolicy *mpol_new(int mo
> >  	switch (mode) {
> >  	case MPOL_INTERLEAVE:
> >  		policy->v.nodes = *nodes;
> > +		nodemask_and(policy->v.nodes, policy->v.nodes, node_memory_map);
> >  		if (nodes_weight(*nodes) == 0) {
> 
> Shouldn't this be changed to
> 
> 		if (nodes_weight(policy->v.nodes) == 0) {

You are right. Fix applied. I will post a patchset once I have my testing
done.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v2] Add populated_map to account for memoryless nodes
  2007-06-12 20:00                   ` Nishanth Aravamudan
  2007-06-12 20:03                     ` Christoph Lameter
@ 2007-06-12 20:10                     ` Christoph Lameter
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-12 20:10 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Lee Schermerhorn, anton, akpm, linux-mm, Kamezawa Hiroyuki

On Tue, 12 Jun 2007, Nishanth Aravamudan wrote:

> 
> These are terrible names :) Something more like node_set_has_memory(node)
> and node_set_has_no_memory(node), maybe? [why be arbitrarily different
> than node_set_{on,off}line?].

Because node_set_on/off is already inconsistent with node_set and
node_clear.

Another possible name may be

node_set_has_memory
node_clear_has_memory

but that is awkward.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v7][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-12 19:13                         ` William Lee Irwin III
@ 2007-06-13  0:04                           ` Nishanth Aravamudan
  2007-06-13 15:26                             ` [PATCH v3][RFC] hugetlb: numafy several functions Nishanth Aravamudan
  2007-06-13 21:04                             ` [PATCH v7][RFC] Fix hugetlb pool allocation with empty nodes Lee Schermerhorn
  0 siblings, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13  0:04 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On 12.06.2007 [12:13:47 -0700], William Lee Irwin III wrote:
> On 11.06.2007 [22:15:12 -0700], William Lee Irwin III wrote:
> >> For initially filling the pool one can just loop over nid's modulo the
> >> number of populated nodes and pass down a stack-allocated variable.
> 
> On Tue, Jun 12, 2007 at 10:45:03AM -0700, Nishanth Aravamudan wrote:
> > But how does one differentiate between "initally filling" the pool and a
> > later attempt to add to the pool (or even just marginally later).
> > I guess I don't see why folks are so against this static variable :) It
> > does the job and removing it seems like it could be an independent
> > cleanup?
> 
> Well, another approach is to just statically initialize it to something
> and then always check to make sure the node for the nid has memory, and
> if not, find the next nid with a node with memory from the populated map.

How does something like this look? Or is it overkill?

[PATCH 2.6.22-rc4-mm2] Fix hugetlb pool allocation with empty nodes V7

Anton found a problem with the hugetlb pool allocation when some nodes
have no memory (http://marc.info/?l=linux-mm&m=118133042025995&w=2). Lee
worked on versions that tried to fix it, but none were accepted.
Christoph has created a set of patches which allow for GFP_THISNODE
allocations to fail if the node has no memory and for exporting a
node_memory_map indicating which nodes have memory. Since mempolicy.c
already has a number of functions which support interleaving, create a
mempolicy when we invoke alloc_fresh_huge_page() that specifies
interleaving across all the nodes in node_memory_map, rather than custom
interleaving code in hugetlb.c.  This requires adding some dummy
functions, and some declarations, in mempolicy.h to compile with NUMA or
!NUMA.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Andrew Morton <akpm@linux-foundation.org>

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 22b668c..c8a68b8 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -76,6 +76,8 @@ struct mempolicy {
  * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
  */
 
+extern struct mempolicy *mpol_new(int mode, nodemask_t *nodes);
+
 extern void __mpol_free(struct mempolicy *pol);
 static inline void mpol_free(struct mempolicy *pol)
 {
@@ -164,6 +166,8 @@ static inline void check_highest_zone(enum zone_type k)
 		policy_zone = k;
 }
 
+extern unsigned interleave_nodes(struct mempolicy *policy);
+
 int do_migrate_pages(struct mm_struct *mm,
 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
 
@@ -179,6 +183,11 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 
 #define mpol_set_vma_default(vma) do {} while(0)
 
+static inline struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+{
+	return NULL;
+}
+
 static inline void mpol_free(struct mempolicy *p)
 {
 }
@@ -267,6 +276,11 @@ static inline int do_migrate_pages(struct mm_struct *mm,
 static inline void check_highest_zone(int k)
 {
 }
+
+static inline unsigned interleave_nodes(struct mempolicy *policy)
+{
+	return 0;
+}
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 858c0b3..1c13687 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -103,15 +103,20 @@ static void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
-static int alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(struct mempolicy *policy)
 {
-	static int nid = 0;
+	int nid;
 	struct page *page;
-	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
-					HUGETLB_PAGE_ORDER);
-	nid = next_node(nid, node_online_map);
-	if (nid == MAX_NUMNODES)
-		nid = first_node(node_online_map);
+	int start_nid = interleave_nodes(policy);
+
+	nid = start_nid;
+
+	do {
+		page = alloc_pages_node(nid,
+				htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
+				HUGETLB_PAGE_ORDER);
+		nid = interleave_nodes(policy);
+	} while (!page && nid != start_nid);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
@@ -153,6 +158,7 @@ fail:
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
+	struct mempolicy *pol;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
@@ -160,11 +166,16 @@ static int __init hugetlb_init(void)
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
+	pol = mpol_new(MPOL_INTERLEAVE, &node_memory_map);
+	if (IS_ERR(pol))
+		goto quit;
 	for (i = 0; i < max_huge_pages; ++i) {
-		if (!alloc_fresh_huge_page())
+		if (!alloc_fresh_huge_page(pol))
 			break;
 	}
+	mpol_free(pol);
 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
+quit:
 	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
 	return 0;
 }
@@ -232,10 +243,16 @@ static inline void try_to_free_low(unsigned long count)
 
 static unsigned long set_max_huge_pages(unsigned long count)
 {
+	struct mempolicy *pol;
+
+	pol = mpol_new(MPOL_INTERLEAVE, &node_memory_map);
+	if (IS_ERR(pol))
+		return nr_huge_pages;
 	while (count > nr_huge_pages) {
-		if (!alloc_fresh_huge_page())
-			return nr_huge_pages;
+		if (!alloc_fresh_huge_page(pol))
+			break;
 	}
+	mpol_free(pol);
 	if (count >= nr_huge_pages)
 		return nr_huge_pages;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 21458ca..c576d32 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -171,7 +171,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
@@ -1121,7 +1121,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
+unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned nid, next;
 	struct task_struct *me = current;

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v3][RFC] hugetlb: numafy several functions
  2007-06-13  0:04                           ` [PATCH v7][RFC] " Nishanth Aravamudan
@ 2007-06-13 15:26                             ` Nishanth Aravamudan
  2007-06-13 15:28                               ` [PATCH v3][RFC] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
  2007-06-13 21:04                             ` [PATCH v7][RFC] Fix hugetlb pool allocation with empty nodes Lee Schermerhorn
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 15:26 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On 12.06.2007 [17:04:46 -0700], Nishanth Aravamudan wrote:
> On 12.06.2007 [12:13:47 -0700], William Lee Irwin III wrote:
> > On 11.06.2007 [22:15:12 -0700], William Lee Irwin III wrote:
> > >> For initially filling the pool one can just loop over nid's modulo the
> > >> number of populated nodes and pass down a stack-allocated variable.
> > 
> > On Tue, Jun 12, 2007 at 10:45:03AM -0700, Nishanth Aravamudan wrote:
> > > But how does one differentiate between "initially filling" the pool and a
> > > later attempt to add to the pool (or even just marginally later)?
> > > I guess I don't see why folks are so against this static variable :) It
> > > does the job and removing it seems like it could be an independent
> > > cleanup?
> > 
> > Well, another approach is to just statically initialize it to something
> > and then always check to make sure the node for the nid has memory, and
> > if not, find the next nid with a node with memory from the populated map.
> 
> How does something like this look? Or is it overkill?

If that patch looks ok, then the other patches (numafy and sysfs) are
relatively unchanged.

commit 041cb3d3c2fd3640aff50e2f701b8b5a670193de
Author: Nishanth Aravamudan <nacc@us.ibm.com>
Date:   Tue Jun 12 17:10:21 2007 -0700

hugetlb: numafy several functions

Add node-parameterized helpers for dequeue_huge_page,
alloc_fresh_huge_page and try_to_free_low. Also have
update_and_free_page() take a nid parameter. This is necessary to add a
per-node sysfs attribute to specify the number of hugepages on that
node.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Andrew Morton <akpm@linux-foundation.org>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1c13687..c4a966e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,11 +66,22 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(int nid)
+{
+	struct page *page;
+
+	page = list_entry(hugepage_freelists[nid].next,
+					  struct page, lru);
+	list_del(&page->lru);
+	free_huge_pages--;
+	free_huge_pages_node[nid]--;
+	return page;
+}
+
 static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 				unsigned long address)
 {
 	int nid;
-	struct page *page = NULL;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
 						htlb_alloc_mask);
 	struct zone **z;
@@ -82,14 +93,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 			break;
 	}
 
-	if (*z) {
-		page = list_entry(hugepage_freelists[nid].next,
-				  struct page, lru);
-		list_del(&page->lru);
-		free_huge_pages--;
-		free_huge_pages_node[nid]--;
-	}
-	return page;
+	if (*z)
+		return dequeue_huge_page_node(nid);
+	return NULL;
 }
 
 static void free_huge_page(struct page *page)
@@ -103,6 +109,25 @@ static void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
+static struct page *alloc_fresh_huge_page_node(int nid)
+{
+	struct page *page;
+
+	page = alloc_pages_node(nid,
+			GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
+			HUGETLB_PAGE_ORDER);
+	if (page) {
+		set_compound_page_dtor(page, free_huge_page);
+		spin_lock(&hugetlb_lock);
+		nr_huge_pages++;
+		nr_huge_pages_node[nid]++;
+		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+	}
+
+	return page;
+}
+
 static int alloc_fresh_huge_page(struct mempolicy *policy)
 {
 	int nid;
@@ -112,20 +137,12 @@ static int alloc_fresh_huge_page(struct mempolicy *policy)
 	nid = start_nid;
 
 	do {
-		page = alloc_pages_node(nid,
-				htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
-				HUGETLB_PAGE_ORDER);
+		page = alloc_fresh_huge_page_node(nid);
 		nid = interleave_nodes(policy);
 	} while (!page && nid != start_nid);
-	if (page) {
-		set_compound_page_dtor(page, free_huge_page);
-		spin_lock(&hugetlb_lock);
-		nr_huge_pages++;
-		nr_huge_pages_node[page_to_nid(page)]++;
-		spin_unlock(&hugetlb_lock);
-		put_page(page); /* free it into the hugepage allocator */
+
+	if (page)
 		return 1;
-	}
 	return 0;
 }
 
@@ -201,11 +218,11 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 }
 
 #ifdef CONFIG_SYSCTL
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(int nid, struct page *page)
 {
 	int i;
 	nr_huge_pages--;
-	nr_huge_pages_node[page_to_nid(page)]--;
+	nr_huge_pages_node[nid]--;
 	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -217,25 +234,37 @@ static void update_and_free_page(struct page *page)
 }
 
 #ifdef CONFIG_HIGHMEM
+static void try_to_free_low_node(int nid, unsigned long count)
+{
+	struct page *page, *next;
+
+	list_for_each_entry_safe(page, next,
+				&hugepage_freelists[nid], lru) {
+		if (PageHighMem(page))
+			continue;
+		list_del(&page->lru);
+		update_and_free_page(nid, page);
+		free_huge_pages--;
+		free_huge_pages_node[nid]--;
+		if (count >= nr_huge_pages_node[nid])
+			return;
+	}
+}
+
 static void try_to_free_low(unsigned long count)
 {
 	int i;
 
 	for (i = 0; i < MAX_NUMNODES; ++i) {
-		struct page *page, *next;
-		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
-			if (PageHighMem(page))
-				continue;
-			list_del(&page->lru);
-			update_and_free_page(page);
-			free_huge_pages--;
-			free_huge_pages_node[page_to_nid(page)]--;
-			if (count >= nr_huge_pages)
-				return;
-		}
+		try_to_free_low_node(i, count);
+		if (count >= nr_huge_pages)
+			break;
 	}
 }
 #else
+static inline void try_to_free_low_node(int nid, unsigned long count)
+{
+}
 static inline void try_to_free_low(unsigned long count)
 {
 }
@@ -263,7 +292,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
-		update_and_free_page(page);
+		update_and_free_page(page_to_nid(page), page);
 	}
 	spin_unlock(&hugetlb_lock);
 	return nr_huge_pages;

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v3][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-13 15:26                             ` [PATCH v3][RFC] hugetlb: numafy several functions Nishanth Aravamudan
@ 2007-06-13 15:28                               ` Nishanth Aravamudan
  2007-06-13 18:23                                 ` Lee Schermerhorn
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 15:28 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Christoph Lameter, lee.schermerhorn, anton, akpm, linux-mm

On 13.06.2007 [08:26:49 -0700], Nishanth Aravamudan wrote:
> On 12.06.2007 [17:04:46 -0700], Nishanth Aravamudan wrote:
> > On 12.06.2007 [12:13:47 -0700], William Lee Irwin III wrote:
> > > On 11.06.2007 [22:15:12 -0700], William Lee Irwin III wrote:
> > > >> For initially filling the pool one can just loop over nid's modulo the
> > > >> number of populated nodes and pass down a stack-allocated variable.
> > > 
> > > On Tue, Jun 12, 2007 at 10:45:03AM -0700, Nishanth Aravamudan wrote:
> > > > But how does one differentiate between "initially filling" the pool and a
> > > > later attempt to add to the pool (or even just marginally later)?
> > > > I guess I don't see why folks are so against this static variable :) It
> > > > does the job and removing it seems like it could be an independent
> > > > cleanup?
> > > 
> > > Well, another approach is to just statically initialize it to something
> > > and then always check to make sure the node for the nid has memory, and
> > > if not, find the next nid with a node with memory from the populated map.
> > 
> > How does something like this look? Or is it overkill?
> 
> If that patch looks ok, then the other patches (numafy and sysfs) are
> relatively unchanged.

commit 05a7edb8c909c674cdefb0323348825cf3e2d1d0
Author: Nishanth Aravamudan <nacc@us.ibm.com>
Date:   Thu Jun 7 08:54:48 2007 -0700

hugetlb: add per-node nr_hugepages sysfs attribute

Allow specifying the number of hugepages to allocate on a particular
node. Our current global sysctl will try its best to put hugepages
equally on each node, but that may not always be desired. This allows
the admin to control the layout of hugepage allocation at a finer level
(while not breaking the existing interface). Add callbacks in the sysfs
node registration and unregistration functions into hugetlb to add the
nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Andrew Morton <akpm@linux-foundation.org>

---
Do the dummy function definitions need to be (void)0?

diff --git a/drivers/base/node.c b/drivers/base/node.c
index cae346e..24b13b0 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -131,6 +131,8 @@ static ssize_t node_read_distance(struct sys_device * dev, char * buf)
 }
 static SYSDEV_ATTR(distance, S_IRUGO, node_read_distance, NULL);
 
+#ifdef CONFIG_HUGETLB_PAGE
+#endif
 
 /*
  * register_node - Setup a sysfs device for a node.
@@ -151,6 +153,7 @@ int register_node(struct node *node, int num, struct node *parent)
 		sysdev_create_file(&node->sysdev, &attr_meminfo);
 		sysdev_create_file(&node->sysdev, &attr_numastat);
 		sysdev_create_file(&node->sysdev, &attr_distance);
+		hugetlb_register_node(node);
 	}
 	return error;
 }
@@ -168,6 +171,7 @@ void unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_meminfo);
 	sysdev_remove_file(&node->sysdev, &attr_numastat);
 	sysdev_remove_file(&node->sysdev, &attr_distance);
+	hugetlb_unregister_node(node);
 
 	sysdev_unregister(&node->sysdev);
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index aa0dc9b..e9f5928 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -5,6 +5,7 @@
 
 #include <linux/mempolicy.h>
 #include <linux/shm.h>
+#include <linux/sysdev.h>
 #include <asm/tlbflush.h>
 
 struct ctl_table;
@@ -23,6 +24,11 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
+int hugetlb_register_node(struct sys_device *);
+void hugetlb_unregister_node(struct sys_device *);
+ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *, char *);
+ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *, const char *,
+					 size_t);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
@@ -114,6 +120,8 @@ static inline unsigned long hugetlb_total_pages(void)
 #define unmap_hugepage_range(vma, start, end)	BUG()
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
+#define hugetlb_register_node(node)		0
+#define hugetlb_unregister_node(node)		0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
 #define prepare_hugepage_range(addr,len,pgoff)	(-EINVAL)
 #define pmd_huge(x)	0
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c4a966e..9d2c480 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -137,6 +137,9 @@ static int alloc_fresh_huge_page(struct mempolicy *policy)
 	nid = start_nid;
 
 	do {
+		/*
+		 * this allocation will fail for unpopulated nodes
+		 */
 		page = alloc_fresh_huge_page_node(nid);
 		nid = interleave_nodes(policy);
 	} while (!page && nid != start_nid);
@@ -217,7 +220,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 	return nr;
 }
 
-#ifdef CONFIG_SYSCTL
 static void update_and_free_page(int nid, struct page *page)
 {
 	int i;
@@ -270,6 +272,7 @@ static inline void try_to_free_low(unsigned long count)
 }
 #endif
 
+#ifdef CONFIG_SYSCTL
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	struct mempolicy *pol;
@@ -343,6 +346,64 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, free_huge_pages_node[nid]);
 }
 
+SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
+			hugetlb_read_nr_hugepages_node,
+			hugetlb_write_nr_hugepages_node);
+
+int hugetlb_register_node(struct node *node)
+{
+	return sysdev_create_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+void hugetlb_unregister_node(struct node *node)
+{
+	sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev,
+							char *buf)
+{
+	return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]);
+}
+
+ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
+					const char *buf, size_t count)
+{
+	int nid = dev->id;
+	unsigned long target;
+	unsigned long free_on_other_nodes;
+	unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
+
+	while (nr_huge_pages_req > nr_huge_pages_node[nid]) {
+		if (!alloc_fresh_huge_page_node(nid))
+			return count;
+	}
+	if (nr_huge_pages_req >= nr_huge_pages_node[nid])
+		return count;
+
+	/* need to ensure that our counts are accurate */
+	spin_lock(&hugetlb_lock);
+	free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
+	if (free_on_other_nodes >= resv_huge_pages) {
+		/* other nodes can satisfy reserve */
+		target = nr_huge_pages_req;
+	} else {
+		/* this node needs some free to satisfy reserve */
+		target = max((resv_huge_pages - free_on_other_nodes),
+						nr_huge_pages_req);
+	}
+	try_to_free_low_node(nid, target);
+	while (target < nr_huge_pages_node[nid]) {
+		struct page *page = dequeue_huge_page_node(nid);
+		if (!page)
+			break;
+		update_and_free_page(nid, page);
+	}
+	spin_unlock(&hugetlb_lock);
+
+	return count;
+}
+
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {

> 
> commit 041cb3d3c2fd3640aff50e2f701b8b5a670193de
> Author: Nishanth Aravamudan <nacc@us.ibm.com>
> Date:   Tue Jun 12 17:10:21 2007 -0700
> 
> hugetlb: numafy several functions
> 
> Add node-parameterized helpers for dequeue_huge_page,
> alloc_fresh_huge_page and try_to_free_low. Also have
> update_and_free_page() take a nid parameter. This is necessary to add a
> per-node sysfs attribute to specify the number of hugepages on that
> node.
> 
> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> Cc: William Lee Irwin III <wli@holomorphy.com>
> Cc: Christoph Lameter <clameter@sgi.com>
> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
> Cc: Anton Blanchard <anton@samba.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1c13687..c4a966e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -66,11 +66,22 @@ static void enqueue_huge_page(struct page *page)
>  	free_huge_pages_node[nid]++;
>  }
>  
> +static struct page *dequeue_huge_page_node(int nid)
> +{
> +	struct page *page;
> +
> +	page = list_entry(hugepage_freelists[nid].next,
> +					  struct page, lru);
> +	list_del(&page->lru);
> +	free_huge_pages--;
> +	free_huge_pages_node[nid]--;
> +	return page;
> +}
> +
>  static struct page *dequeue_huge_page(struct vm_area_struct *vma,
>  				unsigned long address)
>  {
>  	int nid;
> -	struct page *page = NULL;
>  	struct zonelist *zonelist = huge_zonelist(vma, address,
>  						htlb_alloc_mask);
>  	struct zone **z;
> @@ -82,14 +93,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
>  			break;
>  	}
>  
> -	if (*z) {
> -		page = list_entry(hugepage_freelists[nid].next,
> -				  struct page, lru);
> -		list_del(&page->lru);
> -		free_huge_pages--;
> -		free_huge_pages_node[nid]--;
> -	}
> -	return page;
> +	if (*z)
> +		return dequeue_huge_page_node(nid);
> +	return NULL;
>  }
>  
>  static void free_huge_page(struct page *page)
> @@ -103,6 +109,25 @@ static void free_huge_page(struct page *page)
>  	spin_unlock(&hugetlb_lock);
>  }
>  
> +static struct page *alloc_fresh_huge_page_node(int nid)
> +{
> +	struct page *page;
> +
> +	page = alloc_pages_node(nid,
> +			GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
> +			HUGETLB_PAGE_ORDER);
> +	if (page) {
> +		set_compound_page_dtor(page, free_huge_page);
> +		spin_lock(&hugetlb_lock);
> +		nr_huge_pages++;
> +		nr_huge_pages_node[nid]++;
> +		spin_unlock(&hugetlb_lock);
> +		put_page(page); /* free it into the hugepage allocator */
> +	}
> +
> +	return page;
> +}
> +
>  static int alloc_fresh_huge_page(struct mempolicy *policy)
>  {
>  	int nid;
> @@ -112,20 +137,12 @@ static int alloc_fresh_huge_page(struct mempolicy *policy)
>  	nid = start_nid;
>  
>  	do {
> -		page = alloc_pages_node(nid,
> -				htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
> -				HUGETLB_PAGE_ORDER);
> +		page = alloc_fresh_huge_page_node(nid);
>  		nid = interleave_nodes(policy);
>  	} while (!page && nid != start_nid);
> -	if (page) {
> -		set_compound_page_dtor(page, free_huge_page);
> -		spin_lock(&hugetlb_lock);
> -		nr_huge_pages++;
> -		nr_huge_pages_node[page_to_nid(page)]++;
> -		spin_unlock(&hugetlb_lock);
> -		put_page(page); /* free it into the hugepage allocator */
> +
> +	if (page)
>  		return 1;
> -	}
>  	return 0;
>  }
>  
> @@ -201,11 +218,11 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
>  }
>  
>  #ifdef CONFIG_SYSCTL
> -static void update_and_free_page(struct page *page)
> +static void update_and_free_page(int nid, struct page *page)
>  {
>  	int i;
>  	nr_huge_pages--;
> -	nr_huge_pages_node[page_to_nid(page)]--;
> +	nr_huge_pages_node[nid]--;
>  	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
>  		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
>  				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
> @@ -217,25 +234,37 @@ static void update_and_free_page(struct page *page)
>  }
>  
>  #ifdef CONFIG_HIGHMEM
> +static void try_to_free_low_node(int nid, unsigned long count)
> +{
> +	struct page *page, *next;
> +
> +	list_for_each_entry_safe(page, next,
> +				&hugepage_freelists[nid], lru) {
> +		if (PageHighMem(page))
> +			continue;
> +		list_del(&page->lru);
> +		update_and_free_page(nid, page);
> +		free_huge_pages--;
> +		free_huge_pages_node[nid]--;
> +		if (count >= nr_huge_pages_node[nid])
> +			return;
> +	}
> +}
> +
>  static void try_to_free_low(unsigned long count)
>  {
>  	int i;
>  
>  	for (i = 0; i < MAX_NUMNODES; ++i) {
> -		struct page *page, *next;
> -		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
> -			if (PageHighMem(page))
> -				continue;
> -			list_del(&page->lru);
> -			update_and_free_page(page);
> -			free_huge_pages--;
> -			free_huge_pages_node[page_to_nid(page)]--;
> -			if (count >= nr_huge_pages)
> -				return;
> -		}
> +		try_to_free_low_node(i, count);
> +		if (count >= nr_huge_pages)
> +			break;
>  	}
>  }
>  #else
> +static inline void try_to_free_low_node(int nid, unsigned long count)
> +{
> +}
>  static inline void try_to_free_low(unsigned long count)
>  {
>  }
> @@ -263,7 +292,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
>  		struct page *page = dequeue_huge_page(NULL, 0);
>  		if (!page)
>  			break;
> -		update_and_free_page(page);
> +		update_and_free_page(page_to_nid(page), page);
>  	}
>  	spin_unlock(&hugetlb_lock);
>  	return nr_huge_pages;
> 
> -- 
> Nishanth Aravamudan <nacc@us.ibm.com>
> IBM Linux Technology Center

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-12 20:01                                           ` Nishanth Aravamudan
@ 2007-06-13 15:30                                             ` Lee Schermerhorn
  2007-06-13 17:58                                               ` Nishanth Aravamudan
  2007-06-13 22:49                                               ` Christoph Lameter
  0 siblings, 2 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-13 15:30 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Christoph Lameter, anton, akpm, linux-mm

On Tue, 2007-06-12 at 13:01 -0700, Nishanth Aravamudan wrote:
> On 12.06.2007 [12:58:16 -0700], Christoph Lameter wrote:
> > On Tue, 12 Jun 2007, Christoph Lameter wrote:
> > 
> > > Uhhh... Right there is another special case. The recently 
> > > introduced zonelist swizzle makes the DMA zone come last and if a 
> > > node had only a DMA zone then it may become swizzled to the end of 
> > > the zonelist.
> > 
> > Maybe we can ignore that case for now:
> > 
I wish we wouldn't.  We need the "DMA zone comes last" for both HP and
Fujitsu platforms.  That's why Kame and I worked on that patch
together.  

> > 
> > Fix GFP_THISNODE behavior for memoryless nodes
> > 
> > GFP_THISNODE checks that the zone selected is within the pgdat (node) of the
> > first zone of a nodelist. That only works if the node has memory. A
> > memoryless node will have its first node on another pgdat (node).
> > 
> > GFP_THISNODE currently will return simply memory on the first pgdat.
> > Thus it is returning memory on other nodes. GFP_THISNODE should fail
> > if there is no local memory on a node.
> > 
> > So we add a check to verify that the node specified has memory in
> > alloc_pages_node(). If the node has no memory then return NULL.
> > 
> > The case of alloc_pages(GFP_THISNODE) is not changed. alloc_pages() (with no memory
> > policies in effect)
> > 
> > Signed-off-by: Christoph Lameter <clameter@sgi.com>
> > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > 
> > Index: linux-2.6.22-rc4-mm2/include/linux/gfp.h
> > ===================================================================
> > --- linux-2.6.22-rc4-mm2.orig/include/linux/gfp.h	2007-06-12 12:33:37.000000000 -0700
> > +++ linux-2.6.22-rc4-mm2/include/linux/gfp.h	2007-06-12 12:38:37.000000000 -0700
> > @@ -175,6 +175,13 @@ static inline struct page *alloc_pages_n
> >  	if (nid < 0)
> >  		nid = numa_node_id();
> > 
> > +	/*
> > +	 * Check for the special case that GFP_THISNODE is used on a
> > +	 * memoryless node
> > +	 */
> > +	if ((gfp_mask & __GFP_THISNODE) && !node_memory(nid))
> > +		return NULL;
> > +
> 
> Yep, this seems to be the right thing to do, and was in my rolled-up
> patch.

I think that the "node has memory" mask is fine for scanning nodes that
might have memory in the zone of interest--including in the hugetlb
alloc_fresh_huge_page() loop.  However, I think that to support all
platforms in a generic way, alloc_pages_node() and
alloc_page_interleave() [both take a node id arg] should be more strict
when the gfp mask includes 'THISNODE and not assume that a populated
node always has on-node memory in the zone of interest.  E.g., something
like:

	pgdat_t *pgdat;
	struct zonelist *zonelist;

	...

	/* 
	 * after validating nid, ... 
	 * Note that we need to fetch these values anyway for the
	 * [likely?] call to __alloc_pages().  
	 */
	pgdat = NODE_DATA(nid);
	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);

	if ((gfp_mask & __GFP_THISNODE) &&
		zonelist->zones[0]->zone_pgdat != pgdat)
		return NULL;
	
	return __alloc_pages(gfp_mask, order, zonelist);


I see you've submitted a new patch set.  I'll grab it [when Nish reposts]
and test it both as is and modified to look something like the above, if
needed.

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 15:30                                             ` Lee Schermerhorn
@ 2007-06-13 17:58                                               ` Nishanth Aravamudan
  2007-06-13 18:21                                                 ` Lee Schermerhorn
  2007-06-13 22:50                                                 ` Christoph Lameter
  2007-06-13 22:49                                               ` Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 17:58 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Christoph Lameter, anton, akpm, linux-mm

On 13.06.2007 [11:30:06 -0400], Lee Schermerhorn wrote:
> On Tue, 2007-06-12 at 13:01 -0700, Nishanth Aravamudan wrote:
> > On 12.06.2007 [12:58:16 -0700], Christoph Lameter wrote:
> > > On Tue, 12 Jun 2007, Christoph Lameter wrote:
> > > 
> > > > Uhhh... Right there is another special case. The recently 
> > > > introduced zonelist swizzle makes the DMA zone come last and if a 
> > > > node had only a DMA zone then it may become swizzled to the end of 
> > > > the zonelist.
> > > 
> > > Maybe we can ignore that case for now:
> > > 
> I wish we wouldn't.  We need the "DMA zone comes last" for both HP and
> Fujitsu platforms.  That's why Kame and I worked on that patch
> together.  

Right. I interpreted the "for now" as for this first stack of patches.
We'll need a fix for your platform on top, but it seems to be a minority
case? Not saying it shouldn't be fixed, by any means, just trying to get
a handle on it.

> > > Fix GFP_THISNODE behavior for memoryless nodes
> > > 
> > > GFP_THISNODE checks that the zone selected is within the pgdat (node) of the
> > > first zone of a nodelist. That only works if the node has memory. A
> > > memoryless node will have its first zone on another pgdat (node).
> > > 
> > > GFP_THISNODE currently will return simply memory on the first pgdat.
> > > Thus it is returning memory on other nodes. GFP_THISNODE should fail
> > > if there is no local memory on a node.
> > > 
> > > So we add a check to verify that the node specified has memory in
> > > alloc_pages_node(). If the node has no memory then return NULL.
> > > 
> > > The case of alloc_pages(GFP_THISNODE) is not changed. alloc_pages() (with no memory
> > > policies in effect)
> > > 
> > > Signed-off-by: Christoph Lameter <clameter@sgi.com>
> > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > > 
> > > Index: linux-2.6.22-rc4-mm2/include/linux/gfp.h
> > > ===================================================================
> > > --- linux-2.6.22-rc4-mm2.orig/include/linux/gfp.h	2007-06-12 12:33:37.000000000 -0700
> > > +++ linux-2.6.22-rc4-mm2/include/linux/gfp.h	2007-06-12 12:38:37.000000000 -0700
> > > @@ -175,6 +175,13 @@ static inline struct page *alloc_pages_n
> > >  	if (nid < 0)
> > >  		nid = numa_node_id();
> > > 
> > > +	/*
> > > +	 * Check for the special case that GFP_THISNODE is used on a
> > > +	 * memoryless node
> > > +	 */
> > > +	if ((gfp_mask & __GFP_THISNODE) && !node_memory(nid))
> > > +		return NULL;
> > > +
> > 
> > Yep, this seems to be the right thing to do, and was in my rolled-up
> > patch.
> 
> I think that the "node has memory" mask is fine for scanning nodes
> that might have memory in the zone of interest--including in the
> hugetlb alloc_fresh_huge_page() loop.  However, I think that to
> support all platforms in a generic way, alloc_pages_node() and
> alloc_page_interleave() [both take a node id arg] should be more
> strict when the gfp mask includes 'THISNODE and not assume that a
> populated node always has on-node memory in the zone of interest.

Hrm, perhaps.

> E.g., something like:
> 
> 	pgdat_t *pgdat;
> 	struct zonelist *zonelist;
> 
> 	...
> 
> 	/* 
> 	 * after validating nid, ... 
> 	 * Note that we need to fetch these values anyway for the
> 	 * [likely?] call to __alloc_pages().  
> 	 */
> 	pgdat = NODE_DATA(nid);
> 	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);
> 
> 	if ((gfp_mask & __GFP_THISNODE) &&
> 		zonelist->zones[0]->zone_pgdat != pgdat)
> 		return NULL;
> 	
> 	return __alloc_pages(gfp_mask, order, zonelist);
> 
> 
> I see you've submitted a new patch set.  I grab it [when Nish reposts]
> and test it as is and modified to look something like the above, if
> needed.

I think your code above makes sense -- I'd still leave in the earlier
check, though.

So it probably should be:

	pgdat = NODE_DATA(nid);
	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);

	if (unlikely((gfp_mask & __GFP_THISNODE) &&
		(!node_memory(nid) ||
		 zonelist->zones[0]->zone_pgdat != pgdat)))
		 return NULL;

That way, if the node has no memory whatsoever, we don't bother checking
the pgdat of the relevant zone?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 17:58                                               ` Nishanth Aravamudan
@ 2007-06-13 18:21                                                 ` Lee Schermerhorn
  2007-06-13 19:01                                                   ` Nishanth Aravamudan
  2007-06-13 22:51                                                   ` Christoph Lameter
  2007-06-13 22:50                                                 ` Christoph Lameter
  1 sibling, 2 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-13 18:21 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Christoph Lameter, anton, akpm, linux-mm

On Wed, 2007-06-13 at 10:58 -0700, Nishanth Aravamudan wrote:
> On 13.06.2007 [11:30:06 -0400], Lee Schermerhorn wrote:
> > On Tue, 2007-06-12 at 13:01 -0700, Nishanth Aravamudan wrote:
> > > On 12.06.2007 [12:58:16 -0700], Christoph Lameter wrote:
> > > > On Tue, 12 Jun 2007, Christoph Lameter wrote:
> > > > 
> > > > > Uhhh... Right there is another special case. The recently 
> > > > > introduces zonelist swizzle makes the DMA zone come last and if a 
> > > > > node had only a DMA zone then it may become swizzled to the end of 
> > > > > the zonelist.
> > > > 
> > > > Maybe we can ignore that case for now:
> > > > 
> > I wish we wouldn't.  We need the "DMA zone comes last" for both HP and
> > Fujitsu platforms.  That's why Kame and I worked on that patch
> > together.  
> 
> Right. I interpreted the "for now" as for this first stack of patches.
> We'll need a fix for your platform on top, but it seems to be a minority
> case? Not saying it shouldn't be fixed, by any means, just trying to get
> a handle on it.

Yep.  I'm testing the stack "as is" now.  If it doesn't spread the huge
pages evenly because of our funky DMA-only node, I'll post a fix up
patch for consideration.

By the way, your sysfs attribute patch doesn't compile.  I'll post
comments/fixes in response to your message that submitted the patch.

> 
<snip>

> > 
> > I think that the "node has memory" mask is fine for scanning nodes
> > that might have memory in the zone of interest--including in the
> > hugetlb alloc_fresh_huge_page() loop.  However, I think that to
> > support all platforms in a generic way, alloc_pages_node() and
> > alloc_page_interleave() [both take a node id arg] should be more
> > strict when the gfp mask includes 'THISNODE and not assume that a
> > populated node always has on-node memory in the zone of interest.
> 
> Hrm, perhaps.
> 
> > E.g., something like:
> > 
> > 	pgdat_t *pgdat;
> > 	struct zonelist *zonelist;
> > 
> > 	...
> > 
> > 	/* 
> > 	 * after validating nid, ... 
> > 	 * Note that we need to fetch these values anyway for the
> > 	 * [likely?] call to __alloc_pages().  
> > 	 */
> > 	pgdat = NODE_DATA(nid);
> > 	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);
> > 
> > 	if ((gfp_mask & __GFP_THISNODE) &&
> > 		zonelist->zones[0]->zone_pgdat != pgdat)
> > 		return NULL;
> > 	
> > 	return __alloc_pages(gfp_mask, order, zonelist);
> > 
> > 
> > I see you've submitted a new patch set.  I grab it [when Nish reposts]
> > and test it as is and modified to look something like the above, if
> > needed.
> 
> I think your code above makes sense -- I'd still leave in the earlier
> check, though.
> 
> So it probably should be:
> 
> 	pgdat = NODE_DATA(nid);
> 	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);
> 
> 	if (unlikely((gfp_mask & __GFP_THISNODE) &&
> 		(!node_memory(nid) ||
> 		 zonelist->zones[0]->zone_pgdat != pgdat)))
> 		 return NULL;
> 
> That way, if the node has no memory whatsoever, we don't bother checking
> the pgdat of the relevant zone?

Well, since most nodes WILL, I think, have memory, that just adds an
extra check in the most frequent case.  Then, we'll have to go ahead and
check the pgdat.  However, if the first zone in the selected zonelist IS
"on-node" [pgdats match], we know that the node has memory [altho' the
zone may not have available pages].  And since we have to fetch the
pgdat and the zonelist, anyway, as the argument to __alloc_pages(), I
don't think my proposed change adds any additional memory ref's, while
eliminating the ref to the node_memory_map.  I'm assuming here that the
compiler will optimize away any stores to the pgdat/zonelist variables.

So, we can use the node_memory() test at higher levels--like the
alloc_fresh_huge_page() loop, to avoid attempting allocations from nodes
that we know have no memory, but I think the alloc_pages_node() and
alloc_page_interleave() should test the selected zonelist explicitly.

Lee


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v3][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-13 15:28                               ` [PATCH v3][RFC] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
@ 2007-06-13 18:23                                 ` Lee Schermerhorn
  2007-06-13 19:19                                   ` [PATCH v4][RFC] " Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-13 18:23 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: William Lee Irwin III, Christoph Lameter, anton, akpm, linux-mm

On Wed, 2007-06-13 at 08:28 -0700, Nishanth Aravamudan wrote:
<snip>
> 
> commit 05a7edb8c909c674cdefb0323348825cf3e2d1d0
> Author: Nishanth Aravamudan <nacc@us.ibm.com>
> Date:   Thu Jun 7 08:54:48 2007 -0700
> 
> hugetlb: add per-node nr_hugepages sysfs attribute
> 
> Allow specifying the number of hugepages to allocate on a particular
> node. Our current global sysctl will try its best to put hugepages
> equally on each node, but htat may not always be desired. This allows
> the admin to control the layout of hugepage allocation at a finer level
> (while not breaking the existing interface). Add callbacks in the sysfs
> node registration and unregistration functions into hugetlb to add the
> nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.
> 
> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> Cc: William Lee Irwin III <wli@holomorphy.com>
> Cc: Christoph Lameter <clameter@sgi.com>
> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
> Cc: Anton Blanchard <anton@sambar.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> 
> ---
> Do the dummy function definitions need to be (void)0?
> 

<snip>

> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index aa0dc9b..e9f5928 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -5,6 +5,7 @@
>  
>  #include <linux/mempolicy.h>
>  #include <linux/shm.h>
> +#include <linux/sysdev.h>
>  #include <asm/tlbflush.h>
>  
>  struct ctl_table;
> @@ -23,6 +24,11 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
>  int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
>  int hugetlb_report_meminfo(char *);
>  int hugetlb_report_node_meminfo(int, char *);
> +int hugetlb_register_node(struct sys_device *);
> +void hugetlb_unregister_node(struct sys_device *);

The parameter type for the two functions above needs to be "struct node".
You'll need to include <linux/node.h> after <linux/sysdev.h>, as well.
Otherwise, doesn't build.


<snip>

Still testing...

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 18:21                                                 ` Lee Schermerhorn
@ 2007-06-13 19:01                                                   ` Nishanth Aravamudan
  2007-06-13 22:51                                                   ` Christoph Lameter
  1 sibling, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 19:01 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Christoph Lameter, anton, akpm, linux-mm

On 13.06.2007 [14:21:14 -0400], Lee Schermerhorn wrote:
> On Wed, 2007-06-13 at 10:58 -0700, Nishanth Aravamudan wrote:
> > On 13.06.2007 [11:30:06 -0400], Lee Schermerhorn wrote:
> > > On Tue, 2007-06-12 at 13:01 -0700, Nishanth Aravamudan wrote:
> > > > On 12.06.2007 [12:58:16 -0700], Christoph Lameter wrote:
> > > > > On Tue, 12 Jun 2007, Christoph Lameter wrote:
> > > > > 
> > > > > > Uhhh... Right there is another special case. The recently 
> > > > > > introduces zonelist swizzle makes the DMA zone come last and if a 
> > > > > > node had only a DMA zone then it may become swizzled to the end of 
> > > > > > the zonelist.
> > > > > 
> > > > > Maybe we can ignore that case for now:
> > > > > 
> > > I wish we wouldn't.  We need the "DMA zone comes last" for both HP and
> > > Fujitsu platforms.  That's why Kame and I worked on that patch
> > > together.  
> > 
> > Right. I interpreted the "for now" as for this first stack of patches.
> > We'll need a fix for your platform on top, but it seems to be a minority
> > case? Not saying it shouldn't be fixed, by any means, just trying to get
> > a handle on it.
> 
> Yep.  I'm testing the stack "as is" now.  If it doesn't spread the
> huge pages evenly because of our funky DMA-only node, I'll post a fix
> up patch for consideration.

Great, thanks.

> By the way, your sysfs attribute patch doesn't compile.  I'll post
> comments/fixes in response to your message that submitted the patch.

Dang, much appreciated.

> <snip>
> 
> > > 
> > > I think that the "node has memory" mask is fine for scanning nodes
> > > that might have memory in the zone of interest--including in the
> > > hugetlb alloc_fresh_huge_page() loop.  However, I think that to
> > > support all platforms in a generic way, alloc_pages_node() and
> > > alloc_page_interleave() [both take a node id arg] should be more
> > > strict when the gfp mask includes 'THISNODE and not assume that a
> > > populated node always has on-node memory in the zone of interest.
> > 
> > Hrm, perhaps.
> > 
> > > E.g., something like:
> > > 
> > > 	pgdat_t *pgdat;
> > > 	struct zonelist *zonelist;
> > > 
> > > 	...
> > > 
> > > 	/* 
> > > 	 * after validating nid, ... 
> > > 	 * Note that we need to fetch these values anyway for the
> > > 	 * [likely?] call to __alloc_pages().  
> > > 	 */
> > > 	pgdat = NODE_DATA(nid);
> > > 	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);
> > > 
> > > 	if ((gfp_mask & __GFP_THISNODE) &&
> > > 		zonelist->zones[0]->zone_pgdat != pgdat)
> > > 		return NULL;
> > > 	
> > > 	return __alloc_pages(gfp_mask, order, zonelist);
> > > 
> > > 
> > > I see you've submitted a new patch set.  I grab it [when Nish reposts]
> > > and test it as is and modified to look something like the above, if
> > > needed.
> > 
> > I think your code above makes sense -- I'd still leave in the earlier
> > check, though.
> > 
> > So it probably should be:
> > 
> > 	pgdat = NODE_DATA(nid);
> > 	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);
> > 
> > 	if (unlikely((gfp_mask & __GFP_THISNODE) &&
> > 		(!node_memory(nid) ||
> > 		 zonelist->zones[0]->zone_pgdat != pgdat)))
> > 		 return NULL;
> > 
> > That way, if the node has no memory whatsoever, we don't bother checking
> > the pgdat of the relevant zone?
> 
> Well, since most nodes WILL, I think, have memory, that just adds an
> extra check in the most frequent case.  Then, we'll have to go ahead
> and check the pgdat.  However, if the first zone in the selected
> zonelist IS "on-node" [pgdats match], we know that the node has memory
> [altho' the zone may not have available pages].  And since we have to
> fetch the pgdat and the zonelist, anyway, as the argument to
> __alloc_pages(), I don't think my proposed change adds any additional
> memory ref's, while eliminating the ref to the node_memory_map.  I'm
> assuming here that the compiler will optimize away any stores to the
> pgdat/zonelist variables.

You're right, sorry for the noise.

> So, we can use the node_memory() test at higher levels--like the
> alloc_fresh_huge_page() loop, to avoid attempting allocations from
> nodes that we know have no memory, but I think the
> allocate_pages_node() and allocate_interleave_page() should test the
> selected zonelist explicitly.

Yep.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v4][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-13 18:23                                 ` Lee Schermerhorn
@ 2007-06-13 19:19                                   ` Nishanth Aravamudan
  2007-06-13 20:05                                     ` Lee Schermerhorn
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 19:19 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: William Lee Irwin III, Christoph Lameter, anton, akpm, linux-mm

On 13.06.2007 [14:23:47 -0400], Lee Schermerhorn wrote:
> On Wed, 2007-06-13 at 08:28 -0700, Nishanth Aravamudan wrote:
> <snip>
> > 
> > commit 05a7edb8c909c674cdefb0323348825cf3e2d1d0
> > Author: Nishanth Aravamudan <nacc@us.ibm.com>
> > Date:   Thu Jun 7 08:54:48 2007 -0700
> > 
> > hugetlb: add per-node nr_hugepages sysfs attribute
> > 
> > Allow specifying the number of hugepages to allocate on a particular
> > node. Our current global sysctl will try its best to put hugepages
> > equally on each node, but htat may not always be desired. This allows
> > the admin to control the layout of hugepage allocation at a finer level
> > (while not breaking the existing interface). Add callbacks in the sysfs
> > node registration and unregistration functions into hugetlb to add the
> > nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.
> > 
> > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > Cc: William Lee Irwin III <wli@holomorphy.com>
> > Cc: Christoph Lameter <clameter@sgi.com>
> > Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
> > Cc: Anton Blanchard <anton@sambar.org>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > 
> > ---
> > Do the dummy function definitions need to be (void)0?
> > 
> 
> <snip>
> 
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index aa0dc9b..e9f5928 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -5,6 +5,7 @@
> >  
> >  #include <linux/mempolicy.h>
> >  #include <linux/shm.h>
> > +#include <linux/sysdev.h>
> >  #include <asm/tlbflush.h>
> >  
> >  struct ctl_table;
> > @@ -23,6 +24,11 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
> >  int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
> >  int hugetlb_report_meminfo(char *);
> >  int hugetlb_report_node_meminfo(int, char *);
> > +int hugetlb_register_node(struct sys_device *);
> > +void hugetlb_unregister_node(struct sys_device *);
> 
> The parameter type for the two functions above need to be "struct
> node".  You'll need to include <linux/node.h> after <linux/sysdev.h>,
> as well.  Otherwise, doesn't build.

Sigh... Actually a few fixes worth doing. Make stuff static, since it's
now all in hugetlb.c and only compile if NUMA. And don't export the
nr_hugepages functions any more via hugetlb.h, as they are now private.

Compile-tested with HUGETLB && NUMA, HUGETLB && !NUMA, !HUGETLB && NUMA,
!HUGETLB && !NUMA.

Will throw it at the machines I ran the previous set on, to verify
everything runs as expected, but for review:


hugetlb: add per-node nr_hugepages sysfs attribute

Allow specifying the number of hugepages to allocate on a particular
node. Our current global sysctl will try its best to put hugepages
equally on each node, but that may not always be desired. This allows
the admin to control the layout of hugepage allocation at a finer level
(while not breaking the existing interface).  Add callbacks in the sysfs
node registration and unregistration functions into hugetlb to add the
nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>

diff --git a/drivers/base/node.c b/drivers/base/node.c
index cae346e..c9d531f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -151,6 +151,7 @@ int register_node(struct node *node, int num, struct node *parent)
 		sysdev_create_file(&node->sysdev, &attr_meminfo);
 		sysdev_create_file(&node->sysdev, &attr_numastat);
 		sysdev_create_file(&node->sysdev, &attr_distance);
+		hugetlb_register_node(node);
 	}
 	return error;
 }
@@ -168,6 +169,7 @@ void unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_meminfo);
 	sysdev_remove_file(&node->sysdev, &attr_numastat);
 	sysdev_remove_file(&node->sysdev, &attr_distance);
+	hugetlb_unregister_node(node);
 
 	sysdev_unregister(&node->sysdev);
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index aa0dc9b..7872031 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -4,7 +4,9 @@
 #ifdef CONFIG_HUGETLB_PAGE
 
 #include <linux/mempolicy.h>
+#include <linux/node.h>
 #include <linux/shm.h>
+#include <linux/sysdev.h>
 #include <asm/tlbflush.h>
 
 struct ctl_table;
@@ -23,6 +25,13 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
+#ifdef CONFIG_NUMA
+int hugetlb_register_node(struct node *);
+void hugetlb_unregister_node(struct node *);
+#else
+#define hugetlb_register_node(node)		0
+#define hugetlb_unregister_node(node)		0
+#endif
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
@@ -114,6 +123,8 @@ static inline unsigned long hugetlb_total_pages(void)
 #define unmap_hugepage_range(vma, start, end)	BUG()
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
+#define hugetlb_register_node(node)		0
+#define hugetlb_unregister_node(node)		0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
 #define prepare_hugepage_range(addr,len,pgoff)	(-EINVAL)
 #define pmd_huge(x)	0
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c4a966e..e6ba07d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -137,6 +137,9 @@ static int alloc_fresh_huge_page(struct mempolicy *policy)
 	nid = start_nid;
 
 	do {
+		/*
+		 * this allocation will fail for unpopulated nodes
+		 */
 		page = alloc_fresh_huge_page_node(nid);
 		nid = interleave_nodes(policy);
 	} while (!page && nid != start_nid);
@@ -217,7 +220,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 	return nr;
 }
 
-#ifdef CONFIG_SYSCTL
 static void update_and_free_page(int nid, struct page *page)
 {
 	int i;
@@ -270,6 +272,7 @@ static inline void try_to_free_low(unsigned long count)
 }
 #endif
 
+#ifdef CONFIG_SYSCTL
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	struct mempolicy *pol;
@@ -343,6 +346,67 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, free_huge_pages_node[nid]);
 }
 
+#ifdef CONFIG_NUMA
+static ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev,
+							char *buf)
+{
+	return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]);
+}
+
+static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
+					const char *buf, size_t count)
+{
+	int nid = dev->id;
+	unsigned long target;
+	unsigned long free_on_other_nodes;
+	unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
+
+	while (nr_huge_pages_req > nr_huge_pages_node[nid]) {
+		if (!alloc_fresh_huge_page_node(nid))
+			return count;
+	}
+	if (nr_huge_pages_req >= nr_huge_pages_node[nid])
+		return count;
+
+	/* need to ensure that our counts are accurate */
+	spin_lock(&hugetlb_lock);
+	free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
+	if (free_on_other_nodes >= resv_huge_pages) {
+		/* other nodes can satisfy reserve */
+		target = nr_huge_pages_req;
+	} else {
+		/* this node needs some free to satisfy reserve */
+		target = max((resv_huge_pages - free_on_other_nodes),
+						nr_huge_pages_req);
+	}
+	try_to_free_low_node(nid, target);
+	while (target < nr_huge_pages_node[nid]) {
+		struct page *page = dequeue_huge_page_node(nid);
+		if (!page)
+			break;
+		update_and_free_page(nid, page);
+	}
+	spin_unlock(&hugetlb_lock);
+
+	return count;
+}
+
+static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
+			hugetlb_read_nr_hugepages_node,
+			hugetlb_write_nr_hugepages_node);
+
+int hugetlb_register_node(struct node *node)
+{
+	return sysdev_create_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+void hugetlb_unregister_node(struct node *node)
+{
+	sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+#endif
+
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v4][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-13 19:19                                   ` [PATCH v4][RFC] " Nishanth Aravamudan
@ 2007-06-13 20:05                                     ` Lee Schermerhorn
  2007-06-13 20:29                                       ` Nishanth Aravamudan
  2007-07-23 19:23                                       ` Christoph Lameter
  0 siblings, 2 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-13 20:05 UTC (permalink / raw)
  To: Nishanth Aravamudan, Christoph Lameter
  Cc: William Lee Irwin III, anton, akpm, linux-mm

On Wed, 2007-06-13 at 12:19 -0700, Nishanth Aravamudan wrote:
> On 13.06.2007 [14:23:47 -0400], Lee Schermerhorn wrote:
> > On Wed, 2007-06-13 at 08:28 -0700, Nishanth Aravamudan wrote:
> > <snip>
> > > 
> > > commit 05a7edb8c909c674cdefb0323348825cf3e2d1d0
> > > Author: Nishanth Aravamudan <nacc@us.ibm.com>
> > > Date:   Thu Jun 7 08:54:48 2007 -0700
> > > 
> > > hugetlb: add per-node nr_hugepages sysfs attribute
> > > 
> > > Allow specifying the number of hugepages to allocate on a particular
> > > node. Our current global sysctl will try its best to put hugepages
> > > equally on each node, but htat may not always be desired. This allows
> > > the admin to control the layout of hugepage allocation at a finer level
> > > (while not breaking the existing interface). Add callbacks in the sysfs
> > > node registration and unregistration functions into hugetlb to add the
> > > nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.
> > > 
> > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > > Cc: William Lee Irwin III <wli@holomorphy.com>
> > > Cc: Christoph Lameter <clameter@sgi.com>
> > > Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
> > > Cc: Anton Blanchard <anton@sambar.org>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > 
> > > ---
> > > Do the dummy function definitions need to be (void)0?
> > > 
> > 
> > <snip>

I tested hugepage allocation on my HP rx8620 platform [16 cpu ia64, 32GB
in 4 "real" nodes and one pseudo-node containing only DMA memory].  As
expected, I don't get a balanced distribution across the real nodes.
Here's what I see:

# before allocating huge pages:
root@gwydyr(root):cat /sys/devices/system/node/node*/meminfo | grep HugeP 
Node 0 HugePages_Total:     0
Node 0 HugePages_Free:      0
Node 1 HugePages_Total:     0
Node 1 HugePages_Free:      0
Node 2 HugePages_Total:     0
Node 2 HugePages_Free:      0
Node 3 HugePages_Total:     0
Node 3 HugePages_Free:      0
Node 4 HugePages_Total:     0
Node 4 HugePages_Free:      0

# Now allocate 64 256MB pages.  Only nodes 0-3 have NORMAL memory.
# Node 4 contains ~512MB of DMA memory.  Some has already been
# used, so I doubt that even 1 256MB [aligned] huge page is available.

root@gwydyr(root):echo 64 >/proc/sys/vm/nr_hugepages
root@gwydyr(root):cat /sys/devices/system/node/node*/meminfo | grep HugeP
Node 0 HugePages_Total:    13	<---???
Node 0 HugePages_Free:     26	<---???
Node 1 HugePages_Total:    12
Node 1 HugePages_Free:     12
Node 2 HugePages_Total:    13
Node 2 HugePages_Free:     13
Node 3 HugePages_Total:    13
Node 3 HugePages_Free:     13
Node 4 HugePages_Total:    13	<---???
Node 4 HugePages_Free:      0

# 13 of the pages say they're from Node 4, but I know that has only
~512MB of memory, of which some is already used.  Unlikely that I can
allocate even 1 256MB huge page because of alignment.  Note that the
free pages are accounted on Node 0, where they actually reside.

Here's some zoneinfo after the allocation above [forgot to snap it
before].

# zoneinfo shell function contains:
# cat /proc/zoneinfo | egrep '^Node|^  pages |^  *present|^  *spanned'
# results after allocating huge pages
root@gwydyr(root):zoneinfo
Node 0, zone   Normal
  pages free     36157
        spanned  486400
        present  484738
Node 1, zone   Normal
  pages free     318034
        spanned  520192
        present  518413
Node 2, zone   Normal
  pages free     301526
        spanned  520192
        present  518414
Node 3, zone   Normal
  pages free     301932
        spanned  520182
        present  518362
Node 4, zone      DMA
  pages free     31706
        spanned  32767
        present  32656
^^^^^^^^^^^^^^^^^^^^^^ Nope!  no huge pages allocated from here!

# now try to free the huge pages.

root@gwydyr(root):echo 0 >/proc/sys/vm/nr_hugepages
root@gwydyr(root):cat /sys/devices/system/node/node*/meminfo | grep HugeP
Node 0 HugePages_Total: 4294967283 <--- ???
Node 0 HugePages_Free:      0
Node 1 HugePages_Total:     0
Node 1 HugePages_Free:      0
Node 2 HugePages_Total:     0
Node 2 HugePages_Free:      0
Node 3 HugePages_Total:     0
Node 3 HugePages_Free:      0
Node 4 HugePages_Total:    13	<---??? they weren't really there to begin with!
Node 4 HugePages_Free:      0

# Apparently on remove, the pages were decremented from node 0 instead
of node 4 where they were accounted for on allocation, resulting in a
negative count on node 0 and the original 13 count still on node 4.  

------------------

I tried to "tighten up"  alloc_pages_node() to check the location of the
first zone in the selected zonelist, as discussed in previous exchange.
When I do this, I hit a BUG() in slub.c in
early_kmem_cache_node_alloc(), as it apparently can't handle new_slab()
returning a NULL page, even tho' it calls it with GFP_THISNODE.  Slub
should be able to handle memoryless nodes, right?  I'm looking for a
work around to this now.

Lee







--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v4][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-13 20:05                                     ` Lee Schermerhorn
@ 2007-06-13 20:29                                       ` Nishanth Aravamudan
  2007-06-13 21:02                                         ` Lee Schermerhorn
  2007-07-23 19:23                                       ` Christoph Lameter
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 20:29 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Christoph Lameter, William Lee Irwin III, anton, akpm, linux-mm

On 13.06.2007 [16:05:10 -0400], Lee Schermerhorn wrote:
> On Wed, 2007-06-13 at 12:19 -0700, Nishanth Aravamudan wrote:
> > On 13.06.2007 [14:23:47 -0400], Lee Schermerhorn wrote:
> > > On Wed, 2007-06-13 at 08:28 -0700, Nishanth Aravamudan wrote:
> > > <snip>
> > > > 
> > > > commit 05a7edb8c909c674cdefb0323348825cf3e2d1d0
> > > > Author: Nishanth Aravamudan <nacc@us.ibm.com>
> > > > Date:   Thu Jun 7 08:54:48 2007 -0700
> > > > 
> > > > hugetlb: add per-node nr_hugepages sysfs attribute
> > > > 
> > > > Allow specifying the number of hugepages to allocate on a particular
> > > > node. Our current global sysctl will try its best to put hugepages
> > > > equally on each node, but htat may not always be desired. This allows
> > > > the admin to control the layout of hugepage allocation at a finer level
> > > > (while not breaking the existing interface). Add callbacks in the sysfs
> > > > node registration and unregistration functions into hugetlb to add the
> > > > nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.
> > > > 
> > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > > > Cc: William Lee Irwin III <wli@holomorphy.com>
> > > > Cc: Christoph Lameter <clameter@sgi.com>
> > > > Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
> > > > Cc: Anton Blanchard <anton@sambar.org>
> > > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > > 
> > > > ---
> > > > Do the dummy function definitions need to be (void)0?
> > > > 
> > > 
> > > <snip>
> 
> I tested hugepage allocation on my HP rx8620 platform [16 cpu ia64,
> 32GB in 4 "real" nodes and one pseudo-node containing only DMA
> memory].  As expected, I don't get a balanced distribution across the
> real nodes.  Here's what I see:

Hrm, not good.

Can you try without any of my add-on patches, but just the original set
from Christoph?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v4][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-13 20:29                                       ` Nishanth Aravamudan
@ 2007-06-13 21:02                                         ` Lee Schermerhorn
  0 siblings, 0 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-13 21:02 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: Christoph Lameter, William Lee Irwin III, anton, akpm, linux-mm

On Wed, 2007-06-13 at 13:29 -0700, Nishanth Aravamudan wrote:
> On 13.06.2007 [16:05:10 -0400], Lee Schermerhorn wrote:
> > On Wed, 2007-06-13 at 12:19 -0700, Nishanth Aravamudan wrote:
> > > On 13.06.2007 [14:23:47 -0400], Lee Schermerhorn wrote:
> > > > On Wed, 2007-06-13 at 08:28 -0700, Nishanth Aravamudan wrote:
> > > > <snip>
> > > > > 
> > > > > commit 05a7edb8c909c674cdefb0323348825cf3e2d1d0
> > > > > Author: Nishanth Aravamudan <nacc@us.ibm.com>
> > > > > Date:   Thu Jun 7 08:54:48 2007 -0700
> > > > > 
> > > > > hugetlb: add per-node nr_hugepages sysfs attribute
> > > > > 
> > > > > Allow specifying the number of hugepages to allocate on a particular
> > > > > node. Our current global sysctl will try its best to put hugepages
> > > > > > equally on each node, but that may not always be desired. This allows
> > > > > the admin to control the layout of hugepage allocation at a finer level
> > > > > (while not breaking the existing interface). Add callbacks in the sysfs
> > > > > node registration and unregistration functions into hugetlb to add the
> > > > > nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.
> > > > > 
> > > > > Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> > > > > Cc: William Lee Irwin III <wli@holomorphy.com>
> > > > > Cc: Christoph Lameter <clameter@sgi.com>
> > > > > Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
> > > > > > Cc: Anton Blanchard <anton@samba.org>
> > > > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > > > 
> > > > > ---
> > > > > Do the dummy function definitions need to be (void)0?
> > > > > 
> > > > 
> > > > <snip>
> > 
> > I tested hugepage allocation on my HP rx8620 platform [16 cpu ia64,
> > 32GB in 4 "real" nodes and one pseudo-node containing only DMA
> > memory].  As expected, I don't get a balanced distribution across the
> > real nodes.  Here's what I see:
> 
> Hrm, not good.
> 
> Can you try without any of my add-on patches, but just the original set
> from Christoph?

I can do that.  However, I've tested with two patches:  one to the
"GFP_THISNODE" behavior in alloc_pages_node(), and one for interleaving
in your "fix hugetlb pool allocation..." patch.  With these 2 patches,
hugetlb allocation appears to work on my platform, at least via the
vm.nr_hugepages sysctl.  Haven't tried your per node attribute yet, and
I'm just rebooting to try the command line.  I'll try out the x86_64
platform tomorrow.

I'll post the fixes in response to the respective patches from you and
Christoph.

Lee


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v7][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-13  0:04                           ` [PATCH v7][RFC] " Nishanth Aravamudan
  2007-06-13 15:26                             ` [PATCH v3][RFC] hugetlb: numafy several functions Nishanth Aravamudan
@ 2007-06-13 21:04                             ` Lee Schermerhorn
  2007-06-13 21:50                               ` [PATCH v7][UPDATE][RFC] " Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-13 21:04 UTC (permalink / raw)
  To: Nishanth Aravamudan
  Cc: William Lee Irwin III, Christoph Lameter, anton, akpm, linux-mm

On Tue, 2007-06-12 at 17:04 -0700, Nishanth Aravamudan wrote:
> On 12.06.2007 [12:13:47 -0700], William Lee Irwin III wrote:
> > On 11.06.2007 [22:15:12 -0700], William Lee Irwin III wrote:
> > >> For initially filling the pool one can just loop over nid's modulo the
> > >> number of populated nodes and pass down a stack-allocated variable.
> > 
> > On Tue, Jun 12, 2007 at 10:45:03AM -0700, Nishanth Aravamudan wrote:
> > > But how does one differentiate between "initially filling" the pool and a
> > > later attempt to add to the pool (or even just marginally later).
> > > I guess I don't see why folks are so against this static variable :) It
> > > does the job and removing it seems like it could be an independent
> > > cleanup?
> > 
> > Well, another approach is to just statically initialize it to something
> > and then always check to make sure the node for the nid has memory, and
> > if not, find the next nid with a node with memory from the populated map.
> 
> How does something like this look? Or is it overkill?
> 
> [PATCH 2.6.22-rc4-mm2] Fix hugetlb pool allocation with empty nodes V7
> 
> Anton found a problem with the hugetlb pool allocation when some nodes
> have no memory (http://marc.info/?l=linux-mm&m=118133042025995&w=2). Lee
> worked on versions that tried to fix it, but none were accepted.
> Christoph has created a set of patches which allow for GFP_THISNODE
> allocations to fail if the node has no memory and for exporting a
> node_memory_map indicating which nodes have memory. Since mempolicy.c
> already has a number of functions which support interleaving, create a
> mempolicy when we invoke alloc_fresh_huge_page() that specifies
> interleaving across all the nodes in node_memory_map, rather than custom
> interleaving code in hugetlb.c.  This requires adding some dummy
> functions, and some declarations, in mempolicy.h to compile with NUMA or
> !NUMA.
> 
> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
> Cc: Anton Blanchard <anton@samba.org>
> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
> Cc: Christoph Lameter <clameter@sgi.com>
> Cc: William Lee Irwin III <wli@holomorphy.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> 
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index 22b668c..c8a68b8 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -76,6 +76,8 @@ struct mempolicy {
>   * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
>   */
>  
> +extern struct mempolicy *mpol_new(int mode, nodemask_t *nodes);
> +
>  extern void __mpol_free(struct mempolicy *pol);
>  static inline void mpol_free(struct mempolicy *pol)
>  {
> @@ -164,6 +166,8 @@ static inline void check_highest_zone(enum zone_type k)
>  		policy_zone = k;
>  }
>  
> +extern unsigned interleave_nodes(struct mempolicy *policy);
> +
>  int do_migrate_pages(struct mm_struct *mm,
>  	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
>  
> @@ -179,6 +183,11 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
>  
>  #define mpol_set_vma_default(vma) do {} while(0)
>  
> +static inline struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
> +{
> +	return NULL;
> +}
> +
>  static inline void mpol_free(struct mempolicy *p)
>  {
>  }
> @@ -267,6 +276,11 @@ static inline int do_migrate_pages(struct mm_struct *mm,
>  static inline void check_highest_zone(int k)
>  {
>  }
> +
> +static inline unsigned interleave_nodes(struct mempolicy *policy)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_NUMA */
>  #endif /* __KERNEL__ */
>  
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 858c0b3..1c13687 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -103,15 +103,20 @@ static void free_huge_page(struct page *page)
>  	spin_unlock(&hugetlb_lock);
>  }
>  
> -static int alloc_fresh_huge_page(void)
> +static int alloc_fresh_huge_page(struct mempolicy *policy)
>  {
> -	static int nid = 0;
> +	int nid;
>  	struct page *page;
> -	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
> -					HUGETLB_PAGE_ORDER);
> -	nid = next_node(nid, node_online_map);
> -	if (nid == MAX_NUMNODES)
> -		nid = first_node(node_online_map);
> +	int start_nid = interleave_nodes(policy);
> +
> +	nid = start_nid;
> +
> +	do {
> +		page = alloc_pages_node(nid,
> +				htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
> +				HUGETLB_PAGE_ORDER);
> +		nid = interleave_nodes(policy);

This needs to be:

		if (!page)
			nid = interleave_nodes(policy);

Otherwise, you skip every other populated node, because you call
interleave_nodes() at the top when you initialize start_nid.  You only
want to advance to the next node here if the allocation failed.

<snip>

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* [PATCH v7][UPDATE][RFC] Fix hugetlb pool allocation with empty nodes
  2007-06-13 21:04                             ` [PATCH v7][RFC] Fix hugetlb pool allocation with empty nodes Lee Schermerhorn
@ 2007-06-13 21:50                               ` Nishanth Aravamudan
  0 siblings, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 21:50 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: William Lee Irwin III, Christoph Lameter, anton, akpm, linux-mm

On 13.06.2007 [17:04:40 -0400], Lee Schermerhorn wrote:
> On Tue, 2007-06-12 at 17:04 -0700, Nishanth Aravamudan wrote:
> > On 12.06.2007 [12:13:47 -0700], William Lee Irwin III wrote:
> > > On 11.06.2007 [22:15:12 -0700], William Lee Irwin III wrote:
> > > >> For initially filling the pool one can just loop over nid's modulo the
> > > >> number of populated nodes and pass down a stack-allocated variable.
> > > 
> > > On Tue, Jun 12, 2007 at 10:45:03AM -0700, Nishanth Aravamudan wrote:
> > > > > But how does one differentiate between "initially filling" the pool and a
> > > > later attempt to add to the pool (or even just marginally later).
> > > > I guess I don't see why folks are so against this static variable :) It
> > > > does the job and removing it seems like it could be an independent
> > > > cleanup?
> > > 
> > > Well, another approach is to just statically initialize it to something
> > > and then always check to make sure the node for the nid has memory, and
> > > if not, find the next nid with a node with memory from the populated map.
> > 
> > How does something like this look? Or is it overkill?
> > 
> > [PATCH 2.6.22-rc4-mm2] Fix hugetlb pool allocation with empty nodes V7
<snip>
> > +	do {
> > +		page = alloc_pages_node(nid,
> > +				htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
> > +				HUGETLB_PAGE_ORDER);
> > +		nid = interleave_nodes(policy);
> 
> This needs to be:
> 
> 		if (!page)
> 			nid = interleave_nodes(policy);
> 
> Otherwise, you skip every other populated node, because you call
> interleave_nodes() at the top when you initialize start_nid.  You only
> want to advance to the next node here if the allocation failed.

Nice catch, although I inverted it in my fix, so we're not checking page
twice for NULL-ness. Updated patch follows.

[PATCH 2.6.22-rc4-mm2] Fix hugetlb pool allocation with empty nodes V7

Anton found a problem with the hugetlb pool allocation when some nodes
have no memory (http://marc.info/?l=linux-mm&m=118133042025995&w=2). Lee
worked on versions that tried to fix it, but none were accepted.
Christoph has created a set of patches which allow for GFP_THISNODE
allocations to fail if the node has no memory and for exporting a
node_memory_map indicating which nodes have memory. Since mempolicy.c
already has a number of functions which support interleaving, create a
mempolicy when we invoke alloc_fresh_huge_page() that specifies
interleaving across all the nodes in node_memory_map, rather than custom
interleaving code in hugetlb.c.  This requires adding some dummy
functions, and some declarations, in mempolicy.h to compile with NUMA or
!NUMA.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Andrew Morton <akpm@linux-foundation.org>

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 22b668c..c8a68b8 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -76,6 +76,8 @@ struct mempolicy {
  * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
  */
 
+extern struct mempolicy *mpol_new(int mode, nodemask_t *nodes);
+
 extern void __mpol_free(struct mempolicy *pol);
 static inline void mpol_free(struct mempolicy *pol)
 {
@@ -164,6 +166,8 @@ static inline void check_highest_zone(enum zone_type k)
 		policy_zone = k;
 }
 
+extern unsigned interleave_nodes(struct mempolicy *policy);
+
 int do_migrate_pages(struct mm_struct *mm,
 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
 
@@ -179,6 +183,11 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 
 #define mpol_set_vma_default(vma) do {} while(0)
 
+static inline struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+{
+	return NULL;
+}
+
 static inline void mpol_free(struct mempolicy *p)
 {
 }
@@ -267,6 +276,11 @@ static inline int do_migrate_pages(struct mm_struct *mm,
 static inline void check_highest_zone(int k)
 {
 }
+
+static inline unsigned interleave_nodes(struct mempolicy *policy)
+{
+	return 0;
+}
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 858c0b3..88e1a30 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -103,15 +103,22 @@ static void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
-static int alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(struct mempolicy *policy)
 {
-	static int nid = 0;
+	int nid;
 	struct page *page;
-	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
-					HUGETLB_PAGE_ORDER);
-	nid = next_node(nid, node_online_map);
-	if (nid == MAX_NUMNODES)
-		nid = first_node(node_online_map);
+	int start_nid = interleave_nodes(policy);
+
+	nid = start_nid;
+
+	do {
+		page = alloc_pages_node(nid,
+				htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
+				HUGETLB_PAGE_ORDER);
+		if (page)
+			break;
+		nid = interleave_nodes(policy);
+	} while (nid != start_nid);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
@@ -153,6 +160,7 @@ fail:
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
+	struct mempolicy *pol;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
@@ -160,11 +168,16 @@ static int __init hugetlb_init(void)
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
+	pol = mpol_new(MPOL_INTERLEAVE, &node_memory_map);
+	if (IS_ERR(pol))
+		goto quit;
 	for (i = 0; i < max_huge_pages; ++i) {
-		if (!alloc_fresh_huge_page())
+		if (!alloc_fresh_huge_page(pol))
 			break;
 	}
+	mpol_free(pol);
 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
+quit:
 	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
 	return 0;
 }
@@ -232,10 +245,16 @@ static inline void try_to_free_low(unsigned long count)
 
 static unsigned long set_max_huge_pages(unsigned long count)
 {
+	struct mempolicy *pol;
+
+	pol = mpol_new(MPOL_INTERLEAVE, &node_memory_map);
+	if (IS_ERR(pol))
+		return nr_huge_pages;
 	while (count > nr_huge_pages) {
-		if (!alloc_fresh_huge_page())
-			return nr_huge_pages;
+		if (!alloc_fresh_huge_page(pol))
+			break;
 	}
+	mpol_free(pol);
 	if (count >= nr_huge_pages)
 		return nr_huge_pages;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 21458ca..c576d32 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -171,7 +171,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
@@ -1121,7 +1121,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
+unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned nid, next;
 	struct task_struct *me = current;

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 15:30                                             ` Lee Schermerhorn
  2007-06-13 17:58                                               ` Nishanth Aravamudan
@ 2007-06-13 22:49                                               ` Christoph Lameter
  1 sibling, 0 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-13 22:49 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Wed, 13 Jun 2007, Lee Schermerhorn wrote:

> alloc_fresh_huge_page() loop.  However, I think that to support all
> platforms in a generic way, alloc_pages_node() and
> alloc_page_interleave() [both take a node id arg] should be more strict
> when the gfp mask includes 'THISNODE and not assume that a populated
> node always has on-node memory in the zone of interest.  E.g., something
> like:

So a node with memory may have no memory in that particular zone.

This can only be true for DMA and DMA32. So we need a node_has_dma(node)?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 17:58                                               ` Nishanth Aravamudan
  2007-06-13 18:21                                                 ` Lee Schermerhorn
@ 2007-06-13 22:50                                                 ` Christoph Lameter
  2007-06-13 23:09                                                   ` Nishanth Aravamudan
  2007-06-14 14:23                                                   ` Lee Schermerhorn
  1 sibling, 2 replies; 140+ messages in thread
From: Christoph Lameter @ 2007-06-13 22:50 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On Wed, 13 Jun 2007, Nishanth Aravamudan wrote:

> I think your code above makes sense -- I'd still leave in the earlier
> check, though.
> 
> So it probably should be:
> 
> 	pgdat = NODE_DATA(nid);
> 	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);
> 
> 	if (unlikely((gfp_mask & __GFP_THISNODE) &&
> 		(!node_memory(nid) ||
> 		 zonelist->zones[0]->zone_pgdat != pgdat)))
> 		 return NULL;
> 
> That way, if the node has no memory whatsoever, we don't bother checking
> the pgdat of the relevant zone?

Checking the pgdat is already done in __alloc_pages. No need to repeat it 
here.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 18:21                                                 ` Lee Schermerhorn
  2007-06-13 19:01                                                   ` Nishanth Aravamudan
@ 2007-06-13 22:51                                                   ` Christoph Lameter
  2007-06-14 15:50                                                     ` Lee Schermerhorn
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-13 22:51 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Wed, 13 Jun 2007, Lee Schermerhorn wrote:

> Yep.  I'm testing the stack "as is" now.  If it doesn't spread the huge
> pages evenly because of our funky DMA-only node, I'll post a fix up
> patch for consideration.

Note that the memory from your DMA only node is allocated without 
requiring DMA memory. We just fall back in the allocation to DMA memory.
Thus you do not need special handling as far as I can tell.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 22:50                                                 ` Christoph Lameter
@ 2007-06-13 23:09                                                   ` Nishanth Aravamudan
  2007-06-13 23:12                                                     ` Christoph Lameter
  2007-06-14 14:23                                                   ` Lee Schermerhorn
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 23:09 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On 13.06.2007 [15:50:41 -0700], Christoph Lameter wrote:
> On Wed, 13 Jun 2007, Nishanth Aravamudan wrote:
> 
> > I think your code above makes sense -- I'd still leave in the earlier
> > check, though.
> > 
> > So it probably should be:
> > 
> > 	pgdat = NODE_DATA(nid);
> > 	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);
> > 
> > 	if (unlikely((gfp_mask & __GFP_THISNODE) &&
> > 		(!node_memory(nid) ||
> > 		 zonelist->zones[0]->zone_pgdat != pgdat)))
> > 		 return NULL;
> > 
> > That way, if the node has no memory whatsoever, we don't bother checking
> > the pgdat of the relevant zone?
> 
> Checking the pgdat is already done in __alloc_pages. No need to repeat
> it here.

Except that check is broken in the same way it is for memoryless nodes,
right?

from get_page_from_freelist():

                if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
                        zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))

Which asks if for this zone, is the first node the same as each node we look at
for THISNODE requests. But if the first node for the zone is a
*different* node, we still satisfy the request, but go off-node?

Just trying to see if that maybe is the problem here?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 23:09                                                   ` Nishanth Aravamudan
@ 2007-06-13 23:12                                                     ` Christoph Lameter
  2007-06-13 23:18                                                       ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-13 23:12 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On Wed, 13 Jun 2007, Nishanth Aravamudan wrote:

> > > That way, if the node has no memory whatsoever, we don't bother checking
> > > the pgdat of the relevant zone?
> > 
> > Checking the pgdat is already done in __alloc_pages. No need to repeat
> > it here.
> 
> Except that check is broken in the same way it is for memoryless nodes,
> right?
> 
> from get_page_from_freelist():
> 
>                 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
>                         zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
> 
> Which asks if for this zone, is the first node the same as each node we look at
> for THISNODE requests. But if the first node for the zone is a
> *different* node, we still satisfy the request, but go off-node?
> 
> Just trying to see if that maybe is the problem here?

Right. But we do not have the pgdat pointer available in alloc_pages. Thus 
Lee's check works in alloc_pages_node(). Hmmm... This gets pretty 
difficult to comprehend. Maybe there is another easier way to implement 
GFP_THISNODE?

The breakage of SLUB makes it pretty evident that if GFP_THISNODE returns 
NULL for a memoryless node then lots of

for_each_online_node()

loops in the VM that assume that an online node contains memory are no 
longer working properly. We need to review the VM and convert those loops
to use the node_memory_map.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 23:12                                                     ` Christoph Lameter
@ 2007-06-13 23:18                                                       ` Nishanth Aravamudan
  2007-06-13 23:26                                                         ` Christoph Lameter
  0 siblings, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 23:18 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On 13.06.2007 [16:12:49 -0700], Christoph Lameter wrote:
> On Wed, 13 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > > That way, if the node has no memory whatsoever, we don't bother checking
> > > > the pgdat of the relevant zone?
> > > 
> > > Checking the pgdat is already done in __alloc_pages. No need to repeat
> > > it here.
> > 
> > Except that check is broken in the same way it is for memoryless nodes,
> > right?
> > 
> > from get_page_from_freelist():
> > 
> >                 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
> >                         zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
> > 
> > Which asks if for this zone, is the first node the same as each node we look at
> > for THISNODE requests. But if the first node for the zone is a
> > *different* node, we still satisfy the request, but go off-node?
> > 
> > Just trying to see if that maybe is the problem here?
> 
> Right. But we do not have the pgdat pointer available in alloc_pages.
> Thus Lee's check works in alloc_pages_node().

Yep, exactly.

> Hmmm... This gets pretty difficult to comprehend. Maybe there is
> another easier way to implement GFP_THISNODE?

Well...maybe we can do better by just adding another GFP flag?

GFP_ONLYTHISNODE?

THISNODE has the current semantics, that the "closest" node is
preferred, which may be local, and it will succeed if memory exists
somewhere for the allocation you want (I think).

ONLYTHISNODE will return NULL if it has to go off-node for any reason.

> The breakage of SLUB makes it pretty evident that if GFP_THISNODE
> returns NULL for a memoryless node then lots of
> 
> for_each_online_node()
> 
> loops in the VM that assume that an online node contains memory are no
> longer working properly. We need to review the VM and convert those
> loops to use the node_memory_map.

That would avoid having to make these changes too.

Maybe with time, we can audit the users of THISNODE and move them over
to ONLYTHISNODE, as appropriate?

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 23:18                                                       ` Nishanth Aravamudan
@ 2007-06-13 23:26                                                         ` Christoph Lameter
  2007-06-13 23:56                                                           ` Nishanth Aravamudan
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-13 23:26 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On Wed, 13 Jun 2007, Nishanth Aravamudan wrote:

> Well...maybe we can do better by just adding another GFP flag?
> 
> GFP_ONLYTHISNODE?
> 
> THISNODE has the current semantics, that the "closest" node is
> preferred, which may be local, and it will succeed if memory exists
> somewhere for the allocation you want (I think).

No we want one GFP_THISNODE working in a consistent way.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 23:26                                                         ` Christoph Lameter
@ 2007-06-13 23:56                                                           ` Nishanth Aravamudan
  0 siblings, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-13 23:56 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On 13.06.2007 [16:26:15 -0700], Christoph Lameter wrote:
> On Wed, 13 Jun 2007, Nishanth Aravamudan wrote:
> 
> > Well...maybe we can do better by just adding another GFP flag?
> > 
> > GFP_ONLYTHISNODE?
> > 
> > THISNODE has the current semantics, that the "closest" node is
> > preferred, which may be local, and it will succeed if memory exists
> > somewhere for the allocation you want (I think).
> 
> No we want one GFP_THISNODE working in a consistent way.

Ok, I've started auditing things. I have a final exam tomorrow, however,
so probably won't make much progress before then.

I did notice that ia64/mm/discontig.c actually already tries to deal
with memoryless nodes, but all static to that file. See
memory_less_mask. Probably can be replaced via an inverted
node_memory_map.

Are you sure just the VM needs to be audited? I'm going to try the other
way around and look at GFP_THISNODE callers and go up from there.

Will let you know what I find.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 22:50                                                 ` Christoph Lameter
  2007-06-13 23:09                                                   ` Nishanth Aravamudan
@ 2007-06-14 14:23                                                   ` Lee Schermerhorn
  1 sibling, 0 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-14 14:23 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Wed, 2007-06-13 at 15:50 -0700, Christoph Lameter wrote:
> On Wed, 13 Jun 2007, Nishanth Aravamudan wrote:
> 
> > I think your code above makes sense -- I'd still leave in the earlier
> > check, though.
> > 
> > So it probably should be:
> > 
> > 	pgdat = NODE_DATA(nid);
> > 	zonelist = pgdat->node_zonelists + gfp_zone(gfp_mask);
> > 
> > 	if (unlikely((gfp_mask & __GFP_THISNODE) &&
> > 		(!node_memory(nid) ||
> > 		 zonelist->zones[0]->zone_pgdat != pgdat)))
> > 		 return NULL;
> > 
> > That way, if the node has no memory whatsoever, we don't bother checking
> > the pgdat of the relevant zone?
> 
> Checking the pgdat is already done in __alloc_pages. No need to repeat it 
> here.

As discussed in prior mail, that's too late given the check that's being
done.  Down in get_page_from_freelist(), where this check is made, we
don't have the node id nor pgdat from which the allocation is being
attempted.  The node id of the first zone in the list may already be
off-node [even tho' the node_memory_map says the node is populated/has
memory], so we can't rely on that.  

I suppose we could add the pgdat pointer to the zonelist itself or try
to back up to the original pgdat from the zonelist and
gfp_zone(gfp_mask).  Then we could do the check in
get_page_from_freelist() that each zone in the list is on-node.



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-13 22:51                                                   ` Christoph Lameter
@ 2007-06-14 15:50                                                     ` Lee Schermerhorn
  2007-06-14 15:57                                                       ` Christoph Lameter
  2007-06-14 16:09                                                       ` Nishanth Aravamudan
  0 siblings, 2 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-14 15:50 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Wed, 2007-06-13 at 15:51 -0700, Christoph Lameter wrote:
> On Wed, 13 Jun 2007, Lee Schermerhorn wrote:
> 
> > Yep.  I'm testing the stack "as is" now.  If it doesn't spread the huge
> > pages evenly because of our funky DMA-only node, I'll post a fix up
> > patch for consideration.
> 
> Note that the memory from your DMA only node is allocated without 
> requiring DMA memory. We just fall back in the allocation to DMA memory.
> Thus you do not need special handling as far as I can tell.

Just a note to clarify what was happening.  I already described the
zonelist selected by the gfp_zone for that node.  The first zone in the
list was on node 0, so every time the interleave cursor specified node 4,
I got a page on node0.  I ended up with twice as many huge pages on node
0 as any other node.  

Nish's code also got the accounting wrong when he changed
"nr_huge_pages_node[page_to_nid(page)]++;" to
"nr_huge_pages_node[nid]++;" in his "numafy several functions" patch.
This caused the total/free counts to get out of sync and the total count
on node 0 to go negative when I free the pages.  This won't happen if
alloc_pages_node() never returns off-node pages.  

On my particular config [number of nodes/amount of memory], if the dma
zone had been first in the list [old, "node order" zonelists], the
allocation would have failed because no page of the requested order
would have been available there.  alloc_pages_node() would have failed
because get_page_from_freelist() would have detected that the 2nd zone
was off node and bailed out.  The "right thing" would have happened here
because of the order of the allocation.  Regular page allocations would
succeed and consume all of DMA--why we added "node order" zonelists.
Also, on a larger config, there would be more DMA memory, so a few [2 or
3?] huge pages might come from the dma memory.

It's even more complicated:  I can configure the platform so that more
of the memory from each of the real nodes is available in the
pseudo-node that contains memory that is hardware interleaved at the
cache-line granularity--all the way up to 100% interleaved.   At 100%
interleaved, all of the real nodes become "memoryless" and all memory
exists in the pseudo-node.   Up to the 1st 4GB will be in zone DMA and
the remainder in zone NORMAL.  This interleaved memory has different
latency/bandwidth properties from node local memory, so in a mixed
local/interleaved configuration, I'd like to handle it separately--e.g.,
not automatically used for task memory interleaving.  It will never be
"local" to any cpu, so default policy won't allocate there.  I'd love to
make the default page cache policy prefer that node, or use it for a
data base shared global area, ...  

The point of all this is that, as you've pointed out, the original NUMA
and memory policy designs assumed a fairly symmetric system
configuration with all nodes populated with [similar amounts?] of
roughly equivalent memory.  That probably describes a majority of NUMA
systems, so the system should handle this well, as a default.  We still
need to be able to handle the less symmetric configs--with boot
parameters, sysctls, cpusets, ...--that specify non-default behavior,
and cause the generic code to do the right thing.  Certainly, the
generic code can't "fall over and die" in the presence of memoryless
nodes or other "interesting" configurations.

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-14 15:50                                                     ` Lee Schermerhorn
@ 2007-06-14 15:57                                                       ` Christoph Lameter
  2007-06-14 16:54                                                         ` Lee Schermerhorn
  2007-06-14 16:09                                                       ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-14 15:57 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Thu, 14 Jun 2007, Lee Schermerhorn wrote:

> The point of all this is that, as you've pointed out, the original NUMA
> and memory policy designs assumed a fairly symmetric system
> configuration with all nodes populated with [similar amounts?] of
> roughly equivalent memory.  That probably describes a majority of NUMA
> systems, so the system should handle this well, as a default.  We still
> need to be able to handle the less symmetric configs--with boot
> parameters, sysctls, cpusets, ...--that specify non-default behavior,
> and cause the generic code to do the right thing.  Certainly, the
> generic code can't "fall over and die" in the presence of memoryless
> nodes or other "interesting" configurations.

The hugepage distribution issues have to be handled by the hugepage code. 
There is no point in adding inconsistencies in the definition of a 
memoryless node to satisfy hugepage distribution issues on one platform.

The memoryless node handling addresses one particular asymmetry: No 
memory vs. some memory. The fine grained stuff that relates to particular 
page types (like I do not want hugepages on my DMA node...) have to be 
handled by the management of that particular page type. Here we need some 
control over huge page distribution. There is already another case where 
we may need to control the nodes that slab uses for its allocations. The 
slab node restrictions have to be handled by the slab code. Same thing for 
hugepages.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-14 15:50                                                     ` Lee Schermerhorn
  2007-06-14 15:57                                                       ` Christoph Lameter
@ 2007-06-14 16:09                                                       ` Nishanth Aravamudan
  2007-06-14 16:15                                                         ` Christoph Lameter
  1 sibling, 1 reply; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-14 16:09 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Christoph Lameter, anton, akpm, linux-mm

On 14.06.2007 [11:50:47 -0400], Lee Schermerhorn wrote:
> On Wed, 2007-06-13 at 15:51 -0700, Christoph Lameter wrote:
> > On Wed, 13 Jun 2007, Lee Schermerhorn wrote:
> > 
> > > Yep.  I'm testing the stack "as is" now.  If it doesn't spread the huge
> > > pages evenly because of our funky DMA-only node, I'll post a fix up
> > > patch for consideration.
> > 
> > Note that the memory from your DMA only node is allocated without 
> > requiring DMA memory. We just fall back in the allocation to DMA memory.
> > Thus you do not need special handling as far as I can tell.
> 
> Just a note to clarify what was happening.  I already described the
> zonelist selected by the gfp_zone for that node.  The first zone in
> the list was on node 0, so everytime the interleave cursor specified
> node 4, I got a page on node0.  I ended up with twice as many huge
> pages on node 0 as any other node.  
> 
> Nish's code also got the accounting wrong when he changed
> "nr_huge_pages_node[page_to_nid(page)]++;" to
> "nr_huge_pages_node[nid]++;" in his "numafy several functions" patch.
> This caused the total/free counts to get out of sync and the total
> count on node 0 to go negative when I free the pages.  This won't
> happen if alloc_pages_node() never returns off-node pages.  

Yep, that last sentence is the key. Regardless of NUMA layout, I would
like to rely (and I believe these are the semantics we are striving for)
on GFP_THISNODE allocations only returning pages on the node. Perhaps
we should add some WARN_ON()'s to the VM so any modifications that break
this assumption will be detected quickly? e.g.

	WARN_ON(page_to_nid(page) != nid)

<snip>

> The point of all this is that, as you've pointed out, the original
> NUMA and memory policy designs assumed a fairly symmetric system
> configuration with all nodes populated with [similar amounts?] of
> roughly equivalent memory.  That probably describes a majority of NUMA
> systems, so the system should handle this well, as a default.  We
> still need to be able to handle the less symmetric configs--with boot
> parameters, sysctls, cpusets, ...--that specify non-default behavior,
> and cause the generic code to do the right thing.  Certainly, the
> generic code can't "fall over and die" in the presence of memoryless
> nodes or other "interesting" configurations.

Agreed,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-14 16:09                                                       ` Nishanth Aravamudan
@ 2007-06-14 16:15                                                         ` Christoph Lameter
  2007-06-14 17:07                                                           ` Lee Schermerhorn
  0 siblings, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-14 16:15 UTC (permalink / raw)
  To: Nishanth Aravamudan; +Cc: Lee Schermerhorn, anton, akpm, linux-mm

On Thu, 14 Jun 2007, Nishanth Aravamudan wrote:

> > The point of all this is that, as you've pointed out, the original
> > NUMA and memory policy designs assumed a fairly symmetric system
> > configuration with all nodes populated with [similar amounts?] of
> > roughly equivalent memory.  That probably describes a majority of NUMA
> > systems, so the system should handle this well, as a default.  We
> > still need to be able to handle the less symmetric configs--with boot
> > parameters, sysctls, cpusets, ...--that specify non-default behavior,
> > and cause the generic code to do the right thing.  Certainly, the
> > generic code can't "fall over and die" in the presence of memoryless
> > nodes or other "interesting" configurations.
> 
> Agreed,
> Nish

The generic code currently does not fail. It (slab allocators etc) simply 
gets memory that it thinks comes from a memoryless node but it came from a 
neighboring node.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-14 15:57                                                       ` Christoph Lameter
@ 2007-06-14 16:54                                                         ` Lee Schermerhorn
  0 siblings, 0 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-14 16:54 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Thu, 2007-06-14 at 08:57 -0700, Christoph Lameter wrote:
> On Thu, 14 Jun 2007, Lee Schermerhorn wrote:
> 
> > The point of all this is that, as you've pointed out, the original NUMA
> > and memory policy designs assumed a fairly symmetric system
> > configuration with all nodes populated with [similar amounts?] of
> > roughly equivalent memory.  That probably describes a majority of NUMA
> > systems, so the system should handle this well, as a default.  We still
> > need to be able to handle the less symmetric configs--with boot
> > parameters, sysctls, cpusets, ...--that specify non-default behavior,
> > and cause the generic code to do the right thing.  Certainly, the
> > generic code can't "fall over and die" in the presence of memoryless
> > nodes or other "interesting" configurations.
> 
> The hugepage distribution issues have to be handled by the hugepage code. 
> There is no point in adding inconsistencies in the definition of a 
> memoryless node to satisfy hugepage distribution issues on one platform.

I don't disagree.  I originally tried to fix this in the hugetlb
allocation code.  But, I was using zonelist internal knowledge [ensuring
that the first zone was on-node], but I recall that both you and Nish
didn't like this--huge page code having knowledge of zonelist internals.
That led me off to defining a node_populated_map that had the right
semantics for hugetlb fresh page allocation [for my platform, anyway].
Then, we started using the node_populated_map for other things and it
evolved to where alloc_pages_node() can leak off-node pages for some
platforms [mine :-(].  


> The memoryless node handling addresses one particular assymmetry: No 
> memory vs. some memory. The fine grained stuff that relates to particular 
> page types (like I do not want hugepages on my DMA node...) have to be 
> handled by the management of that particular page type. Here we need some 
> control over huge page distribution. There is already another case where 
> we may need to control the nodes that slab uses for its allocations. The 
> slab node restrictions have to be handled by the slab code. Same thing for 
> hugepages.
> 

If we agree that I can filter off-node pages in
alloc_fresh_huge_page_node(), freeing a page and returning NULL if it's
off-node, that will solve the problem for huge page setup.  

I'll try that atop your latest patch stream, once Nish has reposted his
huge page allocation set.

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-14 16:15                                                         ` Christoph Lameter
@ 2007-06-14 17:07                                                           ` Lee Schermerhorn
  2007-06-14 17:16                                                             ` Christoph Lameter
  2007-06-14 22:35                                                             ` Nishanth Aravamudan
  0 siblings, 2 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-14 17:07 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Thu, 2007-06-14 at 09:15 -0700, Christoph Lameter wrote:
> On Thu, 14 Jun 2007, Nishanth Aravamudan wrote:
> 
> > > The point of all this is that, as you've pointed out, the original
> > > NUMA and memory policy designs assumed a fairly symmetric system
> > > configuration with all nodes populated with [similar amounts?] of
> > > roughly equivalent memory.  That probably describes a majority of NUMA
> > > systems, so the system should handle this well, as a default.  We
> > > still need to be able to handle the less symmetric configs--with boot
> > > parameters, sysctls, cpusets, ...--that specify non-default behavior,
> > > and cause the generic code to do the right thing.  Certainly, the
> > > generic code can't "fall over and die" in the presence of memoryless
> > > nodes or other "interesting" configurations.
> > 
> > Agreed,
> > Nish
> 
> The generic code currently does not fail. It (slab allocators etc) simply 
> gets memory that it thinks comes from a memoryless node but it came from a 
> neighboring node.
> 

If it (slab allocators etc) wants and/or can use memory from a different
node from what it requested, then, it shouldn't be calling with
GFP_THISNODE, right?  I mean what's the point?  If GFP_THISNODE never
returned off-node memory, then one couldn't use it without checking for
and dealing with failure.  And, 'THISNODE allocations CAN fail, when the
first zone in the selected zonelist is empty and subsequent zones are
off-node.  __alloc_pages() et al WILL fail this case and return NULL, so
callers must be prepared to deal with it--even [especially?] early boot
code, IMO, anyway.

Again, I'll try your latest stack with Nish's patches and see what
happens.

Lee




--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-14 17:07                                                           ` Lee Schermerhorn
@ 2007-06-14 17:16                                                             ` Christoph Lameter
  2007-06-14 18:04                                                               ` Lee Schermerhorn
  2007-06-14 22:35                                                             ` Nishanth Aravamudan
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-06-14 17:16 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Thu, 14 Jun 2007, Lee Schermerhorn wrote:

> If it (slab allocators etc) wants and/or can use memory from a different
> node from what it requested, then, it shouldn't be calling with
> GFP_THISNODE, right?  I mean what's the point?  If GFP_THISNODE never

The code wanted memory from a certain node because a certain structure is 
performance sensitive and it did get something else. Both slab and slub 
will fail at some point when trying to touch the structure that was not 
allocated.

> returned off-node memory, then one couldn't use it without checking for
> and dealing with failure.  And, 'THISNODE allocations CAN fail, when the

GFP_THISNODE *never* should return off node memory. That it happened is 
due to people not reviewing the VM as I told them to when we started 
allowing memoryless nodes in the core VM.

> first zone in the selected zonelist is empty and subsequent zones are
> off-node.  __alloc_pages() et al WILL fail this case and return NULL, so
> callers must be prepared to deal with it--even [especially?] early boot
> code, IMO, anyway.

Bootstrap is a special case. It is a reasonable expectation to find memory 
on nodes that have memory (i.e. formerly online nodes were guaranteed to 
have memory; now we guarantee that for "memory nodes").

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-14 17:16                                                             ` Christoph Lameter
@ 2007-06-14 18:04                                                               ` Lee Schermerhorn
  0 siblings, 0 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-06-14 18:04 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Nishanth Aravamudan, anton, akpm, linux-mm

On Thu, 2007-06-14 at 10:16 -0700, Christoph Lameter wrote:
> On Thu, 14 Jun 2007, Lee Schermerhorn wrote:
> 
> > If it (slab allocators etc) wants and/or can use memory from a different
> > node from what it requested, then, it shouldn't be calling with
> > GFP_THISNODE, right?  I mean what's the point?  If GFP_THISNODE never
> 
> The code wanted memory from a certain node because a certain structure is 
> performance sensitive and it did get something else. 

Yes, and if they're fine with that, why did they need to specify
'THISNODE?  They called alloc_pages_node() which effectively "prefers"
the specified node without the 'THISNODE flag.  If they're willing to
fallback to off-node allocations, just drop the THISNODE flag...

> Both slab and slub 
> will fail at some point when trying to touch the structure that was not 
> allocated.

Because they failed to check the return from an allocation?

> 
> > returned off-node memory, then one couldn't use it without checking for
> > and dealing with failure.  And, 'THISNODE allocations CAN fail, when the
> 
> GFP_THISNODE *never* should return off node memory. 

I agree!  But the current [generic] code can and will for some hardware
configurations and zonelist order.

> That it happened is 
> due to people not reviewing the VM as I told them to when we starting 
> allowing memoryless nodes in the core VM.

Certainly, more reviewing is a good thing.  My review shows that an
allocation with 'THISNODE specified WILL return off-node memory if the
specified zonelist has an off-node zone in the first slot and some
on-node memory in lower zones later in the list [node zonelist order].
This configuration does occur.  

I think the disconnect is in whether we want THISNODE allocations to
attempt to look past any higher off-node zones and return on-node memory
from a lower [DMA/32] zone.  Sounds like this is what you want, but the
current implementation doesn't do that either, when the zonelists are in
node order.  It returns off-node memory for the first zone in the list,
if any.  If the allocation can't be satisfied from the first zone, it
looks at the next.  If the second or subsequent zones are off-node, it
WILL fail, and never get to the on-node DMA/32 zone.   

Idea:  in get_page_from_freelist() [called from __alloc_pages()]:

1) enhance to add check that first zone in zonelist is also
on-node--something it doesn't do now.

2) instead of bailing out when 'THISNODE is set and we encounter an
off-node zone, keep scanning the zonelist for on-node zones.  Bail out
only if we hit the end of the list and haven't satisfied the request.

This will handle zonelists in node order with no local memory in the
requested zone.  'THISNODE allocations can still fail, once the lower
order local zone is exhausted, but at least it won't return off-node
memory.

I'll still have an issue with huge pages coming from the DMA zone for
some configurations, but I can look at tackling that from another
direction.

> 
> > first zone in the selected zonelist is empty and subsequent zones are
> > off-node.  __alloc_pages() et al WILL fail this case and return NULL, so
> > callers must be prepared to deal with it--even [especially?] early boot
> > code, IMO, anyway.
> 
> Bootstrap is a special case. It is a reasonable expectation to find memory 
> on nodes that have memory (i.e. formerly online nodes were guaranteed to 
> have memory now we guarantee that for "memory nodes").

Whether that expectation is reasonable seems to be configuration
dependent.  I still think you should be able to handle allocation
failures when setting up slub caches at boot time.  A BUG_ON in the boot
path is very unfriendly--system stops dead.  Just treat the node as
memoryless from the slab/slub viewpoint if the allocation fails; or
explicitly allocate slab/slub resources from a nearby node by dropping
the 'THISNODE' [if THISNODE behaved consistently...] and retrying.  Just
don't freeze up the machine.

Later,
Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH] populated_map: fix !NUMA case, remove comment
  2007-06-14 17:07                                                           ` Lee Schermerhorn
  2007-06-14 17:16                                                             ` Christoph Lameter
@ 2007-06-14 22:35                                                             ` Nishanth Aravamudan
  1 sibling, 0 replies; 140+ messages in thread
From: Nishanth Aravamudan @ 2007-06-14 22:35 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Christoph Lameter, anton, akpm, linux-mm

On 14.06.2007 [13:07:52 -0400], Lee Schermerhorn wrote:
> On Thu, 2007-06-14 at 09:15 -0700, Christoph Lameter wrote:
> > On Thu, 14 Jun 2007, Nishanth Aravamudan wrote:
> > 
> > > > The point of all this is that, as you've pointed out, the original
> > > > NUMA and memory policy designs assumed a fairly symmetric system
> > > > configuration with all nodes populated with [similar amounts?] of
> > > > roughly equivalent memory.  That probably describes a majority of NUMA
> > > > systems, so the system should handle this well, as a default.  We
> > > > still need to be able to handle the less symmetric configs--with boot
> > > > parameters, sysctls, cpusets, ...--that specify non-default behavior,
> > > > and cause the generic code to do the right thing.  Certainly, the
> > > > generic code can't "fall over and die" in the presence of memoryless
> > > > nodes or other "interesting" configurations.
> > > 
> > > Agreed,
> > > Nish
> > 
> > The generic code currently does not fail. It (slab allocators etc) simply 
> > gets memory that it thinks comes from a memoryless node but it came from a 
> > neighboring node.
> > 
> 
> If it (slab allocators etc) wants and/or can use memory from a
> different node from what it requested, then, it shouldn't be calling
> with GFP_THISNODE, right?  I mean what's the point?  If GFP_THISNODE
> never returned off-node memory, then one couldn't use it without
> checking for and dealing with failure.  And, 'THISNODE allocations CAN
> fail, when the first zone in the selected zonelist is empty and
> subsequent zones are off-node.  __alloc_pages() et al WILL fail this
> case and return NULL, so callers must be prepared to deal with
> it--even [especially?] early boot code, IMO, anyway.
> 
> Again, I'll try your latest stack with Nish's patches and see what
> happens.

The three latest versions of my patches appear to apply cleanly on top
of Christoph's set of 10.

Am starting testing now.

Thanks,
Nish

-- 
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v4][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-06-13 20:05                                     ` Lee Schermerhorn
  2007-06-13 20:29                                       ` Nishanth Aravamudan
@ 2007-07-23 19:23                                       ` Christoph Lameter
  2007-07-23 20:14                                         ` Lee Schermerhorn
  1 sibling, 1 reply; 140+ messages in thread
From: Christoph Lameter @ 2007-07-23 19:23 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Nishanth Aravamudan, William Lee Irwin III, anton, akpm, linux-mm

On Wed, 13 Jun 2007 16:05:10 -0400
Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:

> I tried to "tighten up"  alloc_pages_node() to check the location of
> the first zone in the selected zonelist, as discussed in previous
> exchange. When I do this, I hit a BUG() in slub.c in
> early_kmem_cache_node_alloc(), as it apparently can't handle
> new_slab() returning a NULL page, even tho' it calls it with
> GFP_THISNODE.  Slub should be able to handle memoryless nodes,
> right?  I'm looking for a work around to this now.

The memoryless node patchset results in SLUB not attempting to allocate
on memoryless nodes during bootstrap.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

* Re: [PATCH v4][RFC] hugetlb: add per-node nr_hugepages sysfs attribute
  2007-07-23 19:23                                       ` Christoph Lameter
@ 2007-07-23 20:14                                         ` Lee Schermerhorn
  0 siblings, 0 replies; 140+ messages in thread
From: Lee Schermerhorn @ 2007-07-23 20:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nishanth Aravamudan, William Lee Irwin III, anton, akpm, linux-mm

On Mon, 2007-07-23 at 12:23 -0700, Christoph Lameter wrote:
> On Wed, 13 Jun 2007 16:05:10 -0400
> Lee Schermerhorn <Lee.Schermerhorn@hp.com> wrote:
> 
> > I tried to "tighten up"  alloc_pages_node() to check the location of
> > the first zone in the selected zonelist, as discussed in previous
> > exchange. When I do this, I hit a BUG() in slub.c in
> > early_kmem_cache_node_alloc(), as it apparently can't handle
> > new_slab() returning a NULL page, even tho' it calls it with
> > GFP_THISNODE.  Slub should be able to handle memoryless nodes,
> > right?  I'm looking for a work around to this now.
> 
> The memoryless node patchset results in SLUB not attempting to allocate
> on memoryless nodes during bootstrap.
> 

Christoph:

The message that you're responding to is from 13 June, before your
memoryless nodes patch.  We discussed it and have more or less resolved
it.  I was trying to ensure that GFP_THISNODE would fail on my funky
interleaved node with just DMA memory, when you ask for a higher zone.
I.e., no fallback.  You disagreed with this, so I'm waiting for the
memoryless nodes patches to get into -mm, so I can address the issue of
hugepages [and regular interleaved pages] being allocated from a node
where they shouldn't be on my platform.

This has been discussed in the past week by Nish, Paul Mundt, and others
in the -mm thread:

	[hugetlb] Try to grow pool for MAP_SHARED mappings

I think we can handle the fundamental issue [even nodes with memory are
not necessarily candidates for interleave, hugepages, ...] by adding
another node_state[].  See the mentioned thread.

Lee

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 140+ messages in thread

end of thread, other threads:[~2007-07-23 20:14 UTC | newest]

Thread overview: 140+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-06-11 20:27 [PATCH] Add populated_map to account for memoryless nodes Nishanth Aravamudan, Lee Schermerhorn
2007-06-11 21:25 ` Christoph Lameter
2007-06-11 22:10   ` [PATCH v2] " Nishanth Aravamudan
2007-06-11 22:42     ` Christoph Lameter
2007-06-11 22:52       ` [PATCH v3] " Nishanth Aravamudan
2007-06-11 23:00         ` Christoph Lameter
2007-06-11 23:41           ` [PATCH v4] " Nishanth Aravamudan
2007-06-11 23:45             ` Christoph Lameter
2007-06-12  0:07               ` [PATCH] populated_map: fix !NUMA case, remove comment Nishanth Aravamudan
2007-06-12  0:41                 ` Christoph Lameter
2007-06-12  1:43                   ` Nishanth Aravamudan
2007-06-12  1:45                     ` Christoph Lameter
2007-06-12  1:52                       ` Nishanth Aravamudan
2007-06-12  2:39                       ` Nishanth Aravamudan
2007-06-12  2:02                   ` Nishanth Aravamudan
2007-06-12  2:20                     ` Christoph Lameter
2007-06-12  2:32                       ` Nishanth Aravamudan
2007-06-12  2:54                         ` Christoph Lameter
2007-06-12  3:20                           ` Nishanth Aravamudan
2007-06-12  3:21                             ` Christoph Lameter
2007-06-12  3:31                               ` Nishanth Aravamudan
2007-06-12 15:06                             ` Lee Schermerhorn
2007-06-12 17:28                               ` Nishanth Aravamudan
2007-06-12 18:43                                 ` Christoph Lameter
2007-06-12 18:48                                 ` Lee Schermerhorn
2007-06-12 18:51                                   ` Christoph Lameter
2007-06-12 19:44                                     ` Lee Schermerhorn
2007-06-12 19:48                                       ` Christoph Lameter
2007-06-12 19:58                                         ` Christoph Lameter
2007-06-12 20:01                                           ` Nishanth Aravamudan
2007-06-13 15:30                                             ` Lee Schermerhorn
2007-06-13 17:58                                               ` Nishanth Aravamudan
2007-06-13 18:21                                                 ` Lee Schermerhorn
2007-06-13 19:01                                                   ` Nishanth Aravamudan
2007-06-13 22:51                                                   ` Christoph Lameter
2007-06-14 15:50                                                     ` Lee Schermerhorn
2007-06-14 15:57                                                       ` Christoph Lameter
2007-06-14 16:54                                                         ` Lee Schermerhorn
2007-06-14 16:09                                                       ` Nishanth Aravamudan
2007-06-14 16:15                                                         ` Christoph Lameter
2007-06-14 17:07                                                           ` Lee Schermerhorn
2007-06-14 17:16                                                             ` Christoph Lameter
2007-06-14 18:04                                                               ` Lee Schermerhorn
2007-06-14 22:35                                                             ` Nishanth Aravamudan
2007-06-13 22:50                                                 ` Christoph Lameter
2007-06-13 23:09                                                   ` Nishanth Aravamudan
2007-06-13 23:12                                                     ` Christoph Lameter
2007-06-13 23:18                                                       ` Nishanth Aravamudan
2007-06-13 23:26                                                         ` Christoph Lameter
2007-06-13 23:56                                                           ` Nishanth Aravamudan
2007-06-14 14:23                                                   ` Lee Schermerhorn
2007-06-13 22:49                                               ` Christoph Lameter
2007-06-12 19:55                                       ` Nishanth Aravamudan
2007-06-12 18:41                               ` Christoph Lameter
2007-06-12 19:07                                 ` Lee Schermerhorn
2007-06-12 19:13                                   ` Christoph Lameter
2007-06-11 23:08         ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Nishanth Aravamudan
2007-06-11 23:10           ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
2007-06-11 23:11             ` [PATCH][RFC] hugetlb: numafy several functions Nishanth Aravamudan
2007-06-11 23:13               ` [PATCH][RFC] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
2007-06-11 23:40                 ` Christoph Lameter
2007-06-11 23:42                 ` Christoph Lameter
2007-06-12  0:19                   ` Nishanth Aravamudan
2007-06-12  0:43                     ` Christoph Lameter
2007-06-12  2:19                   ` Nishanth Aravamudan
2007-06-12  2:22                     ` Christoph Lameter
2007-06-12  2:34                       ` Nishanth Aravamudan
2007-06-11 23:38               ` [PATCH][RFC] hugetlb: numafy several functions Christoph Lameter
2007-06-11 23:17             ` [PATCH v6][RFC] Fix hugetlb pool allocation with empty nodes Christoph Lameter
2007-06-12  0:15               ` Nishanth Aravamudan
2007-06-12  0:47                 ` Christoph Lameter
2007-06-12  2:12                   ` Nishanth Aravamudan
2007-06-12  2:21                     ` Christoph Lameter
2007-06-12  2:25                       ` Christoph Lameter
2007-06-12  2:34                         ` Nishanth Aravamudan
2007-06-12  2:55                           ` Christoph Lameter
2007-06-12  3:17                             ` Nishanth Aravamudan
2007-06-12  3:19                               ` Christoph Lameter
2007-06-12  3:30                                 ` Nishanth Aravamudan
2007-06-12  3:48                                   ` Christoph Lameter
2007-06-12  5:07                                     ` Nishanth Aravamudan
2007-06-12 18:47                                       ` Christoph Lameter
2007-06-12 17:43                                     ` Nishanth Aravamudan
2007-06-12 18:49                                       ` Christoph Lameter
2007-06-12  2:33                       ` Nishanth Aravamudan
2007-06-12  3:44                 ` William Lee Irwin III
2007-06-12  3:50                   ` Christoph Lameter
2007-06-12  3:53                     ` William Lee Irwin III
2007-06-12  3:53                       ` Christoph Lameter
2007-06-12  4:14                         ` William Lee Irwin III
2007-06-12  5:09                   ` Nishanth Aravamudan
2007-06-12  5:15                     ` William Lee Irwin III
2007-06-12 17:36                       ` Nishanth Aravamudan
2007-06-12 18:50                         ` Christoph Lameter
2007-06-12 17:45                       ` Nishanth Aravamudan
2007-06-12 19:13                         ` William Lee Irwin III
2007-06-13  0:04                           ` [PATCH v7][RFC] " Nishanth Aravamudan
2007-06-13 15:26                             ` [PATCH v3][RFC] hugetlb: numafy several functions Nishanth Aravamudan
2007-06-13 15:28                               ` [PATCH v3][RFC] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
2007-06-13 18:23                                 ` Lee Schermerhorn
2007-06-13 19:19                                   ` [PATCH v4][RFC] " Nishanth Aravamudan
2007-06-13 20:05                                     ` Lee Schermerhorn
2007-06-13 20:29                                       ` Nishanth Aravamudan
2007-06-13 21:02                                         ` Lee Schermerhorn
2007-07-23 19:23                                       ` Christoph Lameter
2007-07-23 20:14                                         ` Lee Schermerhorn
2007-06-13 21:04                             ` [PATCH v7][RFC] Fix hugetlb pool allocation with empty nodes Lee Schermerhorn
2007-06-13 21:50                               ` [PATCH v7][UPDATE][RFC] " Nishanth Aravamudan
2007-06-12 14:28               ` [PATCH v6][RFC] " Lee Schermerhorn
2007-06-11 23:15           ` [PATCH][RFC] Fix INTERLEAVE with memoryless nodes Christoph Lameter
2007-06-12  0:14             ` [PATCH v2][RFC] " Nishanth Aravamudan
2007-06-12  0:42               ` Christoph Lameter
2007-06-12  0:57               ` Andrew Morton
2007-06-12  1:12                 ` Christoph Lameter
2007-06-12  1:41                 ` Nishanth Aravamudan
2007-06-12  1:52                   ` Andrew Morton
2007-06-12  2:03                     ` Nishanth Aravamudan
2007-06-12 14:19       ` [PATCH v2] Add populated_map to account for " Lee Schermerhorn
2007-06-12 17:32         ` Nishanth Aravamudan
2007-06-12 18:45         ` Christoph Lameter
2007-06-12 19:17           ` Lee Schermerhorn
2007-06-12 19:22             ` Christoph Lameter
2007-06-12 19:49               ` Nishanth Aravamudan
2007-06-12 19:51                 ` Christoph Lameter
2007-06-12 20:00                   ` Nishanth Aravamudan
2007-06-12 20:03                     ` Christoph Lameter
2007-06-12 20:10                     ` Christoph Lameter
2007-06-12 19:52                 ` Christoph Lameter
2007-06-12 19:58                   ` Christoph Lameter
2007-06-12 20:00                   ` Nishanth Aravamudan
2007-06-12 20:06                     ` Christoph Lameter
2007-06-12 14:10   ` [PATCH] " Lee Schermerhorn
2007-06-12 17:35     ` Nishanth Aravamudan
2007-06-12 18:39       ` Christoph Lameter
2007-06-12 18:54         ` Lee Schermerhorn
2007-06-12 19:00           ` Christoph Lameter
2007-06-12  2:27 ` KAMEZAWA Hiroyuki
2007-06-12  2:46   ` Nishanth Aravamudan
2007-06-12  2:53   ` Christoph Lameter
2007-06-12  3:04     ` KAMEZAWA Hiroyuki

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.