* [PATCH v8] Fix hugetlb pool allocation with empty nodes
@ 2007-07-14 20:37 Nishanth Aravamudan
2007-07-14 20:41 ` [PATCH 2/3] hugetlb: numafy several functions Nishanth Aravamudan
2007-07-14 20:41 ` [PATCH v8] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
0 siblings, 2 replies; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-07-14 20:37 UTC (permalink / raw)
To: clameter; +Cc: anton, lee.schermerhorn, wli, kxr, akpm, linux-mm
Fix hugetlb pool allocation with empty nodes
Anton found a problem with the hugetlb pool allocation when some nodes
have no memory (http://marc.info/?l=linux-mm&m=118133042025995&w=2). Lee
worked on versions that tried to fix it, but none were accepted.
Christoph has created a set of patches which allow for GFP_THISNODE
allocations to fail if the node has no memory and for exporting a
node_memory_map indicating which nodes have memory. Since mempolicy.c
already has a number of functions which support interleaving, create a
mempolicy when we invoke alloc_fresh_huge_page() that specifies
interleaving across all the nodes in node_memory_map, rather than keeping
custom interleaving code in hugetlb.c. This requires adding some dummy
functions and declarations to mempolicy.h so the code compiles with both
NUMA and !NUMA.
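For readers without mm/mempolicy.c handy: interleave_nodes(), which this patch
exports and only the first lines of which appear in the mempolicy.c hunk below,
is roughly the following in this era of the kernel -- a per-task round-robin
cursor over the policy's node mask. Treat this as a paraphrase for illustration,
not the exact upstream source:

	/*
	 * Rough paraphrase of mm/mempolicy.c::interleave_nodes() circa 2.6.22:
	 * return the node to use for this allocation and advance the calling
	 * task's interleave cursor (il_next) through the policy's node mask.
	 */
	unsigned interleave_nodes(struct mempolicy *policy)
	{
		unsigned nid, next;
		struct task_struct *me = current;

		nid = me->il_next;
		next = next_node(nid, policy->v.nodes);
		if (next >= MAX_NUMNODES)
			next = first_node(policy->v.nodes);
		me->il_next = next;
		return nid;
	}

Each call hands back the next node in the mask, which is what lets
alloc_fresh_huge_page() below walk the nodes with memory without duplicating
round-robin state in hugetlb.c.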
Compile tested on x86, x86_64 and ppc64. Run tested on 4-node x86-64 (no
memoryless nodes), non-NUMA x86 and 4-node ppc64 (2 memoryless nodes).
Depends on Christoph's memoryless node patch stack to guarantee THISNODE
allocations stay on the requested node.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Keith Rich <kxr@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mempolicy.h | 14 ++++++++++++++
mm/hugetlb.c | 39 +++++++++++++++++++++++++++++----------
mm/mempolicy.c | 4 ++--
3 files changed, 45 insertions(+), 12 deletions(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 9e1734a..6d7099c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -76,6 +76,8 @@ struct mempolicy {
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
*/
+extern struct mempolicy *mpol_new(int mode, nodemask_t *nodes);
+
extern void __mpol_free(struct mempolicy *pol);
static inline void mpol_free(struct mempolicy *pol)
{
@@ -162,6 +164,8 @@ static inline void check_highest_zone(enum zone_type k)
policy_zone = k;
}
+extern unsigned interleave_nodes(struct mempolicy *policy);
+
int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
@@ -177,6 +181,11 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
#define mpol_set_vma_default(vma) do {} while(0)
+static inline struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+{
+ return NULL;
+}
+
static inline void mpol_free(struct mempolicy *p)
{
}
@@ -259,6 +268,11 @@ static inline int do_migrate_pages(struct mm_struct *mm,
static inline void check_highest_zone(int k)
{
}
+
+static inline unsigned interleave_nodes(struct mempolicy *policy)
+{
+ return 0;
+}
#endif /* CONFIG_NUMA */
#endif /* __KERNEL__ */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 858c0b3..1cd3118 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -103,15 +103,22 @@ static void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
}
-static int alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(struct mempolicy *policy)
{
- static int nid = 0;
+ int nid;
struct page *page;
- page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
- HUGETLB_PAGE_ORDER);
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
+ int start_nid = interleave_nodes(policy);
+
+ nid = start_nid;
+
+ do {
+ page = alloc_pages_node(nid,
+ htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
+ HUGETLB_PAGE_ORDER);
+ if (page)
+ break;
+ nid = interleave_nodes(policy);
+ } while (nid != start_nid);
if (page) {
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
@@ -153,6 +160,7 @@ fail:
static int __init hugetlb_init(void)
{
unsigned long i;
+ struct mempolicy *pol;
if (HPAGE_SHIFT == 0)
return 0;
@@ -160,11 +168,16 @@ static int __init hugetlb_init(void)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&hugepage_freelists[i]);
+ pol = mpol_new(MPOL_INTERLEAVE, &node_states[N_MEMORY]);
+ if (IS_ERR(pol))
+ goto quit;
for (i = 0; i < max_huge_pages; ++i) {
- if (!alloc_fresh_huge_page())
+ if (!alloc_fresh_huge_page(pol))
break;
}
+ mpol_free(pol);
max_huge_pages = free_huge_pages = nr_huge_pages = i;
+quit:
printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
return 0;
}
@@ -232,10 +245,16 @@ static inline void try_to_free_low(unsigned long count)
static unsigned long set_max_huge_pages(unsigned long count)
{
+ struct mempolicy *pol;
+
+ pol = mpol_new(MPOL_INTERLEAVE, &node_states[N_MEMORY]);
+ if (IS_ERR(pol))
+ return nr_huge_pages;
while (count > nr_huge_pages) {
- if (!alloc_fresh_huge_page())
- return nr_huge_pages;
+ if (!alloc_fresh_huge_page(pol))
+ break;
}
+ mpol_free(pol);
if (count >= nr_huge_pages)
return nr_huge_pages;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d401414..6ccd658 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -169,7 +169,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
}
/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
struct mempolicy *policy;
@@ -1122,7 +1122,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
}
/* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
+unsigned interleave_nodes(struct mempolicy *policy)
{
unsigned nid, next;
struct task_struct *me = current;
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* [PATCH 2/3] hugetlb: numafy several functions
2007-07-14 20:37 [PATCH v8] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
@ 2007-07-14 20:41 ` Nishanth Aravamudan
2007-07-14 20:43 ` [PATCH 3/3] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
2007-07-14 20:41 ` [PATCH v8] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
1 sibling, 1 reply; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-07-14 20:41 UTC (permalink / raw)
To: clameter; +Cc: lee.schermerhorn, wli, akpm, linux-mm
Add node-parameterized helpers for dequeue_huge_page,
alloc_fresh_huge_page and try_to_free_low. Also have
update_and_free_page() take a nid parameter. This is necessary to add a
per-node sysfs attribute to specify the number of hugepages on that
node.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
mm/hugetlb.c | 99 +++++++++++++++++++++++++++++++++++++----------------------
1 file changed, 63 insertions(+), 36 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1cd3118..31c4359 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,11 +66,22 @@ static void enqueue_huge_page(struct page *page)
free_huge_pages_node[nid]++;
}
+static struct page *dequeue_huge_page_node(int nid)
+{
+ struct page *page;
+
+ page = list_entry(hugepage_freelists[nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ free_huge_pages--;
+ free_huge_pages_node[nid]--;
+ return page;
+}
+
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
unsigned long address)
{
int nid;
- struct page *page = NULL;
struct zonelist *zonelist = huge_zonelist(vma, address,
htlb_alloc_mask);
struct zone **z;
@@ -82,14 +93,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
break;
}
- if (*z) {
- page = list_entry(hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- free_huge_pages--;
- free_huge_pages_node[nid]--;
- }
- return page;
+ if (*z)
+ return dequeue_huge_page_node(nid);
+ return NULL;
}
static void free_huge_page(struct page *page)
@@ -103,6 +109,25 @@ static void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
}
+static struct page *alloc_fresh_huge_page_node(int nid)
+{
+ struct page *page;
+
+ page = alloc_pages_node(nid,
+ GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
+ HUGETLB_PAGE_ORDER);
+ if (page) {
+ set_compound_page_dtor(page, free_huge_page);
+ spin_lock(&hugetlb_lock);
+ nr_huge_pages++;
+ nr_huge_pages_node[nid]++;
+ spin_unlock(&hugetlb_lock);
+ put_page(page); /* free it into the hugepage allocator */
+ }
+
+ return page;
+}
+
static int alloc_fresh_huge_page(struct mempolicy *policy)
{
int nid;
@@ -112,22 +137,12 @@ static int alloc_fresh_huge_page(struct mempolicy *policy)
nid = start_nid;
do {
- page = alloc_pages_node(nid,
- htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
- HUGETLB_PAGE_ORDER);
+ page = alloc_fresh_huge_page_node(nid);
if (page)
- break;
+ return 1;
nid = interleave_nodes(policy);
} while (nid != start_nid);
- if (page) {
- set_compound_page_dtor(page, free_huge_page);
- spin_lock(&hugetlb_lock);
- nr_huge_pages++;
- nr_huge_pages_node[page_to_nid(page)]++;
- spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
- return 1;
- }
+
return 0;
}
@@ -203,11 +218,11 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
}
#ifdef CONFIG_SYSCTL
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(int nid, struct page *page)
{
int i;
nr_huge_pages--;
- nr_huge_pages_node[page_to_nid(page)]--;
+ nr_huge_pages_node[nid]--;
for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -219,25 +234,37 @@ static void update_and_free_page(struct page *page)
}
#ifdef CONFIG_HIGHMEM
+static void try_to_free_low_node(int nid, unsigned long count)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next,
+ &hugepage_freelists[nid], lru) {
+ if (PageHighMem(page))
+ continue;
+ list_del(&page->lru);
+ update_and_free_page(nid, page);
+ free_huge_pages--;
+ free_huge_pages_node[nid]--;
+ if (count >= nr_huge_pages_node[nid])
+ return;
+ }
+}
+
static void try_to_free_low(unsigned long count)
{
int i;
for (i = 0; i < MAX_NUMNODES; ++i) {
- struct page *page, *next;
- list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
- if (PageHighMem(page))
- continue;
- list_del(&page->lru);
- update_and_free_page(page);
- free_huge_pages--;
- free_huge_pages_node[page_to_nid(page)]--;
- if (count >= nr_huge_pages)
- return;
- }
+ try_to_free_low_node(i, count);
+ if (count >= nr_huge_pages)
+ break;
}
}
#else
+static inline void try_to_free_low_node(int nid, unsigned long count)
+{
+}
static inline void try_to_free_low(unsigned long count)
{
}
@@ -265,7 +292,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
struct page *page = dequeue_huge_page(NULL, 0);
if (!page)
break;
- update_and_free_page(page);
+ update_and_free_page(page_to_nid(page), page);
}
spin_unlock(&hugetlb_lock);
return nr_huge_pages;
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* [PATCH 3/3] hugetlb: add per-node nr_hugepages sysfs attribute
2007-07-14 20:41 ` [PATCH 2/3] hugetlb: numafy several functions Nishanth Aravamudan
@ 2007-07-14 20:43 ` Nishanth Aravamudan
2007-07-14 20:44 ` Nishanth Aravamudan
0 siblings, 1 reply; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-07-14 20:43 UTC (permalink / raw)
To: clameter; +Cc: lee.schermerhorn, wli, akpm, linux-mm
Allow specifying the number of hugepages to allocate on a particular
node. Our current global sysctl will try its best to put hugepages
equally on each node, but that may not always be desired. This allows
the admin to control the layout of hugepage allocation at a finer level
(while not breaking the existing interface). Add callbacks in the sysfs
node registration and unregistration functions into hugetlb to add the
nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.
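As a usage sketch (not part of the patch): once the attribute is registered on
the node sysdev, an admin or test program can drive it through sysfs. The path
below assumes the usual /sys/devices/system/node/nodeN layout produced by
register_node(), and the helper name is made up for the example:

	/* Hypothetical user-space helper: request `count' huge pages on `node'. */
	#include <stdio.h>

	static int set_node_hugepages(int node, unsigned long count)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/node/node%d/nr_hugepages", node);
		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return -1;
		}
		fprintf(f, "%lu\n", count);
		return fclose(f);
	}

	int main(void)
	{
		/* e.g. ask for 16 huge pages on node 1 */
		return set_node_hugepages(1, 16) ? 1 : 0;
	}

Reading the same file back reports nr_huge_pages_node[] for that node.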
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
drivers/base/node.c | 2 +
include/linux/hugetlb.h | 11 ++++++++
mm/hugetlb.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 75 insertions(+), 1 deletion(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index cae346e..c9d531f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -151,6 +151,7 @@ int register_node(struct node *node, int num, struct node *parent)
sysdev_create_file(&node->sysdev, &attr_meminfo);
sysdev_create_file(&node->sysdev, &attr_numastat);
sysdev_create_file(&node->sysdev, &attr_distance);
+ hugetlb_register_node(node);
}
return error;
}
@@ -168,6 +169,7 @@ void unregister_node(struct node *node)
sysdev_remove_file(&node->sysdev, &attr_meminfo);
sysdev_remove_file(&node->sysdev, &attr_numastat);
sysdev_remove_file(&node->sysdev, &attr_distance);
+ hugetlb_unregister_node(node);
sysdev_unregister(&node->sysdev);
}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 49b7053..2fc188a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -4,7 +4,9 @@
#ifdef CONFIG_HUGETLB_PAGE
#include <linux/mempolicy.h>
+#include <linux/node.h>
#include <linux/shm.h>
+#include <linux/sysdev.h>
#include <asm/tlbflush.h>
struct ctl_table;
@@ -23,6 +25,13 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
int hugetlb_report_meminfo(char *);
int hugetlb_report_node_meminfo(int, char *);
+#ifdef CONFIG_NUMA
+int hugetlb_register_node(struct node *);
+void hugetlb_unregister_node(struct node *);
+#else
+#define hugetlb_register_node(node) 0
+#define hugetlb_unregister_node(node) ((void)0)
+#endif
unsigned long hugetlb_total_pages(void);
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access);
@@ -114,6 +123,8 @@ static inline unsigned long hugetlb_total_pages(void)
#define unmap_hugepage_range(vma, start, end) BUG()
#define hugetlb_report_meminfo(buf) 0
#define hugetlb_report_node_meminfo(n, buf) 0
+#define hugetlb_register_node(node) 0
+#define hugetlb_unregister_node(node) ((void)0)
#define follow_huge_pmd(mm, addr, pmd, write) NULL
#define prepare_hugepage_range(addr,len,pgoff) (-EINVAL)
#define pmd_huge(x) 0
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 31c4359..3f3df46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -217,7 +217,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
return nr;
}
-#ifdef CONFIG_SYSCTL
static void update_and_free_page(int nid, struct page *page)
{
int i;
@@ -270,6 +269,7 @@ static inline void try_to_free_low(unsigned long count)
}
#endif
+#ifdef CONFIG_SYSCTL
static unsigned long set_max_huge_pages(unsigned long count)
{
struct mempolicy *pol;
@@ -343,6 +343,67 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
nid, free_huge_pages_node[nid]);
}
+#ifdef CONFIG_NUMA
+static ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]);
+}
+
+static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
+ const char *buf, size_t count)
+{
+ int nid = dev->id;
+ unsigned long target;
+ unsigned long free_on_other_nodes;
+ unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
+
+ while (nr_huge_pages_req > nr_huge_pages_node[nid]) {
+ if (!alloc_fresh_huge_page_node(nid))
+ return count;
+ }
+ if (nr_huge_pages_req >= nr_huge_pages_node[nid])
+ return count;
+
+ /* need to ensure that our counts are accurate */
+ spin_lock(&hugetlb_lock);
+ free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
+ if (free_on_other_nodes >= resv_huge_pages) {
+ /* other nodes can satisfy reserve */
+ target = nr_huge_pages_req;
+ } else {
+ /* this node needs some free to satisfy reserve */
+ target = max((resv_huge_pages - free_on_other_nodes),
+ nr_huge_pages_req);
+ }
+ try_to_free_low_node(nid, target);
+ while (target < nr_huge_pages_node[nid]) {
+ struct page *page = dequeue_huge_page_node(nid);
+ if (!page)
+ break;
+ update_and_free_page(nid, page);
+ }
+ spin_unlock(&hugetlb_lock);
+
+ return count;
+}
+
+static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
+ hugetlb_read_nr_hugepages_node,
+ hugetlb_write_nr_hugepages_node);
+
+int hugetlb_register_node(struct node *node)
+{
+ return sysdev_create_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+void hugetlb_unregister_node(struct node *node)
+{
+ sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+#endif
+
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* Re: [PATCH 3/3] hugetlb: add per-node nr_hugepages sysfs attribute
2007-07-14 20:43 ` [PATCH 3/3] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
@ 2007-07-14 20:44 ` Nishanth Aravamudan
0 siblings, 0 replies; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-07-14 20:44 UTC (permalink / raw)
To: clameter; +Cc: lee.schermerhorn, wli, akpm, linux-mm
On 14.07.2007 [13:43:17 -0700], Nishanth Aravamudan wrote:
> hugetlb: add per-node nr_hugepages sysfs attribute
>
> Allow specifying the number of hugepages to allocate on a particular
> node. Our current global sysctl will try its best to put hugepages
> equally on each node, but htat may not always be desired. This allows
> the admin to control the layout of hugepage allocation at a finer level
> (while not breaking the existing interface). Add callbacks in the sysfs
> node registration and unregistration functions into hugetlb to add the
> nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.
Eep, forgot to append the following:
Compile tested on x86, x86_64 and ppc64. Run tested on 4-node x86-64 (no
memoryless nodes), non-NUMA x86 and 4-node ppc64 (2 memoryless nodes).
Thanks,
Nish
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* Re: [PATCH v8] Fix hugetlb pool allocation with empty nodes
2007-07-14 20:37 [PATCH v8] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
2007-07-14 20:41 ` [PATCH 2/3] hugetlb: numafy several functions Nishanth Aravamudan
@ 2007-07-14 20:41 ` Nishanth Aravamudan
1 sibling, 0 replies; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-07-14 20:41 UTC (permalink / raw)
To: clameter; +Cc: anton, lee.schermerhorn, wli, kxr, akpm, linux-mm
On 14.07.2007 [13:37:33 -0700], Nishanth Aravamudan wrote:
> Fix hugetlb pool allocation with empty nodes
Clearly, this should have been [1/3], sorry for the mistake.
Thanks,
Nish
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* [PATCH 1/3] Fix hugetlb pool allocation with empty nodes
@ 2007-06-18 17:34 Nishanth Aravamudan
2007-06-18 17:37 ` [PATCH 2/3] hugetlb: numafy several functions Nishanth Aravamudan
0 siblings, 1 reply; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-06-18 17:34 UTC (permalink / raw)
To: wli; +Cc: anton, lee.schermerhorn, clameter, linux-mm
Anton found a problem with the hugetlb pool allocation when some nodes
have no memory (http://marc.info/?l=linux-mm&m=118133042025995&w=2). Lee
worked on versions that tried to fix it, but none were accepted.
Christoph has created a set of patches which allow for GFP_THISNODE
allocations to fail if the node has no memory and for exporting a
node_memory_map indicating which nodes have memory. Since mempolicy.c
already has a number of functions which support interleaving, create a
mempolicy when we invoke alloc_fresh_huge_page() that specifies
interleaving across all the nodes in node_memory_map, rather than keeping
custom interleaving code in hugetlb.c. This requires adding some dummy
functions and declarations to mempolicy.h so the code compiles with both
NUMA and !NUMA.
Compiled-test w/ NUMA and !NUMA. Run-tested with the follow-on patches
on 4-node ppc64 w/ 2 memoryless nodes and 4-node x86_64 w/ 0 memoryless
nodes.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 22b668c..c8a68b8 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -76,6 +76,8 @@ struct mempolicy {
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
*/
+extern struct mempolicy *mpol_new(int mode, nodemask_t *nodes);
+
extern void __mpol_free(struct mempolicy *pol);
static inline void mpol_free(struct mempolicy *pol)
{
@@ -164,6 +166,8 @@ static inline void check_highest_zone(enum zone_type k)
policy_zone = k;
}
+extern unsigned interleave_nodes(struct mempolicy *policy);
+
int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
@@ -179,6 +183,11 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
#define mpol_set_vma_default(vma) do {} while(0)
+static inline struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+{
+ return NULL;
+}
+
static inline void mpol_free(struct mempolicy *p)
{
}
@@ -267,6 +276,11 @@ static inline int do_migrate_pages(struct mm_struct *mm,
static inline void check_highest_zone(int k)
{
}
+
+static inline unsigned interleave_nodes(struct mempolicy *policy)
+{
+ return 0;
+}
#endif /* CONFIG_NUMA */
#endif /* __KERNEL__ */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 858c0b3..88e1a30 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -103,15 +103,22 @@ static void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
}
-static int alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(struct mempolicy *policy)
{
- static int nid = 0;
+ int nid;
struct page *page;
- page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
- HUGETLB_PAGE_ORDER);
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
+ int start_nid = interleave_nodes(policy);
+
+ nid = start_nid;
+
+ do {
+ page = alloc_pages_node(nid,
+ htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
+ HUGETLB_PAGE_ORDER);
+ if (page)
+ break;
+ nid = interleave_nodes(policy);
+ } while (nid != start_nid);
if (page) {
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
@@ -153,6 +160,7 @@ fail:
static int __init hugetlb_init(void)
{
unsigned long i;
+ struct mempolicy *pol;
if (HPAGE_SHIFT == 0)
return 0;
@@ -160,11 +168,16 @@ static int __init hugetlb_init(void)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&hugepage_freelists[i]);
+ pol = mpol_new(MPOL_INTERLEAVE, &node_memory_map);
+ if (IS_ERR(pol))
+ goto quit;
for (i = 0; i < max_huge_pages; ++i) {
- if (!alloc_fresh_huge_page())
+ if (!alloc_fresh_huge_page(pol))
break;
}
+ mpol_free(pol);
max_huge_pages = free_huge_pages = nr_huge_pages = i;
+quit:
printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
return 0;
}
@@ -232,10 +245,16 @@ static inline void try_to_free_low(unsigned long count)
static unsigned long set_max_huge_pages(unsigned long count)
{
+ struct mempolicy *pol;
+
+ pol = mpol_new(MPOL_INTERLEAVE, &node_memory_map);
+ if (IS_ERR(pol))
+ return nr_huge_pages;
while (count > nr_huge_pages) {
- if (!alloc_fresh_huge_page())
- return nr_huge_pages;
+ if (!alloc_fresh_huge_page(pol))
+ break;
}
+ mpol_free(pol);
if (count >= nr_huge_pages)
return nr_huge_pages;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d2289f3..00bf061 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -171,7 +171,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
}
/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
+struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
struct mempolicy *policy;
@@ -1121,7 +1121,7 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
}
/* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
+unsigned interleave_nodes(struct mempolicy *policy)
{
unsigned nid, next;
struct task_struct *me = current;
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* [PATCH 2/3] hugetlb: numafy several functions
2007-06-18 17:34 [PATCH 1/3] " Nishanth Aravamudan
@ 2007-06-18 17:37 ` Nishanth Aravamudan
2007-06-18 17:53 ` [PATCH 3/3] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
0 siblings, 1 reply; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-06-18 17:37 UTC (permalink / raw)
To: wli; +Cc: anton, lee.schermerhorn, clameter, linux-mm
Add node-parameterized helpers for dequeue_huge_page,
alloc_fresh_huge_page and try_to_free_low. Also have
update_and_free_page() take a nid parameter. This is necessary to add a
per-node sysfs attribute to specify the number of hugepages on that
node.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88e1a30..ca89057 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,11 +66,22 @@ static void enqueue_huge_page(struct page *page)
free_huge_pages_node[nid]++;
}
+static struct page *dequeue_huge_page_node(int nid)
+{
+ struct page *page;
+
+ page = list_entry(hugepage_freelists[nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ free_huge_pages--;
+ free_huge_pages_node[nid]--;
+ return page;
+}
+
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
unsigned long address)
{
int nid;
- struct page *page = NULL;
struct zonelist *zonelist = huge_zonelist(vma, address,
htlb_alloc_mask);
struct zone **z;
@@ -82,14 +93,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
break;
}
- if (*z) {
- page = list_entry(hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- free_huge_pages--;
- free_huge_pages_node[nid]--;
- }
- return page;
+ if (*z)
+ return dequeue_huge_page_node(nid);
+ return NULL;
}
static void free_huge_page(struct page *page)
@@ -103,6 +109,25 @@ static void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
}
+static struct page *alloc_fresh_huge_page_node(int nid)
+{
+ struct page *page;
+
+ page = alloc_pages_node(nid,
+ GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
+ HUGETLB_PAGE_ORDER);
+ if (page) {
+ set_compound_page_dtor(page, free_huge_page);
+ spin_lock(&hugetlb_lock);
+ nr_huge_pages++;
+ nr_huge_pages_node[nid]++;
+ spin_unlock(&hugetlb_lock);
+ put_page(page); /* free it into the hugepage allocator */
+ }
+
+ return page;
+}
+
static int alloc_fresh_huge_page(struct mempolicy *policy)
{
int nid;
@@ -112,22 +137,12 @@ static int alloc_fresh_huge_page(struct mempolicy *policy)
nid = start_nid;
do {
- page = alloc_pages_node(nid,
- htlb_alloc_mask|__GFP_COMP|GFP_THISNODE,
- HUGETLB_PAGE_ORDER);
+ page = alloc_fresh_huge_page_node(nid);
if (page)
- break;
+ return 1;
nid = interleave_nodes(policy);
} while (nid != start_nid);
- if (page) {
- set_compound_page_dtor(page, free_huge_page);
- spin_lock(&hugetlb_lock);
- nr_huge_pages++;
- nr_huge_pages_node[page_to_nid(page)]++;
- spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
- return 1;
- }
+
return 0;
}
@@ -203,11 +218,11 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
}
#ifdef CONFIG_SYSCTL
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(int nid, struct page *page)
{
int i;
nr_huge_pages--;
- nr_huge_pages_node[page_to_nid(page)]--;
+ nr_huge_pages_node[nid]--;
for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -219,25 +234,37 @@ static void update_and_free_page(struct page *page)
}
#ifdef CONFIG_HIGHMEM
+static void try_to_free_low_node(int nid, unsigned long count)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next,
+ &hugepage_freelists[nid], lru) {
+ if (PageHighMem(page))
+ continue;
+ list_del(&page->lru);
+ update_and_free_page(nid, page);
+ free_huge_pages--;
+ free_huge_pages_node[nid]--;
+ if (count >= nr_huge_pages_node[nid])
+ return;
+ }
+}
+
static void try_to_free_low(unsigned long count)
{
int i;
for (i = 0; i < MAX_NUMNODES; ++i) {
- struct page *page, *next;
- list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
- if (PageHighMem(page))
- continue;
- list_del(&page->lru);
- update_and_free_page(page);
- free_huge_pages--;
- free_huge_pages_node[page_to_nid(page)]--;
- if (count >= nr_huge_pages)
- return;
- }
+ try_to_free_low_node(i, count);
+ if (count >= nr_huge_pages)
+ break;
}
}
#else
+static inline void try_to_free_low_node(int nid, unsigned long count)
+{
+}
static inline void try_to_free_low(unsigned long count)
{
}
@@ -265,7 +292,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
struct page *page = dequeue_huge_page(NULL, 0);
if (!page)
break;
- update_and_free_page(page);
+ update_and_free_page(page_to_nid(page), page);
}
spin_unlock(&hugetlb_lock);
return nr_huge_pages;
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* [PATCH 3/3] hugetlb: add per-node nr_hugepages sysfs attribute
2007-06-18 17:37 ` [PATCH 2/3] hugetlb: numafy several functions Nishanth Aravamudan
@ 2007-06-18 17:53 ` Nishanth Aravamudan
0 siblings, 0 replies; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-06-18 17:53 UTC (permalink / raw)
To: wli; +Cc: anton, lee.schermerhorn, clameter, linux-mm
Allow specifying the number of hugepages to allocate on a particular
node. Our current global sysctl will try its best to put hugepages
equally on each node, but that may not always be desired. This allows
the admin to control the layout of hugepage allocation at a finer level
(while not breaking the existing interface). Add callbacks in the sysfs
node registration and unregistration functions into hugetlb to add the
nr_hugepages attribute, which is a no-op if !NUMA or !HUGETLB.
Compile-tested with NUMA X HUGETLB (all 4 on/off combinations).
Run-tested on 4-node ppc64 w/ 2 memoryless nodes and 4-node x86_64 w/ 0
memoryless nodes.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
diff --git a/drivers/base/node.c b/drivers/base/node.c
index cae346e..c9d531f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -151,6 +151,7 @@ int register_node(struct node *node, int num, struct node *parent)
sysdev_create_file(&node->sysdev, &attr_meminfo);
sysdev_create_file(&node->sysdev, &attr_numastat);
sysdev_create_file(&node->sysdev, &attr_distance);
+ hugetlb_register_node(node);
}
return error;
}
@@ -168,6 +169,7 @@ void unregister_node(struct node *node)
sysdev_remove_file(&node->sysdev, &attr_meminfo);
sysdev_remove_file(&node->sysdev, &attr_numastat);
sysdev_remove_file(&node->sysdev, &attr_distance);
+ hugetlb_unregister_node(node);
sysdev_unregister(&node->sysdev);
}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index aa0dc9b..0d97cd4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -4,7 +4,9 @@
#ifdef CONFIG_HUGETLB_PAGE
#include <linux/mempolicy.h>
+#include <linux/node.h>
#include <linux/shm.h>
+#include <linux/sysdev.h>
#include <asm/tlbflush.h>
struct ctl_table;
@@ -23,6 +25,13 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
int hugetlb_report_meminfo(char *);
int hugetlb_report_node_meminfo(int, char *);
+#ifdef CONFIG_NUMA
+int hugetlb_register_node(struct node *);
+void hugetlb_unregister_node(struct node *);
+#else
+#define hugetlb_register_node(node) 0
+#define hugetlb_unregister_node(node) ((void)0)
+#endif
unsigned long hugetlb_total_pages(void);
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access);
@@ -114,6 +123,8 @@ static inline unsigned long hugetlb_total_pages(void)
#define unmap_hugepage_range(vma, start, end) BUG()
#define hugetlb_report_meminfo(buf) 0
#define hugetlb_report_node_meminfo(n, buf) 0
+#define hugetlb_register_node(node) 0
+#define hugetlb_unregister_node(node) ((void)0)
#define follow_huge_pmd(mm, addr, pmd, write) NULL
#define prepare_hugepage_range(addr,len,pgoff) (-EINVAL)
#define pmd_huge(x) 0
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca89057..7e737d1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -217,7 +217,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
return nr;
}
-#ifdef CONFIG_SYSCTL
static void update_and_free_page(int nid, struct page *page)
{
int i;
@@ -270,6 +269,7 @@ static inline void try_to_free_low(unsigned long count)
}
#endif
+#ifdef CONFIG_SYSCTL
static unsigned long set_max_huge_pages(unsigned long count)
{
struct mempolicy *pol;
@@ -343,6 +343,67 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
nid, free_huge_pages_node[nid]);
}
+#ifdef CONFIG_NUMA
+static ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]);
+}
+
+static ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
+ const char *buf, size_t count)
+{
+ int nid = dev->id;
+ unsigned long target;
+ unsigned long free_on_other_nodes;
+ unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
+
+ while (nr_huge_pages_req > nr_huge_pages_node[nid]) {
+ if (!alloc_fresh_huge_page_node(nid))
+ return count;
+ }
+ if (nr_huge_pages_req >= nr_huge_pages_node[nid])
+ return count;
+
+ /* need to ensure that our counts are accurate */
+ spin_lock(&hugetlb_lock);
+ free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
+ if (free_on_other_nodes >= resv_huge_pages) {
+ /* other nodes can satisfy reserve */
+ target = nr_huge_pages_req;
+ } else {
+ /* this node needs some free to satisfy reserve */
+ target = max((resv_huge_pages - free_on_other_nodes),
+ nr_huge_pages_req);
+ }
+ try_to_free_low_node(nid, target);
+ while (target < nr_huge_pages_node[nid]) {
+ struct page *page = dequeue_huge_page_node(nid);
+ if (!page)
+ break;
+ update_and_free_page(nid, page);
+ }
+ spin_unlock(&hugetlb_lock);
+
+ return count;
+}
+
+static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
+ hugetlb_read_nr_hugepages_node,
+ hugetlb_write_nr_hugepages_node);
+
+int hugetlb_register_node(struct node *node)
+{
+ return sysdev_create_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+void hugetlb_unregister_node(struct node *node)
+{
+ sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
+}
+
+#endif
+
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* [PATCH v5][1/3] Fix hugetlb pool allocation with empty nodes
@ 2007-06-08 19:06 Nishanth Aravamudan, Lee Schermerhorn
2007-06-08 19:07 ` [PATCH][2/3] hugetlb: numafy several functions Nishanth Aravamudan
0 siblings, 1 reply; 8+ messages in thread
From: Nishanth Aravamudan, Lee Schermerhorn @ 2007-06-08 19:06 UTC (permalink / raw)
To: clameter; +Cc: akpm, lee.schermerhorn, anton, wli, linux-mm
Original Patch [V1]:
Date: Wed, 2 May 2007 21:21:07 -0500
From: Anton Blanchard <anton@samba.org>
To: linux-mm@kvack.org, clameter@SGI.com, ak@suse.de
Cc: nish.aravamudan@gmail.com, mel@csn.ul.ie, apw@shadowen.org
Subject: [PATCH] Fix hugetlb pool allocation with empty nodes
An interesting bug was pointed out to me where we failed to allocate
hugepages evenly. In the example below node 7 has no memory (it only has
CPUs). Nodes 0 and 1 have plenty of free memory. After doing:
We see the imbalance:
Node 0 HugePages_Total: 6
Node 1 HugePages_Total: 10
Node 7 HugePages_Total: 0
It didn't take long to realise that alloc_fresh_huge_page is allocating
from node 7 without GFP_THISNODE set, so we fallback to its next
preferred node (ie 1). This means we end up with a 1/3 2/3 imbalance.
After fixing this it still didn't work, and after some more poking I see
why. When building our fallback zonelist in build_zonelists_node we skip
empty zones. This means node 7 never registers its own empty zones in its
zonelists and instead registers node 1's. Therefore when we ask for a page from
node 7, using the GFP_THISNODE flag we end up with node 1 memory.
By removing the populated_zone() check in build_zonelists_node we fix
the problem:
Node 0 HugePages_Total: 8
Node 1 HugePages_Total: 8
Node 7 HugePages_Total: 0
I'm guessing registering empty remote zones might make the SGI guys a bit
unhappy, maybe we should just force the registration of empty local
zones? Does anyone care?
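To make the failure mode above concrete, here is a minimal illustration (not
part of any patch in this thread); the flags and allocator call are the ones
used in the hunks below, the function name is made up, and the "fails instead
of falling back" behaviour assumes zonelists no longer substitute a remote zone
for an empty local one, as discussed above:

	/* Illustrative only. */
	static struct page *grab_huge_page_on(int nid)
	{
		/*
		 * Without GFP_THISNODE, an allocation aimed at memoryless node 7
		 * falls back down node 7's zonelist and returns node 1 memory,
		 * which is what produced the 6/10/0 split above.  With
		 * GFP_THISNODE the request stays pinned to nid and simply fails
		 * on a memoryless node, so the caller can skip that node.
		 */
		return alloc_pages_node(nid,
					GFP_HIGHUSER | __GFP_COMP | GFP_THISNODE,
					HUGETLB_PAGE_ORDER);
	}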
Rework [should have been V3]
Create node_populated_map and access functions [nodemask.h] to describe
nodes with populated gfp_zone(GFP_HIGHUSER). Note that on x86, this
excludes nodes with only DMA* or NORMAL memory--i.e., no hugepages below
4G.
Populate the map while building zonelists, where we already check for
populated zones. This is early enough for boot time allocation of huge
pages.
Attempt to allocate "fresh" huge pages only from nodes in the populated
map.
Tested on ia64 numa platform with both boot time and run time allocation
of huge pages.
Rework "V4":
+ rebase to 22-rc1-mm1 with "change zonelist order" series:
+ redefine node_populated_map to contain nodes whose first zone in the
'policy_zone' zonelist is "on node". This will be used to filter nodes
for hugepage allocation. Note: if some node has only DMA32, but
policy_zone is > DMA32 [some other node/s has/have memory in higher
zones] AND we're building the zonelists in zone order, we won't mark
this zone as populated. No hugepages will be allocated from such a
node.
+ fix typos in comments per Nish Aravamudan.
+ rework alloc_fresh_huge_page() to just scan the populated map,
again Nish's suggestion.
Rework "V5":
+ rebase to 22-rc4-mm2
+ tested on non-NUMA x86, non-NUMA ppc64, 2-node ia64, 4-node x86_64,
and 4-node ppc64 with 2 unpopulated nodes.
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>
Tested-by: Nishanth Aravamudan <nacc@us.ibm.com>
diff a/include/linux/nodemask.h b/include/linux/nodemask.h
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -64,12 +64,16 @@
*
* int node_online(node) Is some node online?
* int node_possible(node) Is some node possible?
+ * int node_populated(node) Is some node populated [at policy_zone]
*
* int any_online_node(mask) First online node in mask
*
* node_set_online(node) set bit 'node' in node_online_map
* node_set_offline(node) clear bit 'node' in node_online_map
*
+ * node_set_populated(node) set bit 'node' in node_populated_map
+ * node_not_populated(node) clear bit 'node' in node_populated_map
+ *
* for_each_node(node) for-loop node over node_possible_map
* for_each_online_node(node) for-loop node over node_online_map
*
@@ -344,12 +348,14 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
extern nodemask_t node_online_map;
extern nodemask_t node_possible_map;
+extern nodemask_t node_populated_map;
#if MAX_NUMNODES > 1
#define num_online_nodes() nodes_weight(node_online_map)
#define num_possible_nodes() nodes_weight(node_possible_map)
#define node_online(node) node_isset((node), node_online_map)
#define node_possible(node) node_isset((node), node_possible_map)
+#define node_populated(node) node_isset((node), node_populated_map)
#define first_online_node first_node(node_online_map)
#define next_online_node(nid) next_node((nid), node_online_map)
extern int nr_node_ids;
@@ -358,6 +364,7 @@ extern int nr_node_ids;
#define num_possible_nodes() 1
#define node_online(node) ((node) == 0)
#define node_possible(node) ((node) == 0)
+#define node_populated(node) ((node) == 0)
#define first_online_node 0
#define next_online_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1
@@ -375,6 +382,9 @@ extern int nr_node_ids;
#define node_set_online(node) set_bit((node), node_online_map.bits)
#define node_set_offline(node) clear_bit((node), node_online_map.bits)
+#define node_set_populated(node) set_bit((node), node_populated_map.bits)
+#define node_not_populated(node) clear_bit((node), node_populated_map.bits)
+
#define for_each_node(node) for_each_node_mask((node), node_possible_map)
#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
diff a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -105,13 +105,22 @@ static void free_huge_page(struct page *page)
static int alloc_fresh_huge_page(void)
{
- static int nid = 0;
+ static int nid = -1;
struct page *page;
- page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
- HUGETLB_PAGE_ORDER);
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
+ int start_nid;
+
+ if (nid < 0)
+ nid = first_node(node_populated_map);
+ start_nid = nid;
+
+ do {
+ page = alloc_pages_node(nid,
+ GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
+ HUGETLB_PAGE_ORDER);
+ nid = next_node(nid, node_populated_map);
+ if (nid >= nr_node_ids)
+ nid = first_node(node_populated_map);
+ } while (!page && nid != start_nid);
if (page) {
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
diff a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,9 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
EXPORT_SYMBOL(node_online_map);
nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_populated_map __read_mostly = NODE_MASK_NONE;
+EXPORT_SYMBOL(node_populated_map);
+
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
long nr_swap_pages;
@@ -2161,7 +2164,7 @@ static int node_order[MAX_NUMNODES];
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
enum zone_type i;
- int pos, j, node;
+ int pos, j;
int zone_type; /* needs to be signed */
struct zone *z;
struct zonelist *zonelist;
@@ -2171,7 +2174,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
pos = 0;
for (zone_type = i; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
- node = node_order[j];
+ int node = node_order[j];
z = &NODE_DATA(node)->node_zones[zone_type];
if (populated_zone(z)) {
zonelist->zones[pos++] = z;
@@ -2244,6 +2247,22 @@ static void set_zonelist_order(void)
current_zonelist_order = user_zonelist_order;
}
+/*
+ * setup_populate_map() - record nodes whose "policy_zone" is "on-node".
+ */
+static void setup_populated_map(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct zonelist *zl = pgdat->node_zonelists + policy_zone;
+ struct zone *z = zl->zones[0];
+
+ VM_BUG_ON(!z);
+ if (z->zone_pgdat == pgdat)
+ node_set_populated(nid);
+ else
+ node_not_populated(nid);
+}
+
static void build_zonelists(pg_data_t *pgdat)
{
int j, node, load;
@@ -2327,6 +2346,15 @@ static void set_zonelist_order(void)
current_zonelist_order = ZONELIST_ORDER_ZONE;
}
+/*
+ * setup_populated_map - non-NUMA case
+ * Only node 0 should be on-line, and it MUST be populated!
+ */
+static void setup_populated_map(int nid)
+{
+ node_set_populated(nid);
+}
+
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
@@ -2381,6 +2409,7 @@ static int __build_all_zonelists(void *dummy)
for_each_online_node(nid) {
build_zonelists(NODE_DATA(nid));
build_zonelist_cache(NODE_DATA(nid));
+ setup_populated_map(nid);
}
return 0;
}
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* [PATCH][2/3] hugetlb: numafy several functions
2007-06-08 19:06 [PATCH v5][1/3] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan, Lee Schermerhorn
@ 2007-06-08 19:07 ` Nishanth Aravamudan
2007-06-08 19:08 ` [PATCH][3/3] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
0 siblings, 1 reply; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-06-08 19:07 UTC (permalink / raw)
To: clameter; +Cc: akpm, lee.schermerhorn, anton, wli, linux-mm
Rebased against 2.6.22-rc4-mm2 with:
fix-hugetlb-pool-allocation-with-empty-nodes-v5.patch
Add node-parameterized helpers for dequeue_huge_page,
alloc_fresh_huge_page and try_to_free_low. Also have
update_and_free_page() take a nid parameter. This is necessary to add a
per-node sysfs attribute to specify the number of hugepages on that
node.
Tested on non-NUMA x86, non-NUMA ppc64, 2-node IA64, 4-node x86_64 and
4-node ppc64 with 2 unpopulated nodes.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
diff a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,11 +66,22 @@ static void enqueue_huge_page(struct page *page)
free_huge_pages_node[nid]++;
}
+static struct page *dequeue_huge_page_node(int nid)
+{
+ struct page *page;
+
+ page = list_entry(hugepage_freelists[nid].next,
+ struct page, lru);
+ list_del(&page->lru);
+ free_huge_pages--;
+ free_huge_pages_node[nid]--;
+ return page;
+}
+
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
unsigned long address)
{
int nid;
- struct page *page = NULL;
struct zonelist *zonelist = huge_zonelist(vma, address,
htlb_alloc_mask);
struct zone **z;
@@ -82,14 +93,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
break;
}
- if (*z) {
- page = list_entry(hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- free_huge_pages--;
- free_huge_pages_node[nid]--;
- }
- return page;
+ if (*z)
+ return dequeue_huge_page_node(nid);
+ return NULL;
}
static void free_huge_page(struct page *page)
@@ -103,6 +109,25 @@ static void free_huge_page(struct page *page)
spin_unlock(&hugetlb_lock);
}
+static struct page *alloc_fresh_huge_page_node(int nid)
+{
+ struct page *page;
+
+ page = alloc_pages_node(nid,
+ GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
+ HUGETLB_PAGE_ORDER);
+ if (page) {
+ set_compound_page_dtor(page, free_huge_page);
+ spin_lock(&hugetlb_lock);
+ nr_huge_pages++;
+ nr_huge_pages_node[nid]++;
+ spin_unlock(&hugetlb_lock);
+ put_page(page); /* free it into the hugepage allocator */
+ }
+
+ return page;
+}
+
static int alloc_fresh_huge_page(void)
{
static int nid = -1;
@@ -114,22 +139,14 @@ static int alloc_fresh_huge_page(void)
start_nid = nid;
do {
- page = alloc_pages_node(nid,
- GFP_HIGHUSER|__GFP_COMP|GFP_THISNODE,
- HUGETLB_PAGE_ORDER);
+ page = alloc_fresh_huge_page_node(nid);
nid = next_node(nid, node_populated_map);
if (nid >= nr_node_ids)
nid = first_node(node_populated_map);
} while (!page && nid != start_nid);
- if (page) {
- set_compound_page_dtor(page, free_huge_page);
- spin_lock(&hugetlb_lock);
- nr_huge_pages++;
- nr_huge_pages_node[page_to_nid(page)]++;
- spin_unlock(&hugetlb_lock);
- put_page(page); /* free it into the hugepage allocator */
+
+ if (page)
return 1;
- }
return 0;
}
@@ -199,11 +216,11 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
}
#ifdef CONFIG_SYSCTL
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(int nid, struct page *page)
{
int i;
nr_huge_pages--;
- nr_huge_pages_node[page_to_nid(page)]--;
+ nr_huge_pages_node[nid]--;
for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
@@ -215,25 +232,37 @@ static void update_and_free_page(struct page *page)
}
#ifdef CONFIG_HIGHMEM
+static void try_to_free_low_node(int nid, unsigned long count)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next,
+ &hugepage_freelists[nid], lru) {
+ if (PageHighMem(page))
+ continue;
+ list_del(&page->lru);
+ update_and_free_page(nid, page);
+ free_huge_pages--;
+ free_huge_pages_node[nid]--;
+ if (count >= nr_huge_pages_node[nid])
+ return;
+ }
+}
+
static void try_to_free_low(unsigned long count)
{
int i;
for (i = 0; i < MAX_NUMNODES; ++i) {
- struct page *page, *next;
- list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
- if (PageHighMem(page))
- continue;
- list_del(&page->lru);
- update_and_free_page(page);
- free_huge_pages--;
- free_huge_pages_node[page_to_nid(page)]--;
- if (count >= nr_huge_pages)
- return;
- }
+ try_to_free_low_node(i, count);
+ if (count >= nr_huge_pages)
+ break;
}
}
#else
+static inline void try_to_free_low_node(int nid, unsigned long count)
+{
+}
static inline void try_to_free_low(unsigned long count)
{
}
@@ -255,7 +284,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
struct page *page = dequeue_huge_page(NULL, 0);
if (!page)
break;
- update_and_free_page(page);
+ update_and_free_page(page_to_nid(page), page);
}
spin_unlock(&hugetlb_lock);
return nr_huge_pages;
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* [PATCH][3/3] hugetlb: add per-node nr_hugepages sysfs attribute
2007-06-08 19:07 ` [PATCH][2/3] hugetlb: numafy several functions Nishanth Aravamudan
@ 2007-06-08 19:08 ` Nishanth Aravamudan
2007-06-08 19:10 ` Nishanth Aravamudan
0 siblings, 1 reply; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-06-08 19:08 UTC (permalink / raw)
To: clameter; +Cc: akpm, lee.schermerhorn, anton, wli, linux-mm
Rebased against 2.6.22-rc4-mm2 with:
fix-hugetlb-pool-allocation-with-empty-nodes-v5.patch
hugetlb-numafy-several-functions
Allow specifying the number of hugepages to allocate on a particular
node. Our current global sysctl will try its best to put hugepages
equally on each node, but that may not always be desired. This allows
the admin to control the layout of hugepage allocation at a finer level
(while not breaking the existing interface).
Note: this requires making the same sort of check in the sysfs write
callback as in the normal allocation path, for populated nodes.
Tested on non-NUMA x86, non-NUMA ppc64, 2-node IA64, 4-node x86_64 and
4-node ppc64 with 2 unpopulated nodes.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
---
I would have liked to have avoided the #ifdef's in node.c, but I
couldn't figure out a simple way to conditionalize the
create_file/remove_file calls.
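One way to avoid those #ifdefs -- and the shape this attribute takes in the
later v8 posting at the top of this page -- is to give node.c unconditional
register/unregister hooks and stub them out in the header when the feature is
not built. A minimal sketch of the header side:

	/* hugetlb.h sketch: node.c can then call these unconditionally. */
	#ifdef CONFIG_HUGETLB_PAGE
	#ifdef CONFIG_NUMA
	int hugetlb_register_node(struct node *);
	void hugetlb_unregister_node(struct node *);
	#else
	#define hugetlb_register_node(node)	0
	#define hugetlb_unregister_node(node)	((void)0)
	#endif
	#else	/* !CONFIG_HUGETLB_PAGE */
	#define hugetlb_register_node(node)	0
	#define hugetlb_unregister_node(node)	((void)0)
	#endif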
diff --git a/drivers/base/node.c b/drivers/base/node.c
index cae346e..fc0b4a1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -131,6 +131,11 @@ static ssize_t node_read_distance(struct sys_device * dev, char * buf)
}
static SYSDEV_ATTR(distance, S_IRUGO, node_read_distance, NULL);
+#ifdef CONFIG_HUGETLB_PAGE
+static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
+ hugetlb_read_nr_hugepages_node,
+ hugetlb_write_nr_hugepages_node);
+#endif
/*
* register_node - Setup a sysfs device for a node.
@@ -151,6 +156,9 @@ int register_node(struct node *node, int num, struct node *parent)
sysdev_create_file(&node->sysdev, &attr_meminfo);
sysdev_create_file(&node->sysdev, &attr_numastat);
sysdev_create_file(&node->sysdev, &attr_distance);
+#ifdef CONFIG_HUGETLB_PAGE
+ sysdev_create_file(&node->sysdev, &attr_nr_hugepages);
+#endif
}
return error;
}
@@ -168,6 +176,9 @@ void unregister_node(struct node *node)
sysdev_remove_file(&node->sysdev, &attr_meminfo);
sysdev_remove_file(&node->sysdev, &attr_numastat);
sysdev_remove_file(&node->sysdev, &attr_distance);
+#ifdef CONFIG_HUGETLB_PAGE
+ sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
+#endif
sysdev_unregister(&node->sysdev);
}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index aa0dc9b..7df75c1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -5,6 +5,7 @@
#include <linux/mempolicy.h>
#include <linux/shm.h>
+#include <linux/sysdev.h>
#include <asm/tlbflush.h>
struct ctl_table;
@@ -23,6 +24,9 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
int hugetlb_report_meminfo(char *);
int hugetlb_report_node_meminfo(int, char *);
+ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *, char *);
+ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *, const char *,
+ size_t);
unsigned long hugetlb_total_pages(void);
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d1e1063..9f1cb16 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -215,7 +215,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
return nr;
}
-#ifdef CONFIG_SYSCTL
static void update_and_free_page(int nid, struct page *page)
{
int i;
@@ -268,6 +267,7 @@ static inline void try_to_free_low(unsigned long count)
}
#endif
+#ifdef CONFIG_SYSCTL
static unsigned long set_max_huge_pages(unsigned long count)
{
while (count > nr_huge_pages) {
@@ -335,6 +335,58 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
nid, free_huge_pages_node[nid]);
}
+ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev,
+					char *buf)
+{
+	return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]);
+}
+
+ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
+					const char *buf, size_t count)
+{
+	int nid = dev->id;
+	unsigned long target;
+	unsigned long free_on_other_nodes;
+	unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
+
+	/*
+	 * unpopulated nodes can return pages from other nodes for
+	 * THISNODE requests, so do a populated check and avoid
+	 * double-checking in the sysctl path
+	 */
+	if (!node_populated(nid))
+		return count;
+
+	while (nr_huge_pages_req > nr_huge_pages_node[nid]) {
+		if (!alloc_fresh_huge_page_node(nid))
+			return count;
+	}
+	if (nr_huge_pages_req >= nr_huge_pages_node[nid])
+		return count;
+
+	/* need to ensure that our counts are accurate */
+	spin_lock(&hugetlb_lock);
+	free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
+	if (free_on_other_nodes >= resv_huge_pages) {
+		/* other nodes can satisfy reserve */
+		target = nr_huge_pages_req;
+	} else {
+		/* this node needs some free to satisfy reserve */
+		target = max((resv_huge_pages - free_on_other_nodes),
+				nr_huge_pages_req);
+	}
+	try_to_free_low_node(nid, target);
+	while (target < nr_huge_pages_node[nid]) {
+		struct page *page = dequeue_huge_page_node(nid);
+		if (!page)
+			break;
+		update_and_free_page(nid, page);
+	}
+	spin_unlock(&hugetlb_lock);
+
+	return count;
+}
+
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
* Re: [PATCH][3/3] hugetlb: add per-node nr_hugepages sysfs attribute
2007-06-08 19:08 ` [PATCH][3/3] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
@ 2007-06-08 19:10 ` Nishanth Aravamudan
0 siblings, 0 replies; 8+ messages in thread
From: Nishanth Aravamudan @ 2007-06-08 19:10 UTC (permalink / raw)
To: clameter; +Cc: akpm, lee.schermerhorn, anton, wli, linux-mm
On 08.06.2007 [12:08:57 -0700], Nishanth Aravamudan wrote:
> hugetlb: add per-node nr_hugepages sysfs attribute
Sorry, meant to send the non-gitified version of the patch...
hugetlb: add per-node nr_hugepages sysfs attribute
Rebased against 2.6.22-rc4-mm2 with:
fix-hugetlb-pool-allocation-with-empty-nodes-v5.patch
hugetlb-numafy-several-functions
Allow specifying the number of hugepages to allocate on a particular
node. Our current global sysctl will try its best to put hugepages
equally on each node, but that may not always be desired. This allows
the admin to control the layout of hugepage allocation at a finer level
(while not breaking the existing interface).
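For example, assuming the attribute lands in the usual per-node sysfs
directory (it is created on the node sysdev that register_node() sets
up), usage would look roughly like this hypothetical session:

	# ask for 64 huge pages on node 1 specifically
	echo 64 > /sys/devices/system/node/node1/nr_hugepages
	# read back how many huge pages that node actually holds
	cat /sys/devices/system/node/node1/nr_hugepages

The global /proc/sys/vm/nr_hugepages sysctl keeps working as before.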
Note: this requires making the same sort of check in the sysfs write
callback as in the normal allocation path, for populated nodes.
Tested on non-NUMA x86, non-NUMA ppc64, 2-node IA64, 4-node x86_64 and
4-node ppc64 with 2 unpopulated nodes.
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
---
I would have liked to avoid the #ifdefs in node.c, but I couldn't find
a simple way to make the create_file/remove_file calls conditional.
diff a/drivers/base/node.c b/drivers/base/node.c
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -131,6 +131,11 @@ static ssize_t node_read_distance(struct sys_device * dev, char * buf)
}
static SYSDEV_ATTR(distance, S_IRUGO, node_read_distance, NULL);
+#ifdef CONFIG_HUGETLB_PAGE
+static SYSDEV_ATTR(nr_hugepages, S_IRUGO | S_IWUSR,
+ hugetlb_read_nr_hugepages_node,
+ hugetlb_write_nr_hugepages_node);
+#endif
/*
* register_node - Setup a sysfs device for a node.
@@ -151,6 +156,9 @@ int register_node(struct node *node, int num, struct node *parent)
sysdev_create_file(&node->sysdev, &attr_meminfo);
sysdev_create_file(&node->sysdev, &attr_numastat);
sysdev_create_file(&node->sysdev, &attr_distance);
+#ifdef CONFIG_HUGETLB_PAGE
+ sysdev_create_file(&node->sysdev, &attr_nr_hugepages);
+#endif
}
return error;
}
@@ -168,6 +176,9 @@ void unregister_node(struct node *node)
sysdev_remove_file(&node->sysdev, &attr_meminfo);
sysdev_remove_file(&node->sysdev, &attr_numastat);
sysdev_remove_file(&node->sysdev, &attr_distance);
+#ifdef CONFIG_HUGETLB_PAGE
+ sysdev_remove_file(&node->sysdev, &attr_nr_hugepages);
+#endif
sysdev_unregister(&node->sysdev);
}
diff a/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -5,6 +5,7 @@
#include <linux/mempolicy.h>
#include <linux/shm.h>
+#include <linux/sysdev.h>
#include <asm/tlbflush.h>
struct ctl_table;
@@ -23,6 +24,9 @@ void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned lon
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
int hugetlb_report_meminfo(char *);
int hugetlb_report_node_meminfo(int, char *);
+ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *, char *);
+ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *, const char *,
+ size_t);
unsigned long hugetlb_total_pages(void);
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access);
diff a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -215,7 +215,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
return nr;
}
-#ifdef CONFIG_SYSCTL
static void update_and_free_page(int nid, struct page *page)
{
int i;
@@ -268,6 +267,7 @@ static inline void try_to_free_low(unsigned long count)
}
#endif
+#ifdef CONFIG_SYSCTL
static unsigned long set_max_huge_pages(unsigned long count)
{
while (count > nr_huge_pages) {
@@ -335,6 +335,58 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
nid, free_huge_pages_node[nid]);
}
+ssize_t hugetlb_read_nr_hugepages_node(struct sys_device *dev,
+					char *buf)
+{
+	return sprintf(buf, "%u\n", nr_huge_pages_node[dev->id]);
+}
+
+ssize_t hugetlb_write_nr_hugepages_node(struct sys_device *dev,
+					const char *buf, size_t count)
+{
+	int nid = dev->id;
+	unsigned long target;
+	unsigned long free_on_other_nodes;
+	unsigned long nr_huge_pages_req = simple_strtoul(buf, NULL, 10);
+
+	/*
+	 * unpopulated nodes can return pages from other nodes for
+	 * THISNODE requests, so do a populated check and avoid
+	 * double-checking in the sysctl path
+	 */
+	if (!node_populated(nid))
+		return count;
+
+	while (nr_huge_pages_req > nr_huge_pages_node[nid]) {
+		if (!alloc_fresh_huge_page_node(nid))
+			return count;
+	}
+	if (nr_huge_pages_req >= nr_huge_pages_node[nid])
+		return count;
+
+	/* need to ensure that our counts are accurate */
+	spin_lock(&hugetlb_lock);
+	free_on_other_nodes = free_huge_pages - free_huge_pages_node[nid];
+	if (free_on_other_nodes >= resv_huge_pages) {
+		/* other nodes can satisfy reserve */
+		target = nr_huge_pages_req;
+	} else {
+		/* this node needs some free to satisfy reserve */
+		target = max((resv_huge_pages - free_on_other_nodes),
+				nr_huge_pages_req);
+	}
+	try_to_free_low_node(nid, target);
+	while (target < nr_huge_pages_node[nid]) {
+		struct page *page = dequeue_huge_page_node(nid);
+		if (!page)
+			break;
+		update_and_free_page(nid, page);
+	}
+	spin_unlock(&hugetlb_lock);
+
+	return count;
+}
+
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
--
Nishanth Aravamudan <nacc@us.ibm.com>
IBM Linux Technology Center
Thread overview: 8+ messages
2007-07-14 20:37 [PATCH v8] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
2007-07-14 20:41 ` [PATCH 2/3] hugetlb: numafy several functions Nishanth Aravamudan
2007-07-14 20:43 ` [PATCH 3/3] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
2007-07-14 20:44 ` Nishanth Aravamudan
2007-07-14 20:41 ` [PATCH v8] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan
-- strict thread matches above, loose matches on Subject: below --
2007-06-18 17:34 [PATCH 1/3] " Nishanth Aravamudan
2007-06-18 17:37 ` [PATCH 2/3] hugetlb: numafy several functions Nishanth Aravamudan
2007-06-18 17:53 ` [PATCH 3/3] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
2007-06-08 19:06 [PATCH v5][1/3] Fix hugetlb pool allocation with empty nodes Nishanth Aravamudan, Lee Schermerhorn
2007-06-08 19:07 ` [PATCH][2/3] hugetlb: numafy several functions Nishanth Aravamudan
2007-06-08 19:08 ` [PATCH][3/3] hugetlb: add per-node nr_hugepages sysfs attribute Nishanth Aravamudan
2007-06-08 19:10 ` Nishanth Aravamudan