* [PATCH/RFC 2/4] VM: page cache reclaim core
From: Martin Hicks @ 2005-04-27 15:09 UTC
To: Andrew Morton, Linux-MM; +Cc: Ray Bryant, ak
This is the core of the local reclaim code.  It contains a few
modifications to the current reclaim code to support scanning for
easily freed active pages.  The central routine for reclaiming easily
freed pages is reclaim_clean_pages().
The motivation for this patch is NUMA systems, which would much
prefer local memory allocations when possible.  Large performance
regressions have been seen in situations as simple as compiling
kernels on a busy build server with a lot of memory trapped in the
page cache.
The feature adds the core mechanism for freeing up caches, although
page cache freeing is the only kind implemented so far.  Cleaning the
slab cache is a future goal.  The follow-on patches provide manual
and automatic reclaim methods.
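
For illustration only, a rough sketch of how a caller might drive
reclaim_clean_pages() -- loosely modelled on what patch 4/4 does in
the page allocator.  try_local_zone() is an invented name and is not
part of this series:

#include <linux/mmzone.h>
#include <linux/swap.h>

/*
 * Keep asking the zone for easily freed page cache until either the
 * watermark is satisfied or the zone has nothing more to give.
 */
static int try_local_zone(struct zone *zone, unsigned int order)
{
	int flags = RECLAIM_UNMAPPED | RECLAIM_MAPPED;

	/* reclaim_clean_pages() returns the number of pages it freed */
	while (reclaim_clean_pages(zone, 1 << order, flags)) {
		if (zone_watermark_ok(zone, order, zone->pages_low,
				      zone_idx(zone), 0, 0))
			return 1;	/* enough local memory now */
	}
	return 0;			/* fall back to remote nodes */
}
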
Signed-off-by: Martin Hicks <mort@sgi.com>
---
include/linux/mmzone.h | 13 +++
include/linux/swap.h | 15 +++
mm/page_alloc.c | 8 +-
mm/vmscan.c | 185 +++++++++++++++++++++++++++++++++++++++++++++----
4 files changed, 205 insertions(+), 16 deletions(-)
Index: linux-2.6.12-rc2.wk/mm/vmscan.c
===================================================================
--- linux-2.6.12-rc2.wk.orig/mm/vmscan.c 2005-04-27 06:56:48.000000000 -0700
+++ linux-2.6.12-rc2.wk/mm/vmscan.c 2005-04-27 06:56:57.000000000 -0700
@@ -73,6 +73,12 @@ struct scan_control {
unsigned int gfp_mask;
int may_writepage;
+ int may_swap;
+
+ /* Flags to indicate what kind of pages to free during
+ * calls into reclaim_clean_pages() and shrink_list().
+ */
+ int reclaim_flags;
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
@@ -376,6 +382,10 @@ static int shrink_list(struct list_head
struct pagevec freed_pvec;
int pgactivate = 0;
int reclaimed = 0;
+ int reclaim_active = sc->reclaim_flags &
+ (RECLAIM_ACTIVE_UNMAPPED | RECLAIM_ACTIVE_MAPPED);
+ int reclaim_mapped = sc->reclaim_flags &
+ (RECLAIM_MAPPED | RECLAIM_ACTIVE_MAPPED);
cond_resched();
@@ -394,7 +404,10 @@ static int shrink_list(struct list_head
if (TestSetPageLocked(page))
goto keep;
- BUG_ON(PageActive(page));
+ if (!reclaim_active)
+ BUG_ON(PageActive(page));
+ else
+ BUG_ON(!PageActive(page));
sc->nr_scanned++;
/* Double the slab pressure for mapped and swapcache pages */
@@ -414,7 +427,8 @@ static int shrink_list(struct list_head
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
+ if (PageAnon(page) && !PageSwapCache(page) &&
+ sc->may_swap) {
void *cookie = page->mapping;
pgoff_t index = page->index;
@@ -431,7 +445,7 @@ static int shrink_list(struct list_head
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
+ if (page_mapped(page) && mapping && reclaim_mapped) {
switch (try_to_unmap(page)) {
case SWAP_FAIL:
goto activate_locked;
@@ -537,6 +551,8 @@ static int shrink_list(struct list_head
__put_page(page);
free_it:
+ /* Clear the active bit before freeing the page */
+ ClearPageActive(page);
unlock_page(page);
reclaimed++;
if (!pagevec_add(&freed_pvec, page))
@@ -544,8 +560,10 @@ free_it:
continue;
activate_locked:
- SetPageActive(page);
- pgactivate++;
+ if (!reclaim_active) {
+ SetPageActive(page);
+ pgactivate++;
+ }
keep_locked:
unlock_page(page);
keep:
@@ -705,7 +723,7 @@ static void shrink_cache(struct zone *zo
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/
-static void
+static int
refill_inactive_zone(struct zone *zone, struct scan_control *sc)
{
int pgmoved;
@@ -721,6 +739,7 @@ refill_inactive_zone(struct zone *zone,
long mapped_ratio;
long distress;
long swap_tendency;
+ int ret;
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
@@ -801,6 +820,7 @@ refill_inactive_zone(struct zone *zone,
}
zone->nr_inactive += pgmoved;
pgdeactivate += pgmoved;
+ ret = pgmoved;
if (buffer_heads_over_limit) {
spin_unlock_irq(&zone->lru_lock);
pagevec_strip(&pvec);
@@ -830,6 +850,8 @@ refill_inactive_zone(struct zone *zone,
mod_page_state_zone(zone, pgrefill, pgscanned);
mod_page_state(pgdeactivate, pgdeactivate);
+
+ return ret;
}
/*
@@ -916,7 +938,8 @@ shrink_caches(struct zone **zones, struc
if (zone->prev_priority > sc->priority)
zone->prev_priority = sc->priority;
- if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+ if (zone->unreclaimable == ALL_UNRECL &&
+ sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
shrink_zone(zone, sc);
@@ -949,6 +972,8 @@ int try_to_free_pages(struct zone **zone
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
+ sc.may_swap = 1;
+ sc.reclaim_flags = RECLAIM_UNMAPPED | RECLAIM_MAPPED;
inc_page_state(allocstall);
@@ -1049,6 +1074,8 @@ loop_again:
total_reclaimed = 0;
sc.gfp_mask = GFP_KERNEL;
sc.may_writepage = 0;
+ sc.may_swap = 1;
+ sc.reclaim_flags = RECLAIM_UNMAPPED | RECLAIM_MAPPED;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
@@ -1076,8 +1103,8 @@ loop_again:
if (zone->present_pages == 0)
continue;
- if (zone->all_unreclaimable &&
- priority != DEF_PRIORITY)
+ if (zone->unreclaimable == ALL_UNRECL &&
+ priority != DEF_PRIORITY)
continue;
if (!zone_watermark_ok(zone, order,
@@ -1113,7 +1140,8 @@ scan:
if (zone->present_pages == 0)
continue;
- if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ if (zone->unreclaimable == ALL_UNRECL &&
+ priority != DEF_PRIORITY)
continue;
if (nr_pages == 0) { /* Not software suspend */
@@ -1135,11 +1163,11 @@ scan:
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_reclaimed += sc.nr_reclaimed;
total_scanned += sc.nr_scanned;
- if (zone->all_unreclaimable)
+ if (zone->unreclaimable == ALL_UNRECL)
continue;
if (nr_slab == 0 && zone->pages_scanned >=
(zone->nr_active + zone->nr_inactive) * 4)
- zone->all_unreclaimable = 1;
+ zone->unreclaimable = ALL_UNRECL;
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -1340,3 +1368,136 @@ static int __init kswapd_init(void)
}
module_init(kswapd_init)
+
+/* How many pages are processed at a time. */
+#define MIN_RECLAIM 32
+#define MAX_BATCH_SIZE 128
+#define UNRECLAIMABLE_TIMEOUT 5
+
+unsigned int reclaim_clean_pages(struct zone *zone, long pages, int flags)
+{
+ int batch_size;
+ unsigned int total_reclaimed = 0;
+ LIST_HEAD(page_list);
+ struct scan_control sc;
+ int max_scan;
+ int manual = flags & RECLAIM_MANUAL;
+
+ /* Zone is marked dead */
+ if (zone->unreclaimable & CLEAN_UNRECL && !manual)
+ return 0;
+
+ /* We don't really want to call this too often */
+ if (get_jiffies_64() < zone->reclaim_timeout) {
+ /* check for jiffies overflow -- needed? */
+ if (zone->reclaim_timeout - get_jiffies_64() >
+ UNRECLAIMABLE_TIMEOUT)
+ zone->reclaim_timeout = get_jiffies_64();
+ else if (!manual)
+ return 0;
+ }
+
+ /*
+ * Only one reclaimer scanning the zone at a time.
+ * Lie a bit with the return value, since another thread
+ * is in the process of reclaiming pages.
+ */
+ if (!manual && atomic_inc_and_test(&zone->reclaim_count))
+ return 1;
+
+ /* Don't go into the filesystem during this page freeing attempt */
+ sc.gfp_mask = 0;
+ sc.may_writepage = 0;
+ sc.may_swap = 0;
+ sc.reclaim_flags = flags;
+
+ /* make it worth our while to take the LRU lock */
+ if (pages < MIN_RECLAIM)
+ pages = MIN_RECLAIM;
+
+ /*
+ * Also don't take too many pages at a time,
+ * which can lead to a big overshoot in the
+ * number of pages that are freed.
+ */
+ if (pages > MAX_BATCH_SIZE)
+ batch_size = MAX_BATCH_SIZE;
+ else
+ batch_size = pages;
+
+ if (flags & (RECLAIM_UNMAPPED | RECLAIM_MAPPED)) {
+ /* Doing inactive. Clear the active flags for now. */
+ sc.reclaim_flags &= ~(RECLAIM_ACTIVE_UNMAPPED |
+ RECLAIM_ACTIVE_MAPPED);
+
+ /* Not an exact count, but close enough */
+ max_scan = zone->nr_inactive;
+
+ while (pages > 0 && max_scan > 0) {
+ int moved = 0;
+ int reclaimed = 0;
+ int scanned;
+
+ spin_lock_irq(&zone->lru_lock);
+ moved = isolate_lru_pages(batch_size,
+ &zone->inactive_list,
+ &page_list, &scanned);
+ zone->nr_inactive -= moved;
+ spin_unlock_irq(&zone->lru_lock);
+ max_scan -= moved;
+
+ reclaimed = shrink_list(&page_list, &sc);
+
+ /* Put back the unfreeable pages */
+ spin_lock_irq(&zone->lru_lock);
+ merge_lru_pages(zone, &page_list);
+ spin_unlock_irq(&zone->lru_lock);
+
+ total_reclaimed += reclaimed;
+ pages -= reclaimed;
+ }
+ }
+
+ if (flags & (RECLAIM_ACTIVE_UNMAPPED | RECLAIM_ACTIVE_MAPPED)) {
+ /* Get flags for scan_control again, in case they were
+ * cleared while doing inactive reclaim
+ */
+ sc.reclaim_flags = flags;
+
+ max_scan = zone->nr_active;
+ while (pages > 0 && max_scan > 0) {
+ int moved = 0;
+ int reclaimed = 0;
+ int scanned;
+
+ spin_lock_irq(&zone->lru_lock);
+ moved = isolate_lru_pages(batch_size,
+ &zone->active_list,
+ &page_list, &scanned);
+ zone->nr_active -= moved;
+ spin_unlock_irq(&zone->lru_lock);
+ max_scan -= moved;
+
+ reclaimed = shrink_list(&page_list, &sc);
+
+ /* Put back the unfreeable pages */
+ spin_lock_irq(&zone->lru_lock);
+ merge_lru_pages(zone, &page_list);
+ spin_unlock_irq(&zone->lru_lock);
+
+ total_reclaimed += reclaimed;
+ pages -= reclaimed;
+ }
+ }
+
+ /* The goal wasn't met */
+ if (pages > 0) {
+ zone->reclaim_timeout = get_jiffies_64() +
+ UNRECLAIMABLE_TIMEOUT;
+ zone->unreclaimable |= CLEAN_UNRECL;
+ }
+
+ atomic_set(&zone->reclaim_count, -1);
+
+ return total_reclaimed;
+}
Index: linux-2.6.12-rc2.wk/include/linux/swap.h
===================================================================
--- linux-2.6.12-rc2.wk.orig/include/linux/swap.h 2005-04-27 06:56:48.000000000 -0700
+++ linux-2.6.12-rc2.wk/include/linux/swap.h 2005-04-27 06:56:57.000000000 -0700
@@ -144,6 +144,20 @@ struct swap_list_t {
int next; /* swapfile to be used next */
};
+/* Page cache reclaim definitions */
+#define RECLAIM_UNMAPPED (1<<0)
+#define RECLAIM_MAPPED (1<<1)
+#define RECLAIM_ACTIVE_UNMAPPED (1<<2)
+#define RECLAIM_ACTIVE_MAPPED (1<<3)
+#define RECLAIM_SLAB (1<<4)
+#define RECLAIM_MANUAL (1<<5)
+#define RECLAIM_MASK ~(RECLAIM_UNMAPPED | \
+ RECLAIM_MAPPED | \
+ RECLAIM_ACTIVE_UNMAPPED | \
+ RECLAIM_ACTIVE_MAPPED | \
+ RECLAIM_SLAB | \
+ RECLAIM_MANUAL)
+
/* Swap 50% full? Release swapcache more aggressively.. */
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
@@ -174,6 +188,7 @@ extern void swap_setup(void);
/* linux/mm/vmscan.c */
extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
extern int shrink_all_memory(int);
+extern unsigned int reclaim_clean_pages(struct zone *, long, int);
extern int vm_swappiness;
#ifdef CONFIG_MMU
Index: linux-2.6.12-rc2.wk/mm/page_alloc.c
===================================================================
--- linux-2.6.12-rc2.wk.orig/mm/page_alloc.c 2005-04-27 06:56:48.000000000 -0700
+++ linux-2.6.12-rc2.wk/mm/page_alloc.c 2005-04-27 06:56:57.000000000 -0700
@@ -347,7 +347,7 @@ free_pages_bulk(struct zone *zone, int c
int ret = 0;
spin_lock_irqsave(&zone->lock, flags);
- zone->all_unreclaimable = 0;
+ zone->unreclaimable = 0;
zone->pages_scanned = 0;
while (!list_empty(list) && count--) {
page = list_entry(list->prev, struct page, lru);
@@ -1328,7 +1328,7 @@ void show_free_areas(void)
" inactive:%lukB"
" present:%lukB"
" pages_scanned:%lu"
- " all_unreclaimable? %s"
+ " unreclaimable: %d"
"\n",
zone->name,
K(zone->free_pages),
@@ -1339,7 +1339,7 @@ void show_free_areas(void)
K(zone->nr_inactive),
K(zone->present_pages),
zone->pages_scanned,
- (zone->all_unreclaimable ? "yes" : "no")
+ zone->unreclaimable
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
@@ -1751,6 +1751,8 @@ static void __init free_area_init_core(s
zone->nr_scan_inactive = 0;
zone->nr_active = 0;
zone->nr_inactive = 0;
+ zone->reclaim_count = ATOMIC_INIT(-1);
+ zone->reclaim_timeout = get_jiffies_64();
if (!size)
continue;
Index: linux-2.6.12-rc2.wk/include/linux/mmzone.h
===================================================================
--- linux-2.6.12-rc2.wk.orig/include/linux/mmzone.h 2005-04-27 06:56:48.000000000 -0700
+++ linux-2.6.12-rc2.wk/include/linux/mmzone.h 2005-04-27 06:56:57.000000000 -0700
@@ -29,6 +29,15 @@ struct free_area {
struct pglist_data;
/*
+ * Information about reclaimability of a zone's pages.
+ * After we have scanned a zone and determined that there
+ * are no other pages to free of a certain type we can
+ * stop scanning it
+ */
+#define CLEAN_UNRECL 0x1
+#define ALL_UNRECL 0x3
+
+/*
* zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
* So add a wild amount of padding here to ensure that they fall into separate
* cachelines. There are very few zone structures in the machine, so space
@@ -142,7 +151,9 @@ struct zone {
unsigned long nr_active;
unsigned long nr_inactive;
unsigned long pages_scanned; /* since last reclaim */
- int all_unreclaimable; /* All pages pinned */
+ int unreclaimable; /* pinned pages marker */
+ atomic_t reclaim_count;
+ unsigned long reclaim_timeout;
/*
* prev_priority holds the scanning priority for this zone. It is
--
* [PATCH/RFC 4/4] VM: automatic reclaim through mempolicy
From: Martin Hicks @ 2005-04-27 15:10 UTC
To: Andrew Morton, Linux-MM; +Cc: Ray Bryant, ak
This implements a set of flags that modify the behavior of the
mempolicies to allow reclaiming of preferred memory (as defined by
the mempolicy) before spilling onto remote nodes.  It also adds a new
mempolicy, "localreclaim", which is just the default mempolicy with
non-zero reclaim flags.
The change required adding a "flags" argument to sys_set_mempolicy()
to give hints about what kind of memory you're willing to sacrifice.
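
For the curious, here is a hypothetical userspace sketch of the
extended call against a kernel with this series applied.  The raw
syscall is used because the libc wrapper knows nothing about the
fourth argument, the MPOL_LR_* values simply mirror the definitions
this patch adds to mempolicy.h, and SYS_set_mempolicy must be
provided by your architecture's headers:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MPOL_DEFAULT		0
#define MPOL_LR_UNMAPPED	(1 << 3)	/* RECLAIM_UNMAPPED << 3 */
#define MPOL_LR_MAPPED		(1 << 4)	/* RECLAIM_MAPPED << 3 */

int main(void)
{
	/* default policy, but reclaim local page cache first */
	if (syscall(SYS_set_mempolicy, MPOL_DEFAULT, NULL, 0,
		    MPOL_LR_UNMAPPED | MPOL_LR_MAPPED)) {
		perror("set_mempolicy");
		return 1;
	}
	return 0;
}

This is presumably close to what numactl --localreclaim=um requests
on the caller's behalf.
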
A patch for numactl-0.6.4 to support these new flags is at
http://www.bork.org/~mort/sgi/localreclaim/numactl-localreclaim.patch
That patch breaks compatibility, but I just needed something to test
with.  I did update numactl's usage message with the new bits.
Essentially, just add "--localreclaim=[umUM]" to get the allocator to
use localreclaim.
I'm sure that better tuning of the rate-limiting code in
vmscan.c::reclaim_clean_pages() could help performance further, but
at this stage I was fairly happy to keep the system time at a
reasonable level.  The obvious difficulty with this patch is ensuring
that it doesn't scan the LRU lists to death looking for non-existent
clean pages.
Here are some kernbench runs that show that things don't get out of
control under heavy VM pressure.  I think kernbench's "Maximal" run
is a fairly stressful test for this code because it allocates all of
the memory in the system and still must do disk IO during the
compiles.
I haven't yet had time to do a run in a situation where I think the
patches will make a real difference. I'm going to do some runs
with a big HPC app this week.
The test machine was a 4-way 8GB Altix. The "minimal" (make -j3) and
"optimal" (make -j16) results are uninteresting. All three runs
show almost exactly the same results because we never actually invoke
any of this new code. There is no VM pressure.
                 Wall  User  System  %CPU  Ctx Sw  Sleeps
                 ----  ----  ------  ----  ------  ------
2.6.12-rc2-mm2   1296  1375     387   160  252333  388268
noreclaim        1111  1370     319   195  216259  318279
reclaim=um       1251  1373     312   160  223148  371875
These numbers are the average of two runs.  There seems to be large
variance in the first two configurations, but the reclaim=um run is
quite consistent.
2.6.12-rc2-mm2 is kernbench run on a pristine tree.
noreclaim is with the patches, but no use of numactl.
reclaim=um is kernbench invoked with:
./numactl --localreclaim=um ../kernbench-0.3.0/kernbench
Signed-off-by: Martin Hicks <mort@sgi.com>
---
include/linux/gfp.h | 3 +
include/linux/mempolicy.h | 33 ++++++++++++++---
mm/mempolicy.c | 68 ++++++++++++++++++++++++++---------
mm/page_alloc.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 168 insertions(+), 23 deletions(-)
Index: linux-2.6.12-rc2.wk/mm/mempolicy.c
===================================================================
--- linux-2.6.12-rc2.wk.orig/mm/mempolicy.c 2005-04-27 06:27:38.000000000 -0700
+++ linux-2.6.12-rc2.wk/mm/mempolicy.c 2005-04-27 07:09:09.000000000 -0700
@@ -19,7 +19,7 @@
* is used.
* bind Only allocate memory on a specific set of nodes,
* no fallback.
- * preferred Try a specific node first before normal fallback.
+ * preferred Try a specific node first before normal fallback.
* As a special case node -1 here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
@@ -27,6 +27,9 @@
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
+ * localreclaim This is a special case of default. The allocator
+ * will try very hard to get a local allocation. It
+ * invokes page cache cleaners and slab cleaners.
*
* The process policy is applied for most non interrupt memory allocations
* in that process' context. Interrupts ignore the policies and always
@@ -113,6 +116,7 @@ static int mpol_check_policy(int mode, u
switch (mode) {
case MPOL_DEFAULT:
+ case MPOL_LOCALRECLAIM:
if (!empty)
return -EINVAL;
break;
@@ -205,13 +209,19 @@ static struct zonelist *bind_zonelist(un
}
/* Create a new policy */
-static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
+static struct mempolicy *mpol_new(int mode, unsigned long *nodes,
+ unsigned int flags)
{
struct mempolicy *policy;
+ int mpol_flags = mpol_to_reclaim_flags(flags);
PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
- if (mode == MPOL_DEFAULT)
- return NULL;
+ if (mode == MPOL_DEFAULT) {
+ if (!flags)
+ return NULL;
+ else
+ mode = MPOL_LOCALRECLAIM;
+ }
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
@@ -234,6 +244,7 @@ static struct mempolicy *mpol_new(int mo
break;
}
policy->policy = mode;
+ policy->flags = mpol_flags;
return policy;
}
@@ -384,7 +395,7 @@ asmlinkage long sys_mbind(unsigned long
if (err)
return err;
- new = mpol_new(mode, nodes);
+ new = mpol_new(mode, nodes, flags);
if (IS_ERR(new))
return PTR_ERR(new);
@@ -403,7 +414,7 @@ asmlinkage long sys_mbind(unsigned long
/* Set the process memory policy */
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
- unsigned long maxnode)
+ unsigned long maxnode, int flags)
{
int err;
struct mempolicy *new;
@@ -411,10 +422,12 @@ asmlinkage long sys_set_mempolicy(int mo
if (mode > MPOL_MAX)
return -EINVAL;
+ if (flags & MPOL_FLAG_MASK)
+ return -EINVAL;
err = get_nodes(nodes, nmask, maxnode, mode);
if (err)
return err;
- new = mpol_new(mode, nodes);
+ new = mpol_new(mode, nodes, flags);
if (IS_ERR(new))
return PTR_ERR(new);
mpol_free(current->mempolicy);
@@ -436,6 +449,7 @@ static void get_zonemask(struct mempolic
__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
break;
case MPOL_DEFAULT:
+ case MPOL_LOCALRECLAIM:
break;
case MPOL_INTERLEAVE:
bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
@@ -600,7 +614,7 @@ asmlinkage long compat_sys_set_mempolicy
if (err)
return -EFAULT;
- return sys_set_mempolicy(mode, nm, nr_bits+1);
+ return sys_set_mempolicy(mode, nm, nr_bits+1, 0);
}
asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
@@ -666,6 +680,7 @@ static struct zonelist *zonelist_policy(
return policy->v.zonelist;
/*FALL THROUGH*/
case MPOL_INTERLEAVE: /* should not happen */
+ case MPOL_LOCALRECLAIM:
case MPOL_DEFAULT:
nd = numa_node_id();
break;
@@ -712,14 +727,17 @@ static unsigned offset_il_node(struct me
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid, int flags)
{
struct zonelist *zl;
struct page *page;
BUG_ON(!node_online(nid));
zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
- page = __alloc_pages(gfp, order, zl);
+ if (flags)
+ page = __alloc_pages_localreclaim(gfp, order, zl, flags);
+ else
+ page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
zl->zones[0]->pageset[get_cpu()].interleave_hit++;
put_cpu();
@@ -769,8 +787,12 @@ alloc_page_vma(unsigned int __nocast gfp
/* fall back to process interleaving */
nid = interleave_nodes(pol);
}
- return alloc_page_interleave(gfp, 0, nid);
+ return alloc_page_interleave(gfp, 0, nid, pol->flags);
}
+
+ if (pol->flags)
+ return __alloc_pages_localreclaim(gfp, 0,
+ zonelist_policy(gfp, pol), pol->flags);
return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
@@ -802,7 +824,11 @@ struct page *alloc_pages_current(unsigne
if (!pol || in_interrupt())
pol = &default_policy;
if (pol->policy == MPOL_INTERLEAVE)
- return alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ return alloc_page_interleave(gfp, order, interleave_nodes(pol),
+ pol->flags);
+ if (pol->flags)
+ return __alloc_pages_localreclaim(gfp, order,
+ zonelist_policy(gfp, pol), pol->flags);
return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -831,23 +857,29 @@ struct mempolicy *__mpol_copy(struct mem
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
+ int flags;
+
if (!a || !b)
return 0;
if (a->policy != b->policy)
return 0;
+ flags = a->flags == b->flags;
switch (a->policy) {
case MPOL_DEFAULT:
return 1;
+ case MPOL_LOCALRECLAIM:
+ return a->flags == b->flags;
case MPOL_INTERLEAVE:
- return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
+ return flags && bitmap_equal(a->v.nodes, b->v.nodes,
+ MAX_NUMNODES);
case MPOL_PREFERRED:
- return a->v.preferred_node == b->v.preferred_node;
+ return flags && a->v.preferred_node == b->v.preferred_node;
case MPOL_BIND: {
int i;
for (i = 0; a->v.zonelist->zones[i]; i++)
if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
return 0;
- return b->v.zonelist->zones[i] == NULL;
+ return flags && b->v.zonelist->zones[i] == NULL;
}
default:
BUG();
@@ -878,6 +910,7 @@ int mpol_first_node(struct vm_area_struc
switch (pol->policy) {
case MPOL_DEFAULT:
+ case MPOL_LOCALRECLAIM:
return numa_node_id();
case MPOL_BIND:
return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
@@ -900,6 +933,7 @@ int mpol_node_valid(int nid, struct vm_a
case MPOL_PREFERRED:
case MPOL_DEFAULT:
case MPOL_INTERLEAVE:
+ case MPOL_LOCALRECLAIM:
return 1;
case MPOL_BIND: {
struct zone **z;
@@ -1126,7 +1160,7 @@ void __init numa_policy_init(void)
the data structures allocated at system boot end up in node zero. */
if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
- MAX_NUMNODES) < 0)
+ MAX_NUMNODES, 0) < 0)
printk("numa_policy_init: interleaving failed\n");
}
@@ -1134,5 +1168,5 @@ void __init numa_policy_init(void)
* Assumes fs == KERNEL_DS */
void numa_default_policy(void)
{
- sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+ sys_set_mempolicy(MPOL_DEFAULT, NULL, 0, 0);
}
Index: linux-2.6.12-rc2.wk/include/linux/gfp.h
===================================================================
--- linux-2.6.12-rc2.wk.orig/include/linux/gfp.h 2005-04-27 06:27:38.000000000 -0700
+++ linux-2.6.12-rc2.wk/include/linux/gfp.h 2005-04-27 07:09:09.000000000 -0700
@@ -81,6 +81,9 @@ static inline void arch_free_page(struct
extern struct page *
FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *));
+extern struct page *
+FASTCALL(__alloc_pages_localreclaim(unsigned int, unsigned int,
+ struct zonelist *, int));
static inline struct page *alloc_pages_node(int nid, unsigned int __nocast gfp_mask,
unsigned int order)
Index: linux-2.6.12-rc2.wk/mm/page_alloc.c
===================================================================
--- linux-2.6.12-rc2.wk.orig/mm/page_alloc.c 2005-04-27 06:56:57.000000000 -0700
+++ linux-2.6.12-rc2.wk/mm/page_alloc.c 2005-04-27 07:09:09.000000000 -0700
@@ -958,6 +958,93 @@ got_pg:
EXPORT_SYMBOL(__alloc_pages);
+#ifdef CONFIG_NUMA
+
+/*
+ * A function that tries to allocate memory from the local
+ * node by trying really hard, including trying to free up
+ * easily-freed memory from the page cache and (perhaps in the
+ * future) the slab
+ */
+struct page * fastcall
+__alloc_pages_localreclaim(unsigned int gfp_mask, unsigned int order,
+ struct zonelist *zonelist, int flags)
+{
+ struct zone **zones, *z;
+ struct page *page = NULL;
+ int classzone_idx;
+ int i;
+
+ /*
+ * Never try local reclaim with GFP_ATOMIC and friends, because
+ * this path might sleep.
+ */
+ if (!(gfp_mask & __GFP_WAIT))
+ return __alloc_pages(gfp_mask, order, zonelist);
+
+ zones = zonelist->zones;
+ if (unlikely(zones[0] == NULL))
+ return NULL;
+
+ classzone_idx = zone_idx(zones[0]);
+
+ /*
+ * Go through the zonelist once, looking for a local zone
+ * with enough free memory.
+ */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ if (NODE_DATA(numa_node_id()) != z->zone_pgdat)
+ continue;
+ if (!cpuset_zone_allowed(z))
+ continue;
+
+ if (zone_watermark_ok(z, order, z->pages_low,
+ classzone_idx, 0, 0)) {
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
+ }
+
+ /* Go through again trying to free memory from the zone */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ if (NODE_DATA(numa_node_id()) != z->zone_pgdat)
+ continue;
+ if (!cpuset_zone_allowed(z))
+ continue;
+
+ while (reclaim_clean_pages(z, 1<<order, flags)) {
+ if (zone_watermark_ok(z, order, z->pages_low,
+ classzone_idx, 0, 0)) {
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ goto got_pg;
+ }
+ }
+ }
+
+ /* Didn't get a local page - invoke the normal allocator */
+ return __alloc_pages(gfp_mask, order, zonelist);
+ got_pg:
+
+#ifdef CONFIG_PAGE_OWNER /* huga... */
+ {
+ unsigned long address, bp;
+#ifdef X86_64
+ asm ("movq %%rbp, %0" : "=r" (bp) : );
+#else
+ asm ("movl %%ebp, %0" : "=r" (bp) : );
+#endif
+ page->order = (int) order;
+ __stack_trace(page, &address, bp);
+ }
+#endif /* CONFIG_PAGE_OWNER */
+ zone_statistics(zonelist, z);
+ return page;
+}
+
+#endif /* CONFIG_NUMA */
+
/*
* Common helper functions.
*/
Index: linux-2.6.12-rc2.wk/include/linux/mempolicy.h
===================================================================
--- linux-2.6.12-rc2.wk.orig/include/linux/mempolicy.h 2005-04-27 06:27:38.000000000 -0700
+++ linux-2.6.12-rc2.wk/include/linux/mempolicy.h 2005-04-27 07:09:09.000000000 -0700
@@ -2,6 +2,7 @@
#define _LINUX_MEMPOLICY_H 1
#include <linux/errno.h>
+#include <linux/swap.h>
/*
* NUMA memory policies for Linux.
@@ -9,19 +10,38 @@
*/
/* Policies */
-#define MPOL_DEFAULT 0
-#define MPOL_PREFERRED 1
-#define MPOL_BIND 2
-#define MPOL_INTERLEAVE 3
+#define MPOL_DEFAULT 0
+#define MPOL_PREFERRED 1
+#define MPOL_BIND 2
+#define MPOL_INTERLEAVE 3
+#define MPOL_LOCALRECLAIM 4
-#define MPOL_MAX MPOL_INTERLEAVE
+#define MPOL_MAX MPOL_LOCALRECLAIM
/* Flags for get_mem_policy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
/* Flags for mbind */
-#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
+#define MPOL_MF_STRICT (1<<2) /* Verify existing pages in the mapping */
+
+/* Flags for set_mempolicy */
+#define mpol_reclaim_shift(x) ((x)<<3)
+#define MPOL_LR_UNMAPPED mpol_reclaim_shift(RECLAIM_UNMAPPED)
+#define MPOL_LR_MAPPED mpol_reclaim_shift(RECLAIM_MAPPED)
+#define MPOL_LR_ACTIVE_UNMAPPED mpol_reclaim_shift(RECLAIM_ACTIVE_UNMAPPED)
+#define MPOL_LR_ACTIVE_MAPPED mpol_reclaim_shift(RECLAIM_ACTIVE_MAPPED)
+#define MPOL_LR_SLAB mpol_reclaim_shift(RECLAIM_SLAB)
+
+#define MPOL_LR_FLAGS (MPOL_LR_UNMAPPED | MPOL_LR_MAPPED | \
+ MPOL_LR_ACTIVE_MAPPED | MPOL_LR_ACTIVE_UNMAPPED | \
+ MPOL_LR_SLAB)
+#define MPOL_LR_MASK ~MPOL_LR_FLAGS
+#define MPOL_FLAGS (MPOL_F_NODE | MPOL_F_ADDR | MPOL_MF_STRICT | \
+ MPOL_LR_FLAGS)
+#define MPOL_FLAG_MASK ~MPOL_FLAGS
+#define mpol_to_reclaim_flags(flags) ((flags & MPOL_LR_FLAGS) >> 3)
+
#ifdef __KERNEL__
@@ -60,6 +80,7 @@ struct vm_area_struct;
struct mempolicy {
atomic_t refcnt;
short policy; /* See MPOL_* above */
+ int flags;
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
--