From: dave@sr71.net
To: linux-mm@kvack.org
Cc: balbir@in.ibm.com, dave@sr71.net
Subject: [RFC][PATCH] "challenged" memory controller
Date: Tue, 15 Aug 2006 12:20:47 -0700 [thread overview]
Message-ID: <20060815192047.EE4A0960@localhost.localdomain> (raw)
I've been toying with a little memory controller for the past
few weeks, on and off. My goal was to create something simple
and hackish that would at least be a toy to play with in the
process of creating something that might actually be feasible.
I call it "challenged" because it has some definite limitations.
However, it only adds about 50 lines of code to generic areas
of the VM, and I haven't been the slightest bit careful, yet.
I think it probably also breaks CONFIG_PM and !CONFIG_CPUSETS,
but those are "features". ;)
It uses cpusets for now, just because they are there, and are
relatively easy to modify. The page->cpuset bit is only
temporary, and I have some plans to remove it later.
How does it work? It adds two fields to the scan control
structure. One that tells the scan to only pay attention to
_any_ cpuset over its memory limits, and the other to tell it
to only scan pages for a _particular_ cpuset.
I've been pretty indiscriminately hacking away, so I have the
feeling that there are some more efficient and nicer ways to
hook into the page scanning logic. Comments are very welcome.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---
lxc-dave/include/linux/cpuset.h | 4 +
lxc-dave/include/linux/gfp.h | 4 -
lxc-dave/include/linux/swap.h | 2
lxc-dave/kernel/cpuset.c | 99 +++++++++++++++++++++++++++++++++++++++-
lxc-dave/mm/page_alloc.c | 12 ++++
lxc-dave/mm/rmap.c | 5 ++
lxc-dave/mm/vmscan.c | 33 +++++++++++--
7 files changed, 149 insertions(+), 10 deletions(-)
diff -puN include/linux/cpuset.h~challenged-memory-controller include/linux/cpuset.h
--- lxc/include/linux/cpuset.h~challenged-memory-controller 2006-08-14 13:22:12.000000000 -0700
+++ lxc-dave/include/linux/cpuset.h 2006-08-15 07:58:15.000000000 -0700
@@ -127,5 +127,9 @@ static inline int cpuset_do_slab_mem_spr
}
#endif /* !CONFIG_CPUSETS */
+int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask);
+void cpuset_dec_nr_pages(struct cpuset *cs, int nr);
+int cpuset_get_nr_pages(const struct cpuset *cs);
+int cpuset_amount_over_memory_max(const struct cpuset *cs);
#endif /* _LINUX_CPUSET_H */
diff -puN include/linux/gfp.h~challenged-memory-controller include/linux/gfp.h
--- lxc/include/linux/gfp.h~challenged-memory-controller 2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/include/linux/gfp.h 2006-08-15 07:47:34.000000000 -0700
@@ -108,10 +108,6 @@ static inline enum zone_type gfp_zone(gf
* optimized to &contig_page_data at compile-time.
*/
-#ifndef HAVE_ARCH_FREE_PAGE
-static inline void arch_free_page(struct page *page, int order) { }
-#endif
-
extern struct page *
FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
diff -puN include/linux/mm.h~challenged-memory-controller include/linux/mm.h
diff -puN include/linux/sched.h~challenged-memory-controller include/linux/sched.h
diff -puN include/linux/swap.h~challenged-memory-controller include/linux/swap.h
--- lxc/include/linux/swap.h~challenged-memory-controller 2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/include/linux/swap.h 2006-08-15 07:47:34.000000000 -0700
@@ -188,7 +188,7 @@ extern void swap_setup(void);
/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zone **, gfp_t);
-extern unsigned long shrink_all_memory(unsigned long nr_pages);
+extern unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);
extern long vm_total_pages;
diff -puN kernel/cpuset.c~challenged-memory-controller kernel/cpuset.c
--- lxc/kernel/cpuset.c~challenged-memory-controller 2006-08-14 13:22:12.000000000 -0700
+++ lxc-dave/kernel/cpuset.c 2006-08-15 08:00:40.000000000 -0700
@@ -21,6 +21,7 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
+#include <linux/delay.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
@@ -46,6 +47,7 @@
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
+#include <linux/swap.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
@@ -97,6 +99,8 @@ struct cpuset {
* recent time this cpuset changed its mems_allowed.
*/
int mems_generation;
+ int mems_nr_pages;
+ int mems_max_pages;
struct fmeter fmeter; /* memory_pressure filter */
};
@@ -112,6 +116,55 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
+int shrink_cpuset(struct cpuset *cs, gfp_t gfpmask, int tries)
+{
+ int nr_shrunk = 0;
+ while (cpuset_amount_over_memory_max(cs)) {
+ if (tries-- < 0)
+ break;
+ nr_shrunk += shrink_all_memory(10, cs);
+ }
+ return 0;
+}
+
+int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask)
+{
+ int ret;
+ if (!cs)
+ return 0;
+ cs->mems_nr_pages += nr;
+ if (cpuset_amount_over_memory_max(cs)) {
+ if (!(gfpmask & __GFP_WAIT))
+ return -ENOMEM;
+ ret = shrink_cpuset(cs, gfpmask, 50);
+ }
+ if (cpuset_amount_over_memory_max(cs))
+ return -ENOMEM;
+ return 0;
+}
+void cpuset_dec_nr_pages(struct cpuset *cs, int nr)
+{
+ if (!cs)
+ return;
+ cs->mems_nr_pages -= nr;
+}
+int cpuset_get_nr_pages(const struct cpuset *cs)
+{
+ return cs->mems_nr_pages;
+}
+int cpuset_amount_over_memory_max(const struct cpuset *cs)
+{
+ int amount;
+
+ if (!cs || cs->mems_max_pages < 0)
+ return 0;
+ amount = cs->mems_nr_pages - cs->mems_max_pages;
+ if (amount < 0)
+ amount = 0;
+ return amount;
+}
+
+
/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
@@ -173,6 +226,8 @@ static struct cpuset top_cpuset = {
.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
.cpus_allowed = CPU_MASK_ALL,
.mems_allowed = NODE_MASK_ALL,
+ .mems_nr_pages = 0,
+ .mems_max_pages = -1,
.count = ATOMIC_INIT(0),
.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
.children = LIST_HEAD_INIT(top_cpuset.children),
@@ -1021,6 +1076,17 @@ static int update_memory_pressure_enable
return 0;
}
+static int update_memory_max_nr_pages(struct cpuset *cs, char *buf)
+{
+ int rate = simple_strtol(buf, NULL, 10);
+ int shrunk;
+ int loopnr = 0;
+ cs->mems_max_pages = rate;
+ while (cpuset_amount_over_memory_max(cs))
+ shrunk = shrink_cpuset(cs, 0, 10);
+ return 0;
+}
+
/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -1109,6 +1175,7 @@ static int update_flag(cpuset_flagbits_t
*/
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
+#define FM_COEF 93 /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
#define FM_SCALE 1000 /* faux fixed point scale */
@@ -1263,6 +1330,8 @@ typedef enum {
FILE_NOTIFY_ON_RELEASE,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
+ FILE_MEMORY_MAX,
+ FILE_MEMORY_USED,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
FILE_TASKLIST,
@@ -1321,6 +1390,9 @@ static ssize_t cpuset_common_file_write(
case FILE_MEMORY_PRESSURE_ENABLED:
retval = update_memory_pressure_enabled(cs, buffer);
break;
+ case FILE_MEMORY_MAX:
+ retval = update_memory_max_nr_pages(cs, buffer);
+ break;
case FILE_MEMORY_PRESSURE:
retval = -EACCES;
break;
@@ -1441,6 +1513,12 @@ static ssize_t cpuset_common_file_read(s
case FILE_MEMORY_PRESSURE:
s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
break;
+ case FILE_MEMORY_MAX:
+ s += sprintf(s, "%d", cs->mems_max_pages);
+ break;
+ case FILE_MEMORY_USED:
+ s += sprintf(s, "%d", cs->mems_nr_pages);
+ break;
case FILE_SPREAD_PAGE:
*s++ = is_spread_page(cs) ? '1' : '0';
break;
@@ -1785,6 +1863,16 @@ static struct cftype cft_cpu_exclusive =
.private = FILE_CPU_EXCLUSIVE,
};
+static struct cftype cft_mem_used = {
+ .name = "memory_nr_pages",
+ .private = FILE_MEMORY_USED,
+};
+
+static struct cftype cft_mem_max = {
+ .name = "memory_max_pages",
+ .private = FILE_MEMORY_MAX,
+};
+
static struct cftype cft_mem_exclusive = {
.name = "mem_exclusive",
.private = FILE_MEM_EXCLUSIVE,
@@ -1830,6 +1918,10 @@ static int cpuset_populate_dir(struct de
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_mem_max)) < 0)
+ return err;
+ if ((err = cpuset_add_file(cs_dentry, &cft_mem_used)) < 0)
+ return err;
if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
@@ -1880,6 +1972,8 @@ static long cpuset_create(struct cpuset
INIT_LIST_HEAD(&cs->sibling);
INIT_LIST_HEAD(&cs->children);
cs->mems_generation = cpuset_mems_generation++;
+ cs->mems_max_pages = parent->mems_max_pages;
+ cs->mems_nr_pages = 0;
fmeter_init(&cs->fmeter);
cs->parent = parent;
@@ -1986,6 +2080,8 @@ int __init cpuset_init_early(void)
tsk->cpuset = &top_cpuset;
tsk->cpuset->mems_generation = cpuset_mems_generation++;
+ tsk->cpuset->mems_max_pages = -1;
+ tsk->cpuset->mems_nr_pages = 0;
return 0;
}
@@ -2005,6 +2101,8 @@ int __init cpuset_init(void)
fmeter_init(&top_cpuset.fmeter);
top_cpuset.mems_generation = cpuset_mems_generation++;
+ top_cpuset.mems_max_pages = -1;
+ top_cpuset.mems_nr_pages = 0;
init_task.cpuset = &top_cpuset;
@@ -2438,7 +2536,6 @@ int cpuset_memory_pressure_enabled __rea
void __cpuset_memory_pressure_bump(void)
{
struct cpuset *cs;
-
task_lock(current);
cs = current->cpuset;
fmeter_markevent(&cs->fmeter);
diff -puN mm/page_alloc.c~challenged-memory-controller mm/page_alloc.c
--- lxc/mm/page_alloc.c~challenged-memory-controller 2006-08-14 13:24:16.000000000 -0700
+++ lxc-dave/mm/page_alloc.c 2006-08-15 07:57:13.000000000 -0700
@@ -470,6 +470,11 @@ static void free_one_page(struct zone *z
free_pages_bulk(zone, 1, &list, order);
}
+void arch_free_page(struct page *page, int order)
+{
+ cpuset_dec_nr_pages(page->cpuset, 1<<order);
+}
+
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
@@ -1020,6 +1025,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
might_sleep_if(wait);
+ if (cpuset_inc_nr_pages(current->cpuset, 1<<order, gfp_mask))
+ return NULL;
+
restart:
z = zonelist->zones; /* the list of zones suitable for gfp_mask */
@@ -1159,6 +1167,10 @@ got_pg:
if (page)
set_page_owner(page, order, gfp_mask);
#endif
+ if (page)
+ page->cpuset = current->cpuset;
+ else
+ cpuset_dec_nr_pages(current->cpuset, 1<<order);
return page;
}
diff -puN mm/rmap.c~challenged-memory-controller mm/rmap.c
--- lxc/mm/rmap.c~challenged-memory-controller 2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/mm/rmap.c 2006-08-15 08:01:26.000000000 -0700
@@ -927,3 +927,8 @@ int try_to_unmap(struct page *page, int
return ret;
}
+extern int cpuset_amount_over_memory_max(const struct cpuset *cs);
+int page_has_naughty_cpuset(struct page *page)
+{
+ return cpuset_amount_over_memory_max(page->cpuset);
+}
diff -puN mm/vmscan.c~challenged-memory-controller mm/vmscan.c
--- lxc/mm/vmscan.c~challenged-memory-controller 2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/mm/vmscan.c 2006-08-15 08:05:03.000000000 -0700
@@ -63,8 +63,9 @@ struct scan_control {
int swap_cluster_max;
int swappiness;
-
int all_unreclaimable;
+ int only_pages_with_naughty_cpusets;
+ struct cpuset *only_this_cpuset;
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -445,6 +446,10 @@ static unsigned long shrink_page_list(st
VM_BUG_ON(PageActive(page));
+ if (cpuset_amount_over_memory_max(sc->only_this_cpuset) &&
+ page->cpuset && page->cpuset != sc->only_this_cpuset) {
+ goto keep_locked;
+ }
sc->nr_scanned++;
if (!sc->may_swap && page_mapped(page))
@@ -793,9 +798,20 @@ force_reclaim_mapped:
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
+ extern int page_has_naughty_cpuset(struct page *page);
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
+ if (sc->only_this_cpuset &&
+ page->cpuset && page->cpuset != sc->only_this_cpuset) {
+ list_add(&page->lru, &l_active);
+ continue;
+ }
+ if (sc->only_pages_with_naughty_cpusets &&
+ !page_has_naughty_cpuset(page)) {
+ list_add(&page->lru, &l_active);
+ continue;
+ }
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
@@ -875,6 +891,7 @@ static unsigned long shrink_zone(int pri
unsigned long nr_inactive;
unsigned long nr_to_scan;
unsigned long nr_reclaimed = 0;
+ int nr_scans = 0;
atomic_inc(&zone->reclaim_in_progress);
@@ -897,6 +914,11 @@ static unsigned long shrink_zone(int pri
nr_inactive = 0;
while (nr_active || nr_inactive) {
+ nr_scans++;
+ if (printk_ratelimit())
+ printk("%s() scan nr: %d\n", __func__, nr_scans);
+ if (nr_scans > 20)
+ sc->only_pages_with_naughty_cpusets = 0;
if (nr_active) {
nr_to_scan = min(nr_active,
(unsigned long)sc->swap_cluster_max);
@@ -993,6 +1015,7 @@ unsigned long try_to_free_pages(struct z
.swap_cluster_max = SWAP_CLUSTER_MAX,
.may_swap = 1,
.swappiness = vm_swappiness,
+ .only_pages_with_naughty_cpusets = 1,
};
delay_swap_prefetch();
@@ -1090,6 +1113,7 @@ static unsigned long balance_pgdat(pg_da
.may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
+ .only_pages_with_naughty_cpusets = 1,
};
loop_again:
@@ -1310,7 +1334,6 @@ void wakeup_kswapd(struct zone *zone, in
wake_up_interruptible(&pgdat->kswapd_wait);
}
-#ifdef CONFIG_PM
/*
* Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
* from LRU lists system-wide, for given pass and priority, and returns the
@@ -1363,7 +1386,7 @@ static unsigned long shrink_all_zones(un
* LRU order by reclaiming preferentially
* inactive > active > active referenced > active mapped
*/
-unsigned long shrink_all_memory(unsigned long nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs)
{
unsigned long lru_pages, nr_slab;
unsigned long ret = 0;
@@ -1376,6 +1399,8 @@ unsigned long shrink_all_memory(unsigned
.swap_cluster_max = nr_pages,
.may_writepage = 1,
.swappiness = vm_swappiness,
+ .only_pages_with_naughty_cpusets = 1,
+ .only_this_cpuset = cs,
};
delay_swap_prefetch();
@@ -1462,7 +1487,6 @@ out:
return ret;
}
-#endif
#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
@@ -1568,6 +1592,7 @@ static int __zone_reclaim(struct zone *z
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
+ .only_pages_with_naughty_cpusets = 1,
};
disable_swap_token();
_
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next reply other threads:[~2006-08-15 19:21 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-08-15 19:20 dave [this message]
2006-08-15 22:07 ` Paul Jackson
2006-08-15 22:24 ` Dave Hansen
2006-08-15 22:49 ` Paul Jackson
2006-08-17 10:41 ` Balbir Singh
2006-08-17 14:47 ` Dave Hansen
2006-08-18 3:33 ` Balbir Singh
2006-08-17 16:34 ` Paul Jackson
2006-08-16 5:44 ` Matt Helsley
2006-08-16 14:00 ` Balbir Singh
2006-08-16 20:31 ` Chandra Seetharaman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20060815192047.EE4A0960@localhost.localdomain \
--to=dave@sr71.net \
--cc=balbir@in.ibm.com \
--cc=linux-mm@kvack.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox