linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: dave@sr71.net
To: linux-mm@kvack.org
Cc: balbir@in.ibm.com, dave@sr71.net
Subject: [RFC][PATCH] "challenged" memory controller
Date: Tue, 15 Aug 2006 12:20:47 -0700	[thread overview]
Message-ID: <20060815192047.EE4A0960@localhost.localdomain> (raw)

I've been toying with a little memory controller for the past
few weeks, on and off.  My goal was to create something simple
and hackish that would at least be a toy to play with in the
process of creating something that might actually be feasible.

I call it "challenged" because it has some definite limitations.
However, it only adds about 50 lines of code to generic areas
of the VM, and I haven't been the slightest bit careful, yet.
I think it probably also breaks CONFIG_PM and !CONFIG_CPUSETS,
but those are "features". ;)

It uses cpusets for now, just because they are there, and are
relatively easy to modify.  The page->cpuset bit is only
temporary, and I have some plans to remove it later.

How does it work?  It adds two fields to the scan control
structure.  One that tells the scan to only pay attention to
_any_ cpuset over its memory limits, and the other to tell it
to only scan pages for a _particular_ cpuset.

I've been pretty indiscriminately hacking away, so I have the
feeling that there are some more efficient and nicer ways to
hook into the page scanning logic.  Comments are very welcome.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
---

 lxc-dave/include/linux/cpuset.h |    4 +
 lxc-dave/include/linux/gfp.h    |    4 -
 lxc-dave/include/linux/swap.h   |    2 
 lxc-dave/kernel/cpuset.c        |   99 +++++++++++++++++++++++++++++++++++++++-
 lxc-dave/mm/page_alloc.c        |   12 ++++
 lxc-dave/mm/rmap.c              |    5 ++
 lxc-dave/mm/vmscan.c            |   33 +++++++++++--
 7 files changed, 149 insertions(+), 10 deletions(-)

diff -puN include/linux/cpuset.h~challenged-memory-controller include/linux/cpuset.h
--- lxc/include/linux/cpuset.h~challenged-memory-controller	2006-08-14 13:22:12.000000000 -0700
+++ lxc-dave/include/linux/cpuset.h	2006-08-15 07:58:15.000000000 -0700
@@ -127,5 +127,9 @@ static inline int cpuset_do_slab_mem_spr
 }
 
 #endif /* !CONFIG_CPUSETS */
+int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask);
+void cpuset_dec_nr_pages(struct cpuset *cs, int nr);
+int cpuset_get_nr_pages(const struct cpuset *cs);
+int cpuset_amount_over_memory_max(const struct cpuset *cs);
 
 #endif /* _LINUX_CPUSET_H */
diff -puN include/linux/gfp.h~challenged-memory-controller include/linux/gfp.h
--- lxc/include/linux/gfp.h~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/include/linux/gfp.h	2006-08-15 07:47:34.000000000 -0700
@@ -108,10 +108,6 @@ static inline enum zone_type gfp_zone(gf
  * optimized to &contig_page_data at compile-time.
  */
 
-#ifndef HAVE_ARCH_FREE_PAGE
-static inline void arch_free_page(struct page *page, int order) { }
-#endif
-
 extern struct page *
 FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
 
diff -puN include/linux/mm.h~challenged-memory-controller include/linux/mm.h
diff -puN include/linux/sched.h~challenged-memory-controller include/linux/sched.h
diff -puN include/linux/swap.h~challenged-memory-controller include/linux/swap.h
--- lxc/include/linux/swap.h~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/include/linux/swap.h	2006-08-15 07:47:34.000000000 -0700
@@ -188,7 +188,7 @@ extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zone **, gfp_t);
-extern unsigned long shrink_all_memory(unsigned long nr_pages);
+extern unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern long vm_total_pages;
diff -puN kernel/cpuset.c~challenged-memory-controller kernel/cpuset.c
--- lxc/kernel/cpuset.c~challenged-memory-controller	2006-08-14 13:22:12.000000000 -0700
+++ lxc-dave/kernel/cpuset.c	2006-08-15 08:00:40.000000000 -0700
@@ -21,6 +21,7 @@
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/cpuset.h>
+#include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/file.h>
@@ -46,6 +47,7 @@
 #include <linux/spinlock.h>
 #include <linux/stat.h>
 #include <linux/string.h>
+#include <linux/swap.h>
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
@@ -97,6 +99,8 @@ struct cpuset {
 	 * recent time this cpuset changed its mems_allowed.
 	 */
 	int mems_generation;
+	int mems_nr_pages;
+	int mems_max_pages;
 
 	struct fmeter fmeter;		/* memory_pressure filter */
 };
@@ -112,6 +116,55 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+int shrink_cpuset(struct cpuset *cs, gfp_t gfpmask, int tries)
+{
+	int nr_shrunk = 0;
+
+	/* Ask for 10 pages per pass until under the limit or out of tries. */
+	while (cpuset_amount_over_memory_max(cs) && tries-- > 0)
+		nr_shrunk += shrink_all_memory(10, cs);
+	/* Return the accumulated reclaim count; gfpmask is not used yet. */
+	return nr_shrunk;
+}
+
+int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask)
+{
+	if (!cs)
+		return 0;
+	cs->mems_nr_pages += nr;	/* charge first, then check the limit */
+	if (!cpuset_amount_over_memory_max(cs))
+		return 0;
+	/* Over the limit: try to reclaim, but only when __GFP_WAIT is set. */
+	if (gfpmask & __GFP_WAIT)
+		shrink_cpuset(cs, gfpmask, 50);
+	if (!cpuset_amount_over_memory_max(cs))
+		return 0;
+	cs->mems_nr_pages -= nr;	/* failing: undo the charge taken above */
+	return -ENOMEM;
+}
+/* Uncharge nr pages from cs; a NULL cs is silently ignored. */
+void cpuset_dec_nr_pages(struct cpuset *cs, int nr)
+{
+	if (cs)
+		cs->mems_nr_pages -= nr;
+}
+int cpuset_get_nr_pages(const struct cpuset *cs)
+{
+	return cs ? cs->mems_nr_pages : 0;	/* NULL cs reads as 0 pages */
+}
+/* Pages by which cs exceeds its limit; 0 if NULL, unlimited, or under. */
+int cpuset_amount_over_memory_max(const struct cpuset *cs)
+{
+	int excess;
+
+	/* A negative mems_max_pages is treated as "no limit". */
+	if (!cs || cs->mems_max_pages < 0)
+		return 0;
+	excess = cs->mems_nr_pages - cs->mems_max_pages;
+	return excess > 0 ? excess : 0;
+}
+
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -173,6 +226,8 @@ static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 	.cpus_allowed = CPU_MASK_ALL,
 	.mems_allowed = NODE_MASK_ALL,
+	.mems_nr_pages = 0,
+	.mems_max_pages = -1,
 	.count = ATOMIC_INIT(0),
 	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
 	.children = LIST_HEAD_INIT(top_cpuset.children),
@@ -1021,6 +1076,17 @@ static int update_memory_pressure_enable
 	return 0;
 }
 
+static int update_memory_max_nr_pages(struct cpuset *cs, char *buf)
+{
+	int passes = 10;	/* bound the reclaim work done for one write */
+
+	cs->mems_max_pages = simple_strtol(buf, NULL, 10);
+	/* Best effort: a cpuset that cannot be shrunk must not hang the writer. */
+	while (cpuset_amount_over_memory_max(cs) && passes-- > 0)
+		shrink_cpuset(cs, 0, 10);
+	return 0;
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -1109,6 +1175,7 @@ static int update_flag(cpuset_flagbits_t
  */
 
 #define FM_COEF 933		/* coefficient for half-life of 10 secs */
+#define FM_COEF 93		/* XXX: redefines FM_COEF (933 above), so this is not the 10 sec half-life value; looks like a debugging leftover - TODO confirm and drop */
 #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
 #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
 #define FM_SCALE 1000		/* faux fixed point scale */
@@ -1263,6 +1330,8 @@ typedef enum {
 	FILE_NOTIFY_ON_RELEASE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
+	FILE_MEMORY_MAX,
+	FILE_MEMORY_USED,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
 	FILE_TASKLIST,
@@ -1321,6 +1390,9 @@ static ssize_t cpuset_common_file_write(
 	case FILE_MEMORY_PRESSURE_ENABLED:
 		retval = update_memory_pressure_enabled(cs, buffer);
 		break;
+	case FILE_MEMORY_MAX:
+		retval = update_memory_max_nr_pages(cs, buffer);
+		break;
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
@@ -1441,6 +1513,12 @@ static ssize_t cpuset_common_file_read(s
 	case FILE_MEMORY_PRESSURE:
 		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
 		break;
+	case FILE_MEMORY_MAX:
+		s += sprintf(s, "%d", cs->mems_max_pages);
+		break;
+	case FILE_MEMORY_USED:
+		s += sprintf(s, "%d", cs->mems_nr_pages);
+		break;
 	case FILE_SPREAD_PAGE:
 		*s++ = is_spread_page(cs) ? '1' : '0';
 		break;
@@ -1785,6 +1863,16 @@ static struct cftype cft_cpu_exclusive =
 	.private = FILE_CPU_EXCLUSIVE,
 };
 
+static struct cftype cft_mem_used = {
+	.name = "memory_nr_pages",	/* reads back cs->mems_nr_pages */
+	.private = FILE_MEMORY_USED,
+};
+
+static struct cftype cft_mem_max = {
+	.name = "memory_max_pages",	/* reads cs->mems_max_pages; writes go through update_memory_max_nr_pages() */
+	.private = FILE_MEMORY_MAX,
+};
+
 static struct cftype cft_mem_exclusive = {
 	.name = "mem_exclusive",
 	.private = FILE_MEM_EXCLUSIVE,
@@ -1830,6 +1918,10 @@ static int cpuset_populate_dir(struct de
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_mem_max)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_mem_used)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
@@ -1880,6 +1972,8 @@ static long cpuset_create(struct cpuset 
 	INIT_LIST_HEAD(&cs->sibling);
 	INIT_LIST_HEAD(&cs->children);
 	cs->mems_generation = cpuset_mems_generation++;
+	cs->mems_max_pages = parent->mems_max_pages;
+	cs->mems_nr_pages = 0;
 	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
@@ -1986,6 +2080,8 @@ int __init cpuset_init_early(void)
 
 	tsk->cpuset = &top_cpuset;
 	tsk->cpuset->mems_generation = cpuset_mems_generation++;
+	tsk->cpuset->mems_max_pages = -1;
+	tsk->cpuset->mems_nr_pages = 0;
 	return 0;
 }
 
@@ -2005,6 +2101,8 @@ int __init cpuset_init(void)
 
 	fmeter_init(&top_cpuset.fmeter);
 	top_cpuset.mems_generation = cpuset_mems_generation++;
+	top_cpuset.mems_max_pages = -1;
+	top_cpuset.mems_nr_pages = 0;
 
 	init_task.cpuset = &top_cpuset;
 
@@ -2438,7 +2536,6 @@ int cpuset_memory_pressure_enabled __rea
 void __cpuset_memory_pressure_bump(void)
 {
 	struct cpuset *cs;
-
 	task_lock(current);
 	cs = current->cpuset;
 	fmeter_markevent(&cs->fmeter);
diff -puN mm/page_alloc.c~challenged-memory-controller mm/page_alloc.c
--- lxc/mm/page_alloc.c~challenged-memory-controller	2006-08-14 13:24:16.000000000 -0700
+++ lxc-dave/mm/page_alloc.c	2006-08-15 07:57:13.000000000 -0700
@@ -470,6 +470,11 @@ static void free_one_page(struct zone *z
 	free_pages_bulk(zone, 1, &list, order);
 }
 
+void arch_free_page(struct page *page, int order)
+{
+	cpuset_dec_nr_pages(page->cpuset, 1<<order);	/* uncharge all 1<<order pages from the cpuset stored in the page */
+}
+
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
@@ -1020,6 +1025,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
 
 	might_sleep_if(wait);
 
+	if (cpuset_inc_nr_pages(current->cpuset, 1<<order, gfp_mask))
+		return NULL;
+
 restart:
 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
@@ -1159,6 +1167,10 @@ got_pg:
 	if (page)
 		set_page_owner(page, order, gfp_mask);
 #endif
+	if (page)
+		page->cpuset = current->cpuset;
+	else
+		cpuset_dec_nr_pages(current->cpuset, 1<<order);
 	return page;
 }
 
diff -puN mm/rmap.c~challenged-memory-controller mm/rmap.c
--- lxc/mm/rmap.c~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/mm/rmap.c	2006-08-15 08:01:26.000000000 -0700
@@ -927,3 +927,8 @@ int try_to_unmap(struct page *page, int 
 	return ret;
 }
 
+extern int cpuset_amount_over_memory_max(const struct cpuset *cs);
+int page_has_naughty_cpuset(struct page *page)
+{
+	return cpuset_amount_over_memory_max(page->cpuset);	/* nonzero (the overage in pages) when the page's cpuset is over its limit */
+}
diff -puN mm/vmscan.c~challenged-memory-controller mm/vmscan.c
--- lxc/mm/vmscan.c~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
+++ lxc-dave/mm/vmscan.c	2006-08-15 08:05:03.000000000 -0700
@@ -63,8 +63,9 @@ struct scan_control {
 	int swap_cluster_max;
 
 	int swappiness;
-
 	int all_unreclaimable;
+	int only_pages_with_naughty_cpusets;
+	struct cpuset *only_this_cpuset;
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -445,6 +446,10 @@ static unsigned long shrink_page_list(st
 
 		VM_BUG_ON(PageActive(page));
 
+		if (cpuset_amount_over_memory_max(sc->only_this_cpuset) &&
+		    page->cpuset && page->cpuset != sc->only_this_cpuset) {
+			goto keep_locked;
+		}
 		sc->nr_scanned++;
 
 		if (!sc->may_swap && page_mapped(page))
@@ -793,9 +798,20 @@ force_reclaim_mapped:
 	spin_unlock_irq(&zone->lru_lock);
 
 	while (!list_empty(&l_hold)) {
+		extern int page_has_naughty_cpuset(struct page *page);
 		cond_resched();
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
+		if (sc->only_this_cpuset &&
+		    page->cpuset && page->cpuset != sc->only_this_cpuset) {
+			list_add(&page->lru, &l_active);
+			continue;
+		}
+		if (sc->only_pages_with_naughty_cpusets &&
+		    !page_has_naughty_cpuset(page)) {
+			list_add(&page->lru, &l_active);
+			continue;
+		}
 		if (page_mapped(page)) {
 			if (!reclaim_mapped ||
 			    (total_swap_pages == 0 && PageAnon(page)) ||
@@ -875,6 +891,7 @@ static unsigned long shrink_zone(int pri
 	unsigned long nr_inactive;
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
+	int nr_scans = 0;
 
 	atomic_inc(&zone->reclaim_in_progress);
 
@@ -897,6 +914,11 @@ static unsigned long shrink_zone(int pri
 		nr_inactive = 0;
 
 	while (nr_active || nr_inactive) {
+		nr_scans++;
+		if (printk_ratelimit())
+			printk("%s() scan nr: %d\n", __func__, nr_scans);
+		if (nr_scans > 20)
+			sc->only_pages_with_naughty_cpusets = 0;
 		if (nr_active) {
 			nr_to_scan = min(nr_active,
 					(unsigned long)sc->swap_cluster_max);
@@ -993,6 +1015,7 @@ unsigned long try_to_free_pages(struct z
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
 		.swappiness = vm_swappiness,
+		.only_pages_with_naughty_cpusets = 1,
 	};
 
 	delay_swap_prefetch();
@@ -1090,6 +1113,7 @@ static unsigned long balance_pgdat(pg_da
 		.may_swap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
+		.only_pages_with_naughty_cpusets = 1,
 	};
 
 loop_again:
@@ -1310,7 +1334,6 @@ void wakeup_kswapd(struct zone *zone, in
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
-#ifdef CONFIG_PM
 /*
  * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
  * from LRU lists system-wide, for given pass and priority, and returns the
@@ -1363,7 +1386,7 @@ static unsigned long shrink_all_zones(un
  * LRU order by reclaiming preferentially
  * inactive > active > active referenced > active mapped
  */
-unsigned long shrink_all_memory(unsigned long nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs)
 {
 	unsigned long lru_pages, nr_slab;
 	unsigned long ret = 0;
@@ -1376,6 +1399,8 @@ unsigned long shrink_all_memory(unsigned
 		.swap_cluster_max = nr_pages,
 		.may_writepage = 1,
 		.swappiness = vm_swappiness,
+		.only_pages_with_naughty_cpusets = 1,
+		.only_this_cpuset = cs,
 	};
 
 	delay_swap_prefetch();
@@ -1462,7 +1487,6 @@ out:
 
 	return ret;
 }
-#endif
 
 #ifdef CONFIG_HOTPLUG_CPU
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
@@ -1568,6 +1592,7 @@ static int __zone_reclaim(struct zone *z
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
+		.only_pages_with_naughty_cpusets = 1,
 	};
 
 	disable_swap_token();
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

             reply	other threads:[~2006-08-15 19:21 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-08-15 19:20 dave [this message]
2006-08-15 22:07 ` Paul Jackson
2006-08-15 22:24   ` Dave Hansen
2006-08-15 22:49     ` Paul Jackson
2006-08-17 10:41   ` Balbir Singh
2006-08-17 14:47     ` Dave Hansen
2006-08-18  3:33       ` Balbir Singh
2006-08-17 16:34     ` Paul Jackson
2006-08-16  5:44 ` Matt Helsley
2006-08-16 14:00 ` Balbir Singh
2006-08-16 20:31 ` Chandra Seetharaman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060815192047.EE4A0960@localhost.localdomain \
    --to=dave@sr71.net \
    --cc=balbir@in.ibm.com \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox