* [PATCH 1/6] CKRM: Basic changes to the core kernel
@ 2005-04-02 3:12 Chandra Seetharaman
2005-04-04 13:45 ` Dave Hansen
0 siblings, 1 reply; 9+ messages in thread
From: Chandra Seetharaman @ 2005-04-02 3:12 UTC (permalink / raw)
To: ckrm-tech, linux-mm
[-- Attachment #1: Type: text/plain, Size: 287 bytes --]
--
----------------------------------------------------------------------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
----------------------------------------------------------------------
[-- Attachment #2: 11-01-mem_base_changes --]
[-- Type: text/plain, Size: 10454 bytes --]
Patch 1 of 6 patches to support memory controller under CKRM framework.
This patch has the basic changes needed to get the hooks in the appropriate
kernel functions to get control in the controller.
fs/exec.c | 2 +
include/linux/ckrm_mem_inline.h | 67 ++++++++++++++++++++++++++++++++++++++++
include/linux/mm_inline.h | 7 ++++
include/linux/page-flags.h | 7 ++++
include/linux/sched.h | 8 ++++
init/Kconfig | 9 +++++
kernel/exit.c | 2 +
kernel/fork.c | 6 +++
mm/page_alloc.c | 6 +++
9 files changed, 114 insertions(+)
Index: linux-2.6.12-rc1/fs/exec.c
===================================================================
--- linux-2.6.12-rc1.orig/fs/exec.c
+++ linux-2.6.12-rc1/fs/exec.c
@@ -49,6 +49,7 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/ckrm_events.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -574,6 +575,7 @@ static int exec_mmap(struct mm_struct *m
activate_mm(active_mm, mm);
task_unlock(tsk);
arch_pick_mmap_layout(mm);
+ ckrm_task_mm_change(tsk, old_mm, mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
if (active_mm != old_mm) BUG();
Index: linux-2.6.12-rc1/include/linux/ckrm_mem_inline.h
===================================================================
--- /dev/null
+++ linux-2.6.12-rc1/include/linux/ckrm_mem_inline.h
@@ -0,0 +1,67 @@
+/* include/linux/ckrm_mem_inline.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ * (C) Shailabh Nagar, IBM Corp. 2003
+ * (C) Chandra Seetharaman, IBM Corp. 2004
+ *
+ *
+ * Memory control functions of the CKRM kernel API
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _LINUX_CKRM_MEM_INLINE_H_
+#define _LINUX_CKRM_MEM_INLINE_H_
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#error "Memory controller for CKRM is not available."
+
+#else
+
+static inline void
+ckrm_task_mm_init(struct task_struct *tsk)
+{
+}
+
+static inline void
+ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task)
+{
+}
+
+static inline void
+ckrm_task_mm_change(struct task_struct *tsk,
+ struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+}
+
+static inline void
+ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm)
+{
+}
+
+static inline void
+ckrm_mm_init(struct mm_struct *mm)
+{
+}
+
+/* using #define instead of static inline as the prototype requires *
+ * data structures that is available only with the controller enabled */
+#define ckrm_mm_setclass(a, b) do { } while(0)
+#define ckrm_class_limit_ok(a) (1)
+
+static inline void ckrm_mem_inc_active(struct page *p) {}
+static inline void ckrm_mem_dec_active(struct page *p) {}
+static inline void ckrm_mem_inc_inactive(struct page *p) {}
+static inline void ckrm_mem_dec_inactive(struct page *p) {}
+static inline void ckrm_page_init(struct page *p) {}
+static inline void ckrm_clear_page_class(struct page *p) {}
+
+#endif
+#endif /* _LINUX_CKRM_MEM_INLINE_H_ */
Index: linux-2.6.12-rc1/include/linux/mm_inline.h
===================================================================
--- linux-2.6.12-rc1.orig/include/linux/mm_inline.h
+++ linux-2.6.12-rc1/include/linux/mm_inline.h
@@ -1,9 +1,11 @@
+#include <linux/ckrm_mem_inline.h>
static inline void
add_page_to_active_list(struct zone *zone, struct page *page)
{
list_add(&page->lru, &zone->active_list);
zone->nr_active++;
+ ckrm_mem_inc_active(page);
}
static inline void
@@ -11,6 +13,7 @@ add_page_to_inactive_list(struct zone *z
{
list_add(&page->lru, &zone->inactive_list);
zone->nr_inactive++;
+ ckrm_mem_inc_inactive(page);
}
static inline void
@@ -18,6 +21,7 @@ del_page_from_active_list(struct zone *z
{
list_del(&page->lru);
zone->nr_active--;
+ ckrm_mem_dec_active(page);
}
static inline void
@@ -25,6 +29,7 @@ del_page_from_inactive_list(struct zone
{
list_del(&page->lru);
zone->nr_inactive--;
+ ckrm_mem_dec_inactive(page);
}
static inline void
@@ -34,7 +39,9 @@ del_page_from_lru(struct zone *zone, str
if (PageActive(page)) {
ClearPageActive(page);
zone->nr_active--;
+ ckrm_mem_dec_active(page);
} else {
zone->nr_inactive--;
+ ckrm_mem_dec_inactive(page);
}
}
Index: linux-2.6.12-rc1/include/linux/page-flags.h
===================================================================
--- linux-2.6.12-rc1.orig/include/linux/page-flags.h
+++ linux-2.6.12-rc1/include/linux/page-flags.h
@@ -76,6 +76,7 @@
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_nosave_free 19 /* Free, should not be written */
#define PG_uncached 20 /* Page has been mapped as uncached */
+#define PG_ckrm_account 21 /* CKRM accounting */
/*
* Global page accounting. One instance per CPU. Only unsigned longs are
@@ -305,6 +306,12 @@ extern void __mod_page_state(unsigned of
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
+#ifdef CONFIG_CKRM_RES_MEM
+#define PageCkrmAccount(page) test_bit(PG_ckrm_account, &(page)->flags)
+#define SetPageCkrmAccount(page) set_bit(PG_ckrm_account, &(page)->flags)
+#define ClearPageCkrmAccount(page) clear_bit(PG_ckrm_account, &(page)->flags)
+#endif
+
struct page; /* forward declaration */
int test_clear_page_dirty(struct page *page);
Index: linux-2.6.12-rc1/include/linux/sched.h
===================================================================
--- linux-2.6.12-rc1.orig/include/linux/sched.h
+++ linux-2.6.12-rc1/include/linux/sched.h
@@ -258,6 +258,11 @@ struct mm_struct {
unsigned long hiwater_rss; /* High-water RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
+#ifdef CONFIG_CKRM_RES_MEM
+ struct ckrm_mem_res *memclass;
+ struct list_head tasklist; /* tasks sharing this address space */
+ spinlock_t peertask_lock; /* protect tasklist above */
+#endif
};
struct sighand_struct {
@@ -735,6 +740,9 @@ struct task_struct {
struct ckrm_task_class *taskclass;
struct list_head taskclass_link;
#endif /* CONFIG_CKRM_TYPE_TASKCLASS */
+#ifdef CONFIG_CKRM_RES_MEM
+ struct list_head mm_peers; /* list of tasks using same mm_struct */
+#endif
#endif /* CONFIG_CKRM */
#ifdef CONFIG_DELAY_ACCT
struct task_delay_info delays;
Index: linux-2.6.12-rc1/init/Kconfig
===================================================================
--- linux-2.6.12-rc1.orig/init/Kconfig
+++ linux-2.6.12-rc1/init/Kconfig
@@ -174,6 +174,15 @@ config CKRM_TYPE_TASKCLASS
Say Y if unsure
+config CKRM_RES_MEM
+ bool "Class based physical memory controller"
+ default y
+ depends on CKRM_TYPE_TASKCLASS
+ help
+ Provide the basic support for collecting physical memory usage
+ information among classes. Say Y if you want to know the memory
+ usage of each class.
+
config CKRM_TYPE_SOCKETCLASS
bool "Class Manager for socket groups"
depends on CKRM && RCFS_FS
Index: linux-2.6.12-rc1/kernel/exit.c
===================================================================
--- linux-2.6.12-rc1.orig/kernel/exit.c
+++ linux-2.6.12-rc1/kernel/exit.c
@@ -28,6 +28,7 @@
#include <linux/cpuset.h>
#include <linux/syscalls.h>
#include <linux/ckrm_events.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -505,6 +506,7 @@ void exit_mm(struct task_struct * tsk)
task_lock(tsk);
tsk->mm = NULL;
up_read(&mm->mmap_sem);
+ ckrm_task_mm_clear(tsk, mm);
enter_lazy_tlb(mm, current);
task_unlock(tsk);
mmput(mm);
Index: linux-2.6.12-rc1/kernel/fork.c
===================================================================
--- linux-2.6.12-rc1.orig/kernel/fork.c
+++ linux-2.6.12-rc1/kernel/fork.c
@@ -44,6 +44,7 @@
#include <linux/ckrm_events.h>
#include <linux/ckrm_tsk.h>
#include <linux/ckrm_tc.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -178,6 +179,7 @@ static struct task_struct *dup_task_stru
ti->task = tsk;
ckrm_cb_newtask(tsk);
+ ckrm_task_mm_init(tsk);
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
return tsk;
@@ -326,6 +328,7 @@ static struct mm_struct * mm_init(struct
mm->ioctx_list = NULL;
mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
mm->free_area_cache = TASK_UNMAPPED_BASE;
+ ckrm_mm_init(mm);
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -346,6 +349,7 @@ struct mm_struct * mm_alloc(void)
if (mm) {
memset(mm, 0, sizeof(*mm));
mm = mm_init(mm);
+ ckrm_mm_setclass(mm, ckrm_get_mem_class(current));
}
return mm;
}
@@ -502,6 +506,8 @@ static int copy_mm(unsigned long clone_f
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
+ ckrm_mm_setclass(mm, oldmm->memclass);
+ ckrm_task_mm_set(mm, tsk);
return 0;
free_pt:
Index: linux-2.6.12-rc1/mm/page_alloc.c
===================================================================
--- linux-2.6.12-rc1.orig/mm/page_alloc.c
+++ linux-2.6.12-rc1/mm/page_alloc.c
@@ -34,6 +34,7 @@
#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -355,6 +356,7 @@ free_pages_bulk(struct zone *zone, int c
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
__free_pages_bulk(page, zone, order);
+ ckrm_clear_page_class(page);
ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -454,6 +456,7 @@ static void prep_new_page(struct page *p
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked | 1 << PG_mappedtodisk);
page->private = 0;
+ ckrm_page_init(page);
set_page_refs(page, order);
kernel_map_pages(page, 1 << order, 1);
}
@@ -749,6 +752,9 @@ __alloc_pages(unsigned int gfp_mask, uns
*/
can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+ if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p)))
+ return NULL;
+
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
if (unlikely(zones[0] == NULL)) {
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/6] CKRM: Basic changes to the core kernel
2005-04-02 3:12 [PATCH 1/6] CKRM: Basic changes to the core kernel Chandra Seetharaman
@ 2005-04-04 13:45 ` Dave Hansen
2005-04-05 17:25 ` Chandra Seetharaman
0 siblings, 1 reply; 9+ messages in thread
From: Dave Hansen @ 2005-04-04 13:45 UTC (permalink / raw)
To: Chandra Seetharaman; +Cc: ckrm-tech, linux-mm
> static inline void
> add_page_to_active_list(struct zone *zone, struct page *page)
> {
> list_add(&page->lru, &zone->active_list);
> zone->nr_active++;
> + ckrm_mem_inc_active(page);
> }
Are any of the current zone statistics used any more when this is
compiled in?
Also, why does everything have to say ckrm_* on it? What if somebody
else comes along and wants to use the same functions to do some other
kind of accounting?
I think names like this are plenty long and descriptive enough:
mem_inc_active(page);
clear_page_class(page);
set_page_class(...);
I'd drop the "ckrm_".
> +#define PG_ckrm_account 21 /* CKRM accounting */
Are you sure you really need this bit *and* a whole new pointer in
'struct page'? We already do some tricks with ->mapping so that we can
tell what is stored in it. You could easily do something with the low
bit of your new structure member.
> @@ -355,6 +356,7 @@ free_pages_bulk(struct zone *zone, int c
> /* have to delete it as __free_pages_bulk list manipulates */
> list_del(&page->lru);
> __free_pages_bulk(page, zone, order);
> + ckrm_clear_page_class(page);
> ret++;
> }
> spin_unlock_irqrestore(&zone->lock, flags);
When your option is on, how costly is the addition of code, here? How
much does it hurt the microbenchmarks? How much larger does it
make .text?
> + if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p)))
> + return NULL;
ckrm_class_limit_ok() is called later on in the same hot path, and
there's a for loop in there over each zone. How expensive is this on
SGI's machines? What about an 8-node x44[05]? Why can't you call it
from interrupts?
-- Dave
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/6] CKRM: Basic changes to the core kernel
2005-04-04 13:45 ` Dave Hansen
@ 2005-04-05 17:25 ` Chandra Seetharaman
2005-04-05 17:54 ` Dave Hansen
0 siblings, 1 reply; 9+ messages in thread
From: Chandra Seetharaman @ 2005-04-05 17:25 UTC (permalink / raw)
To: Dave Hansen; +Cc: ckrm-tech, linux-mm
On Mon, Apr 04, 2005 at 06:45:13AM -0700, Dave Hansen wrote:
> > static inline void
> > add_page_to_active_list(struct zone *zone, struct page *page)
> > {
> > list_add(&page->lru, &zone->active_list);
> > zone->nr_active++;
> > + ckrm_mem_inc_active(page);
> > }
>
> Are any of the current zone statistics used any more when this is
> compiled in?
They are being used. The reason I left them is that if you want those
statistics with just ckrm info, we need to go thru all the defined classes,
which might be costly, depending on the number of classes.
>
> Also, why does everything have to say ckrm_* on it? What if somebody
> else comes along and wants to use the same functions to do some other
> kind of accounting?
>
> I think names like this are plenty long and descriptive enough:
>
> mem_inc_active(page);
> clear_page_class(page);
> set_page_class(...);
>
> I'd drop the "ckrm_".
Because we got some review comments to keep it that way :).... Currently
they do ckrm specific things. In future if that changes, we can change the
name too.
>
> > +#define PG_ckrm_account 21 /* CKRM accounting */
>
> Are you sure you really need this bit *and* a whole new pointer in
> 'struct page'? We already do some tricks with ->mapping so that we can
> tell what is stored in it. You could easily do something with the low
> bit of your new structure member.
I think I can avoid using the bit. The problem with having a pointer in page
data structure is two-fold:
1. goes over the page-cache (I ran cache-bench with mem controller
enabled, and didn't see much of a difference. will post the
new results sometime soon)
2. additional memory used, especially in large systems
Using the mapping logic, we can avoid problem (1), but increase problem (2)
with added complexity and run-time logic. I am looking for a way to avoid both
the problems, any help appreciated.
>
> > @@ -355,6 +356,7 @@ free_pages_bulk(struct zone *zone, int c
> > /* have to delete it as __free_pages_bulk list manipulates */
> > list_del(&page->lru);
> > __free_pages_bulk(page, zone, order);
> > + ckrm_clear_page_class(page);
> > ret++;
> > }
> > spin_unlock_irqrestore(&zone->lock, flags);
>
> When your option is on, how costly is the addition of code, here? How
> much does it hurt the microbenchmarks? How much larger does it
As I said earlier cache-bench doesn't show much effect. Will post that and
other results sometime soon.
> make .text?
------------------ 2612-rc1.... no memory controller patch applied
vmlinux-nomem: file format elf32-i386
Sections:
Idx Name Size VMA LMA File off Algn
0 .text 002455d5 c0100000 c0100000 00001000 2**4
CONTENTS, ALLOC, LOAD, READONLY, CODE
------------------ 2612-rc1.... mem ctlr patch applied, config turned off
vmlinux-mem_out: file format elf32-i386
Sections:
Idx Name Size VMA LMA File off Algn
0 .text 00245575 c0100000 c0100000 00001000 2**4
CONTENTS, ALLOC, LOAD, READONLY, CODE
------------------ 2612-rc1.... mem ctlr patch applied, config turned on
vmlinux-mem_in: file format elf32-i386
Sections:
Idx Name Size VMA LMA File off Algn
0 .text 00248195 c0100000 c0100000 00001000 2**4
CONTENTS, ALLOC, LOAD, READONLY, CODE
------------------
>
> > + if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p)))
> > + return NULL;
>
> ckrm_class_limit_ok() is called later on in the same hot path, and
> there's a for loop in there over each zone. How expensive is this on
It doesn't get into the for loop unless the class is over the limit(which
is not a frequent event). Also, the loop is just to wakeup kswapd once..
may be I can get rid of that and use pgdat_list directly.
> SGI's machines? What about an 8-node x44[05]? Why can't you call it
> from interrupts?
I just wanted to avoid limit related failures in interrupt context, as it
might lead to weird problems.
>
> -- Dave
>
--
----------------------------------------------------------------------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
----------------------------------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/6] CKRM: Basic changes to the core kernel
2005-04-05 17:25 ` Chandra Seetharaman
@ 2005-04-05 17:54 ` Dave Hansen
2005-04-05 18:22 ` Chandra Seetharaman
0 siblings, 1 reply; 9+ messages in thread
From: Dave Hansen @ 2005-04-05 17:54 UTC (permalink / raw)
To: Chandra Seetharaman; +Cc: ckrm-tech, linux-mm
On Tue, 2005-04-05 at 10:25 -0700, Chandra Seetharaman wrote:
> > > +#define PG_ckrm_account 21 /* CKRM accounting */
> >
> > Are you sure you really need this bit *and* a whole new pointer in
> > 'struct page'? We already do some tricks with ->mapping so that we can
> > tell what is stored in it. You could easily do something with the low
> > bit of your new structure member.
>
> I think I canavoid using the bit. The problem with having a pointer in page
> data structure is two-fold:
> 1. goes over the page-cahe (I ran cache-bench with mem controller
> enabled, and didn't see much of a difference. will post the
> new results sometime soon)
> 2. additional memory used, especially in large systems
>
> Using the mapping logic, we can avoid problem (1), but increase problem (2)
> with added complexity and run-time logic. I am looking a way to avoid both
> the problems, any help appreciated.
First of all, why do you need to track individual pages? Seems a little
bit silly to charge the first user of something like a commonly-mapped
library for all users.
For instance, when you have your super-partitioned-CKRMed-eWLM-apache
server, doesn't the first class to execute apache get charged for all of
the pages in the executable and the libraries? Won't any subsequent
user classes get it "for free"? Perhaps tracking which classes have
mapped pages and sharing the cost among them is a more reasonable
measurement.
If you find a way to track things based on files, you could keep your
class pointers in the struct address_space, or even in the vma,
depending on what behavior you want. You could keep anonymous stuff in
the anon_vma, just like the objrmap code.
> > > @@ -355,6 +356,7 @@ free_pages_bulk(struct zone *zone, int c
> > > /* have to delete it as __free_pages_bulk list manipulates */
> > > list_del(&page->lru);
> > > __free_pages_bulk(page, zone, order);
> > > + ckrm_clear_page_class(page);
> > > ret++;
> > > }
> > > spin_unlock_irqrestore(&zone->lock, flags);
> >
> > When your option is on, how costly is the addition of code, here? How
> > much does it hurt the microbenchmarks? How much larger does it
>
> As I said earlier cache-bench doesn't show much effect. Will post that and
> other results sometime soon.
...
Looks like only 3k.
> > > + if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p)))
> > > + return NULL;
> >
> > ckrm_class_limit_ok() is called later on in the same hot path, and
> > there's a for loop in there over each zone. How expensive is this on
>
> It doesn't get into the for loop unless the class is over the limit(which
> is not a frequent event)
... if the class is behaving itself. Somebody trying to take down a
machine, or a single badly-behaved or runaway app might not behave like
that.
> Also, the loop is just to wakeup kswapd once..
> may be I can get rid of that and use pgdat_list directly.
I'd try to be a little more selective than a big for loop like that.
> > SGI's machines? What about an 8-node x44[05]? Why can't you call it
> > from interrupts?
>
> I just wanted to avoid limit related failures in interrupt context, as it
> might lead to weird problems.
You mean you didn't want to make your code robust enough to handle it?
Is there something fundamental keeping you from checking limits when in
an interrupt?
-- Dave
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/6] CKRM: Basic changes to the core kernel
2005-04-05 17:54 ` Dave Hansen
@ 2005-04-05 18:22 ` Chandra Seetharaman
2005-04-05 18:57 ` Dave Hansen
0 siblings, 1 reply; 9+ messages in thread
From: Chandra Seetharaman @ 2005-04-05 18:22 UTC (permalink / raw)
To: Dave Hansen; +Cc: ckrm-tech, linux-mm
On Tue, Apr 05, 2005 at 10:54:20AM -0700, Dave Hansen wrote:
> On Tue, 2005-04-05 at 10:25 -0700, Chandra Seetharaman wrote:
> First of all, why do you need to track individual pages? Seems a little
> bit silly to charge the first user of something like a commonly-mapped
> library for all users.
>
> For instance, when you have your super-partitioned-CKRMed-eWLM-apache
> server, doesn't the first class to execute apache get charged for all of
> the pages in the executable and the libraries? Won't any subsequent
> user classes get it "for free"? Perhaps tracking which classes have
> mapped pages and sharing the cost among them is a more reasonable
> measurement.
>
> If you find a way to track things based on files, you could keep your
> class pointers in the struct address_space, or even in the vma,
> depending on what behavior you want. You could keep anonymous stuff in
> the anon_vma, just like the objrmap code.
This is the first version of memory controller... Handling shared pages
appropriately are in the plans.
>
> > > > @@ -355,6 +356,7 @@ free_pages_bulk(struct zone *zone, int c
> > > > /* have to delete it as __free_pages_bulk list manipulates */
> > > > list_del(&page->lru);
> > > > __free_pages_bulk(page, zone, order);
> > > > + ckrm_clear_page_class(page);
> > > > ret++;
> > > > }
> > > > spin_unlock_irqrestore(&zone->lock, flags);
> > >
> > > When your option is on, how costly is the addition of code, here? How
> > > much does it hurt the microbenchmarks? How much larger does it
> >
> > As I said earlier cache-bench doesn't show much effect. Will post that and
> > other results sometime soon.
> ...
>
> Looks like only 3k.
>
> > > > + if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p)))
> > > > + return NULL;
> > >
> > > ckrm_class_limit_ok() is called later on in the same hot path, and
> > > there's a for loop in there over each zone. How expensive is this on
> >
> > It doesn't get into the for loop unless the class is over the limit(which
> > is not a frequent event)
>
> ... if the class is behaving itself. Somebody trying to take down a
> machine, or a single badly-behaved or runaway app might not behave like
> that.
There are checks in that code to make sure that a runaway app doesn't
get the kernel into this code path often and bring down the system...
instead the runaway app(its class) is penalised.
>
> > Also, the loop is just to wakeup kswapd once..
> > may be I can get rid of that and use pgdat_list directly.
>
> I'd try to be a little more selective than a big for loop like that.
'big' for loop ? in that code path ?
>
> > > SGI's machines? What about an 8-node x44[05]? Why can't you call it
> > > from interrupts?
> >
> > I just wanted to avoid limit related failures in interrupt context, as it
> might lead to weird problems.
>
> You mean you didn't want to make your code robust enough to handle it?
> Is there something fundamental keeping you from checking limits when in
> an interrupt?
It is not the 'checking limit' part that I meant in my reply. It is the
failure due to over limit(that the class is over its limit).
This is my thinking: if a class is not configured properly, and is over
its limit in interrupt context, we are going to fail the memory alloc,
which 'could' lead to unwanted results in the system depending on how the
interrupt handler treats the alloc failure ;)...
May be I don't have to think that far...
Let me make it clear, there is no CKRM specific reasoning for that check.
>
> -- Dave
>
--
----------------------------------------------------------------------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
----------------------------------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/6] CKRM: Basic changes to the core kernel
2005-04-05 18:22 ` Chandra Seetharaman
@ 2005-04-05 18:57 ` Dave Hansen
2005-04-05 19:38 ` Chandra Seetharaman
0 siblings, 1 reply; 9+ messages in thread
From: Dave Hansen @ 2005-04-05 18:57 UTC (permalink / raw)
To: Chandra Seetharaman; +Cc: ckrm-tech, linux-mm
On Tue, 2005-04-05 at 11:22 -0700, Chandra Seetharaman wrote:
> On Tue, Apr 05, 2005 at 10:54:20AM -0700, Dave Hansen wrote:
> > If you find a way to track things based on files, you could keep your
> > class pointers in the struct address_space, or even in the vma,
> > depending on what behavior you want. You could keep anonymous stuff in
> > the anon_vma, just like the objrmap code.
>
> This is the first version of memory controller... Handling shared pages
> appropriately are in the plans.
Perhaps it's a better idea to wait until you have this more mature
version before submitting it. It would be a shame to put all of this
per-page stuff in, only to rip it out. Doing it that way isn't very
incremental, but I don't think they'd share too much code anyway.
> > ... if the class is behaving itself. Somebody trying to take down a
> > machine, or a single badly-behaved or runaway app might not behave like
> > that.
>
> There are checks in that code to make sure that a runaway app doesn't
> get the kernel into this code path often and bring down the system...
> instead the runaway app(its class) is penalised.
Penalized how? Reducing the task's scheduler slices? Can you point to
the code?
> > > Also, the loop is just to wakeup kswapd once..
> > > may be I can get rid of that and use pgdat_list directly.
> >
> > I'd try to be a little more selective than a big for loop like that.
>
> 'big' for loop ? in that code path ?
> ckrm_class_limit_ok(struct ckrm_mem_res *cls)
> {
...
> + for (i = 0; i < MAX_NR_ZONES; i++)
> + pg_total += cls->pg_total[i];
Sorry, I was confusing this with something equivalent to
for_each_node().
That brings another question, though. How does this interact with NUMA?
The classes don't appear to track any per-node information.
> + if (cls->pg_limit == CKRM_SHARE_DONTCARE) {
> + struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
> + mem_rcbs.resid, struct ckrm_mem_res);
> + ret = (parcls ? ckrm_class_limit_ok(parcls) : 0);
> + } else
> + ret = (pg_total <= cls->pg_limit);
> +
> + return ret;
That looks suspiciously like recursion. How is the recursion limited?
> > > > SGI's machines? What about an 8-node x44[05]? Why can't you call it
> > > > from interrupts?
> > >
> > > I just wanted to avoid limit related failures in interrupt context, as it
> > > might lead to weird problems.
> >
> > You mean you didn't want to make your code robust enough to handle it?
> > Is there something fundamental keeping you from checking limits when in
> > an interrupt?
>
> It is not the 'checking limit' part that I meant in my reply. It is the
> failure due to over limit(that the class is over its limit).
>
> This is my thinking: if a class is not configured properly, and is over
> its limit in interrupt context, we are going to fail the memory alloc,
> which 'could' lead to unwanted results in the system depending on how the
> interrupt handler treats the alloc failure ;)...
No.
Interrupt handlers must use GFP_ATOMIC when allocating. These
allocations are likely to fail, and the writers of the handlers know it.
Interrupt handlers must be equipped to deal with these, or it's a bug
in the interrupt handler.
Is there any other reason to have the !in_interrupt() part?
-- Dave
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/6] CKRM: Basic changes to the core kernel
2005-04-05 18:57 ` Dave Hansen
@ 2005-04-05 19:38 ` Chandra Seetharaman
0 siblings, 0 replies; 9+ messages in thread
From: Chandra Seetharaman @ 2005-04-05 19:38 UTC (permalink / raw)
To: Dave Hansen; +Cc: ckrm-tech, linux-mm
On Tue, Apr 05, 2005 at 11:57:50AM -0700, Dave Hansen wrote:
> On Tue, 2005-04-05 at 11:22 -0700, Chandra Seetharaman wrote:
> > On Tue, Apr 05, 2005 at 10:54:20AM -0700, Dave Hansen wrote:
> > > If you find a way to track things based on files, you could keep your
> > > class pointers in the struct address_space, or even in the vma,
> > > depending on what behavior you want. You could keep anonymous stuff in
> > > the anon_vma, just like the objrmap code.
> >
> > This is the first version of memory controller... Handling shared pages
> > appropriately are in the plans.
>
> Perhaps it's a better idea to wait until you have this more mature
I guess you are confusing maturity with functionality. Shared page support
is the next level of functionality we are planning to provide.....
Our thinking is that the controller is mature with its current features.
> version before submitting it. It would be a shame to put all of this
I thought the mantra was, "release often, release early"....
> per-page stuff in, only to rip it out. Doing it that way isn't very
> incremental, but I don't think they'd share too much code anyway.
I think otherwise... and I do think the changes will be incremental.
>
> > > ... if the class is behaving itself. Somebody trying to take down a
> > > machine, or a single badly-behaved or runaway app might not behave like
> > > that.
> >
> > There are checks in that code to make sure that a runaway app doesn't
> > get the kernel into this code path often and bring down the system...
> > instead the runaway app(its class) is penalised.
>
> Penalized how? Reducing the task's scheduler slices? Can you point to
> the code?
This is a memory controller.... it doesn't do(and not expected to do)
scheduler operations. It penalises classes that go over their limit
often by failing the memory alloc (without giving the class a chance to
shrink itself).
Same code path that we are currently talking about.
>
> > > > Also, the loop is just to wakeup kswapd once..
> > > > may be I can get rid of that and use pgdat_list directly.
> > >
> > > I'd try to be a little more selective than a big for loop like that.
> >
> > 'big' for loop ? in that code path ?
>
> > ckrm_class_limit_ok(struct ckrm_mem_res *cls)
> > {
> ...
> > + for (i = 0; i < MAX_NR_ZONES; i++)
> > + pg_total += cls->pg_total[i];
>
> Sorry, I was confusing this with something equivalent to
> for_each_node().
Good... I started to wonder if we are even looking at the same patches...
>
> That brings another question, though. How does this interact with NUMA?
> The classes don't appear to track any per-node information.
It doesn't know/care about NUMA.... It is a resource controller that
controls the 'system-wide' memory usage by different classes... May be we
will think about NUMA support in future...
>
> > + if (cls->pg_limit == CKRM_SHARE_DONTCARE) {
> > + struct ckrm_mem_res *parcls = ckrm_get_res_class(cls->parent,
> > + mem_rcbs.resid, struct ckrm_mem_res);
> > + ret = (parcls ? ckrm_class_limit_ok(parcls) : 0);
> > + } else
> > + ret = (pg_total <= cls->pg_limit);
> > +
> > + return ret;
>
> That looks suspiciously like recursion. How is the recursion limited?
By limiting the class hierarchy to 2 levels.
>
> > > > > SGI's machines? What about an 8-node x44[05]? Why can't you call it
> > > > > from interrupts?
> > > >
> > > > I just wanted to avoid limit related failures in interrupt context, as it
> > > > might lead to weird problems.
> > >
> > > You mean you didn't want to make your code robust enough to handle it?
> > > Is there something fundamental keeping you from checking limits when in
> > > an interrupt?
> >
> > It is not the 'checking limit' part that I meant in my reply. It is the
> > failure due to over limit(that the class is over its limit).
> >
> > This is my thinking: if a class is not configured properly, and is over
> > its limit in interrupt context, we are going to fail the memory alloc,
> > which 'could' lead to unwanted results in the system depending on how the
> > interrupt handler treats the alloc failure ;)...
>
> No.
>
> Interrupt handlers must use GFP_ATOMIC when allocating. These
> allocations are likely to fail, and the writers of the handlers know it.
> Interrupt handlers must be equipped to deal with these, or it's a bug
> in the interrupt handler.
>
> Is there any other reason to have the !in_interrupt() part?
I already stated explicitly that there is NO specific reason to have it.
>
> -- Dave
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
--
----------------------------------------------------------------------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
----------------------------------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/6] CKRM: Basic changes to the core kernel
@ 2005-06-24 22:21 Chandra Seetharaman
0 siblings, 0 replies; 9+ messages in thread
From: Chandra Seetharaman @ 2005-06-24 22:21 UTC (permalink / raw)
To: ckrm-tech, linux-mm
Patch 1 of 6 patches to support memory controller under CKRM framework.
This patch has the basic changes needed to get the hooks in the
appropriate
kernel functions to get control in the controller.
----------------------------------------
fs/exec.c | 2 +
include/linux/ckrm_mem_inline.h | 67 ++++++++++++++++++++++++++++++++
++++++++
include/linux/mm_inline.h | 7 ++++
include/linux/sched.h | 8 ++++
init/Kconfig | 10 +++++
kernel/exit.c | 2 +
kernel/fork.c | 6 +++
mm/page_alloc.c | 6 +++
8 files changed, 108 insertions(+)
Content-Disposition: inline; filename=11-01-mem_base_changes
Index: linux-2.6.12/fs/exec.c
===================================================================
--- linux-2.6.12.orig/fs/exec.c
+++ linux-2.6.12/fs/exec.c
@@ -49,6 +49,7 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/ckrm_events.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -575,6 +576,7 @@ static int exec_mmap(struct mm_struct *m
activate_mm(active_mm, mm);
task_unlock(tsk);
arch_pick_mmap_layout(mm);
+ ckrm_task_mm_change(tsk, old_mm, mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
if (active_mm != old_mm) BUG();
Index: linux-2.6.12/include/linux/ckrm_mem_inline.h
===================================================================
--- /dev/null
+++ linux-2.6.12/include/linux/ckrm_mem_inline.h
@@ -0,0 +1,67 @@
+/* include/linux/ckrm_mem_inline.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ * (C) Shailabh Nagar, IBM Corp. 2003
+ * (C) Chandra Seetharaman, IBM Corp. 2004
+ *
+ *
+ * Memory control functions of the CKRM kernel API
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _LINUX_CKRM_MEM_INLINE_H_
+#define _LINUX_CKRM_MEM_INLINE_H_
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#error "Memory controller for CKRM is not available."
+
+#else
+
+static inline void
+ckrm_task_mm_init(struct task_struct *tsk)
+{
+}
+
+static inline void
+ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task)
+{
+}
+
+static inline void
+ckrm_task_mm_change(struct task_struct *tsk,
+ struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+}
+
+static inline void
+ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm)
+{
+}
+
+static inline void
+ckrm_mm_init(struct mm_struct *mm)
+{
+}
+
+/* using #define instead of static inline as the prototype requires *
+ * data structures that is available only with the controller enabled
*/
+#define ckrm_mm_setclass(a, b) do { } while(0)
+#define ckrm_class_limit_ok(a) (1)
+
+static inline void ckrm_mem_inc_active(struct page *p) {}
+static inline void ckrm_mem_dec_active(struct page *p) {}
+static inline void ckrm_mem_inc_inactive(struct page *p) {}
+static inline void ckrm_mem_dec_inactive(struct page *p) {}
+static inline void ckrm_page_init(struct page *p) {}
+static inline void ckrm_clear_page_class(struct page *p) {}
+
+#endif
+#endif /* _LINUX_CKRM_MEM_INLINE_H_ */
Index: linux-2.6.12/include/linux/mm_inline.h
===================================================================
--- linux-2.6.12.orig/include/linux/mm_inline.h
+++ linux-2.6.12/include/linux/mm_inline.h
@@ -1,9 +1,11 @@
+#include <linux/ckrm_mem_inline.h>
static inline void
add_page_to_active_list(struct zone *zone, struct page *page)
{
list_add(&page->lru, &zone->active_list);
zone->nr_active++;
+ ckrm_mem_inc_active(page);
}
static inline void
@@ -11,6 +13,7 @@ add_page_to_inactive_list(struct zone *z
{
list_add(&page->lru, &zone->inactive_list);
zone->nr_inactive++;
+ ckrm_mem_inc_inactive(page);
}
static inline void
@@ -18,6 +21,7 @@ del_page_from_active_list(struct zone *z
{
list_del(&page->lru);
zone->nr_active--;
+ ckrm_mem_dec_active(page);
}
static inline void
@@ -25,6 +29,7 @@ del_page_from_inactive_list(struct zone
{
list_del(&page->lru);
zone->nr_inactive--;
+ ckrm_mem_dec_inactive(page);
}
static inline void
@@ -34,7 +39,9 @@ del_page_from_lru(struct zone *zone, str
if (PageActive(page)) {
ClearPageActive(page);
zone->nr_active--;
+ ckrm_mem_dec_active(page);
} else {
zone->nr_inactive--;
+ ckrm_mem_dec_inactive(page);
}
}
Index: linux-2.6.12/include/linux/sched.h
===================================================================
--- linux-2.6.12.orig/include/linux/sched.h
+++ linux-2.6.12/include/linux/sched.h
@@ -268,6 +268,11 @@ struct mm_struct {
unsigned long hiwater_rss; /* High-water RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
+#ifdef CONFIG_CKRM_RES_MEM
+ struct ckrm_mem_res *memclass;
+ struct list_head tasklist; /* tasks sharing this address space */
+ spinlock_t peertask_lock; /* protect tasklist above */
+#endif
};
struct sighand_struct {
@@ -748,6 +753,9 @@ struct task_struct {
struct ckrm_task_class *taskclass;
struct list_head taskclass_link;
#endif /* CONFIG_CKRM_TYPE_TASKCLASS */
+#ifdef CONFIG_CKRM_RES_MEM
+ struct list_head mm_peers; /* list of tasks using same mm_struct */
+#endif
#endif /* CONFIG_CKRM */
#ifdef CONFIG_DELAY_ACCT
struct task_delay_info delays;
Index: linux-2.6.12/init/Kconfig
===================================================================
--- linux-2.6.12.orig/init/Kconfig
+++ linux-2.6.12/init/Kconfig
@@ -182,6 +182,16 @@ config CKRM_TYPE_TASKCLASS
Say Y if unsure
+config CKRM_RES_MEM
+ bool "Class based physical memory controller"
+ default y
+ depends on CKRM_TYPE_TASKCLASS
+ depends on !CONFIG_NUMA && !CONFIG_DISCONTIGMEM
+ help
+ Provide the basic support for collecting physical memory usage
+ information among classes. Say Y if you want to know the memory
+ usage of each class.
+
config CKRM_TYPE_SOCKETCLASS
bool "Class Manager for socket groups"
depends on CKRM && RCFS_FS
Index: linux-2.6.12/kernel/exit.c
===================================================================
--- linux-2.6.12.orig/kernel/exit.c
+++ linux-2.6.12/kernel/exit.c
@@ -32,6 +32,7 @@
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/ckrm_events.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -511,6 +512,7 @@ static void exit_mm(struct task_struct *
task_lock(tsk);
tsk->mm = NULL;
up_read(&mm->mmap_sem);
+ ckrm_task_mm_clear(tsk, mm);
enter_lazy_tlb(mm, current);
task_unlock(tsk);
mmput(mm);
Index: linux-2.6.12/kernel/fork.c
===================================================================
--- linux-2.6.12.orig/kernel/fork.c
+++ linux-2.6.12/kernel/fork.c
@@ -44,6 +44,7 @@
#include <linux/ckrm_events.h>
#include <linux/ckrm_tsk.h>
#include <linux/ckrm_tc.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -178,6 +179,7 @@ static struct task_struct *dup_task_stru
ti->task = tsk;
ckrm_cb_newtask(tsk);
+ ckrm_task_mm_init(tsk);
/* One for us, one for whoever does the "release_task()" (usually
parent) */
atomic_set(&tsk->usage,2);
return tsk;
@@ -326,6 +328,7 @@ static struct mm_struct * mm_init(struct
mm->ioctx_list = NULL;
mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx,
*mm);
mm->free_area_cache = TASK_UNMAPPED_BASE;
+ ckrm_mm_init(mm);
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -346,6 +349,7 @@ struct mm_struct * mm_alloc(void)
if (mm) {
memset(mm, 0, sizeof(*mm));
mm = mm_init(mm);
+ ckrm_mm_setclass(mm, ckrm_get_mem_class(current));
}
return mm;
}
@@ -502,6 +506,8 @@ static int copy_mm(unsigned long clone_f
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
+ ckrm_mm_setclass(mm, oldmm->memclass);
+ ckrm_task_mm_set(mm, tsk);
return 0;
free_pt:
Index: linux-2.6.12/mm/page_alloc.c
===================================================================
--- linux-2.6.12.orig/mm/page_alloc.c
+++ linux-2.6.12/mm/page_alloc.c
@@ -34,6 +34,7 @@
#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -357,6 +358,7 @@ free_pages_bulk(struct zone *zone, int c
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
__free_pages_bulk(page, zone, order);
+ ckrm_clear_page_class(page);
ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -456,6 +458,7 @@ static void prep_new_page(struct page *p
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked | 1 << PG_mappedtodisk);
page->private = 0;
+ ckrm_page_init(page);
set_page_refs(page, order);
kernel_map_pages(page, 1 << order, 1);
}
@@ -751,6 +754,9 @@ __alloc_pages(unsigned int __nocast gfp_
*/
can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+ if (!in_interrupt() && !ckrm_class_limit_ok(ckrm_get_mem_class(p)))
+ return NULL;
+
zones = zonelist->zones; /* the list of zones suitable for gfp_mask
*/
if (unlikely(zones[0] == NULL)) {
--
----------------------------------------------------------------------
Chandra Seetharaman | Be careful what you choose....
- sekharan@us.ibm.com | .......you may get it.
----------------------------------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/6] CKRM: Basic changes to the core kernel
@ 2005-05-19 0:31 Chandra Seetharaman
0 siblings, 0 replies; 9+ messages in thread
From: Chandra Seetharaman @ 2005-05-19 0:31 UTC (permalink / raw)
To: ckrm-tech, linux-mm
Patch 1 of 6 patches to support memory controller under CKRM framework.
This patch has the basic changes needed to get the hooks in the appropriate
kernel functions to get control in the controller.
----------------------------------------
Following changes have been made since the last release:
- disable in NUMA and DISCONTIGMEM.
- remove the 'in_interrupt()' part in __alloc_pages()
- Remove the usage of PG_ckrm_account bit in the page flags.
----------------------------------------
fs/exec.c | 2 +
include/linux/ckrm_mem_inline.h | 67 ++++++++++++++++++++++++++++++++++++++++
include/linux/mm_inline.h | 7 ++++
include/linux/sched.h | 8 ++++
init/Kconfig | 10 +++++
kernel/exit.c | 2 +
kernel/fork.c | 6 +++
mm/page_alloc.c | 6 +++
8 files changed, 108 insertions(+)
Content-Disposition: inline; filename=11-01-mem_base_changes
Index: linux-2612-rc3/fs/exec.c
===================================================================
--- linux-2612-rc3.orig/fs/exec.c
+++ linux-2612-rc3/fs/exec.c
@@ -49,6 +49,7 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/ckrm_events.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -574,6 +575,7 @@ static int exec_mmap(struct mm_struct *m
activate_mm(active_mm, mm);
task_unlock(tsk);
arch_pick_mmap_layout(mm);
+ ckrm_task_mm_change(tsk, old_mm, mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
if (active_mm != old_mm) BUG();
Index: linux-2612-rc3/include/linux/ckrm_mem_inline.h
===================================================================
--- /dev/null
+++ linux-2612-rc3/include/linux/ckrm_mem_inline.h
@@ -0,0 +1,67 @@
+/* include/linux/ckrm_mem_inline.h : memory control for CKRM
+ *
+ * Copyright (C) Jiantao Kong, IBM Corp. 2003
+ * (C) Shailabh Nagar, IBM Corp. 2003
+ * (C) Chandra Seetharaman, IBM Corp. 2004
+ *
+ *
+ * Memory control functions of the CKRM kernel API
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _LINUX_CKRM_MEM_INLINE_H_
+#define _LINUX_CKRM_MEM_INLINE_H_
+
+#ifdef CONFIG_CKRM_RES_MEM
+
+#error "Memory controller for CKRM is not available."
+
+#else
+
+static inline void
+ckrm_task_mm_init(struct task_struct *tsk)
+{
+}
+
+static inline void
+ckrm_task_mm_set(struct mm_struct * mm, struct task_struct *task)
+{
+}
+
+static inline void
+ckrm_task_mm_change(struct task_struct *tsk,
+ struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+}
+
+static inline void
+ckrm_task_mm_clear(struct task_struct *tsk, struct mm_struct *mm)
+{
+}
+
+static inline void
+ckrm_mm_init(struct mm_struct *mm)
+{
+}
+
+/* using #define instead of static inline as the prototype requires *
+ * data structures that is available only with the controller enabled */
+#define ckrm_mm_setclass(a, b) do { } while(0)
+#define ckrm_class_limit_ok(a) (1)
+
+static inline void ckrm_mem_inc_active(struct page *p) {}
+static inline void ckrm_mem_dec_active(struct page *p) {}
+static inline void ckrm_mem_inc_inactive(struct page *p) {}
+static inline void ckrm_mem_dec_inactive(struct page *p) {}
+static inline void ckrm_page_init(struct page *p) {}
+static inline void ckrm_clear_page_class(struct page *p) {}
+
+#endif
+#endif /* _LINUX_CKRM_MEM_INLINE_H_ */
Index: linux-2612-rc3/include/linux/mm_inline.h
===================================================================
--- linux-2612-rc3.orig/include/linux/mm_inline.h
+++ linux-2612-rc3/include/linux/mm_inline.h
@@ -1,9 +1,11 @@
+#include <linux/ckrm_mem_inline.h>
static inline void
add_page_to_active_list(struct zone *zone, struct page *page)
{
list_add(&page->lru, &zone->active_list);
zone->nr_active++;
+ ckrm_mem_inc_active(page);
}
static inline void
@@ -11,6 +13,7 @@ add_page_to_inactive_list(struct zone *z
{
list_add(&page->lru, &zone->inactive_list);
zone->nr_inactive++;
+ ckrm_mem_inc_inactive(page);
}
static inline void
@@ -18,6 +21,7 @@ del_page_from_active_list(struct zone *z
{
list_del(&page->lru);
zone->nr_active--;
+ ckrm_mem_dec_active(page);
}
static inline void
@@ -25,6 +29,7 @@ del_page_from_inactive_list(struct zone
{
list_del(&page->lru);
zone->nr_inactive--;
+ ckrm_mem_dec_inactive(page);
}
static inline void
@@ -34,7 +39,9 @@ del_page_from_lru(struct zone *zone, str
if (PageActive(page)) {
ClearPageActive(page);
zone->nr_active--;
+ ckrm_mem_dec_active(page);
} else {
zone->nr_inactive--;
+ ckrm_mem_dec_inactive(page);
}
}
Index: linux-2612-rc3/include/linux/sched.h
===================================================================
--- linux-2612-rc3.orig/include/linux/sched.h
+++ linux-2612-rc3/include/linux/sched.h
@@ -268,6 +268,11 @@ struct mm_struct {
unsigned long hiwater_rss; /* High-water RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
+#ifdef CONFIG_CKRM_RES_MEM
+ struct ckrm_mem_res *memclass;
+ struct list_head tasklist; /* tasks sharing this address space */
+ spinlock_t peertask_lock; /* protect tasklist above */
+#endif
};
struct sighand_struct {
@@ -745,6 +750,9 @@ struct task_struct {
struct ckrm_task_class *taskclass;
struct list_head taskclass_link;
#endif /* CONFIG_CKRM_TYPE_TASKCLASS */
+#ifdef CONFIG_CKRM_RES_MEM
+ struct list_head mm_peers; /* list of tasks using same mm_struct */
+#endif
#endif /* CONFIG_CKRM */
#ifdef CONFIG_DELAY_ACCT
struct task_delay_info delays;
Index: linux-2612-rc3/init/Kconfig
===================================================================
--- linux-2612-rc3.orig/init/Kconfig
+++ linux-2612-rc3/init/Kconfig
@@ -182,6 +182,16 @@ config CKRM_TYPE_TASKCLASS
Say Y if unsure
+config CKRM_RES_MEM
+ bool "Class based physical memory controller"
+ default y
+ depends on CKRM_TYPE_TASKCLASS
+ depends on !CONFIG_NUMA && !CONFIG_DISCONTIGMEM
+ help
+ Provide the basic support for collecting physical memory usage
+ information among classes. Say Y if you want to know the memory
+ usage of each class.
+
config CKRM_TYPE_SOCKETCLASS
bool "Class Manager for socket groups"
depends on CKRM && RCFS_FS
Index: linux-2612-rc3/kernel/exit.c
===================================================================
--- linux-2612-rc3.orig/kernel/exit.c
+++ linux-2612-rc3/kernel/exit.c
@@ -31,6 +31,7 @@
#include <linux/cpuset.h>
#include <linux/syscalls.h>
#include <linux/ckrm_events.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -508,6 +509,7 @@ void exit_mm(struct task_struct * tsk)
task_lock(tsk);
tsk->mm = NULL;
up_read(&mm->mmap_sem);
+ ckrm_task_mm_clear(tsk, mm);
enter_lazy_tlb(mm, current);
task_unlock(tsk);
mmput(mm);
Index: linux-2612-rc3/kernel/fork.c
===================================================================
--- linux-2612-rc3.orig/kernel/fork.c
+++ linux-2612-rc3/kernel/fork.c
@@ -44,6 +44,7 @@
#include <linux/ckrm_events.h>
#include <linux/ckrm_tsk.h>
#include <linux/ckrm_tc.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -178,6 +179,7 @@ static struct task_struct *dup_task_stru
ti->task = tsk;
ckrm_cb_newtask(tsk);
+ ckrm_task_mm_init(tsk);
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
return tsk;
@@ -326,6 +328,7 @@ static struct mm_struct * mm_init(struct
mm->ioctx_list = NULL;
mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
mm->free_area_cache = TASK_UNMAPPED_BASE;
+ ckrm_mm_init(mm);
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -346,6 +349,7 @@ struct mm_struct * mm_alloc(void)
if (mm) {
memset(mm, 0, sizeof(*mm));
mm = mm_init(mm);
+ ckrm_mm_setclass(mm, ckrm_get_mem_class(current));
}
return mm;
}
@@ -502,6 +506,8 @@ static int copy_mm(unsigned long clone_f
good_mm:
tsk->mm = mm;
tsk->active_mm = mm;
+ ckrm_mm_setclass(mm, oldmm->memclass);
+ ckrm_task_mm_set(mm, tsk);
return 0;
free_pt:
Index: linux-2612-rc3/mm/page_alloc.c
===================================================================
--- linux-2612-rc3.orig/mm/page_alloc.c
+++ linux-2612-rc3/mm/page_alloc.c
@@ -34,6 +34,7 @@
#include <linux/cpuset.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
+#include <linux/ckrm_mem_inline.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -355,6 +356,7 @@ free_pages_bulk(struct zone *zone, int c
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
__free_pages_bulk(page, zone, order);
+ ckrm_clear_page_class(page);
ret++;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -454,6 +456,7 @@ static void prep_new_page(struct page *p
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked | 1 << PG_mappedtodisk);
page->private = 0;
+ ckrm_page_init(page);
set_page_refs(page, order);
kernel_map_pages(page, 1 << order, 1);
}
@@ -749,6 +752,9 @@ __alloc_pages(unsigned int __nocast gfp_
*/
can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+ if (!ckrm_class_limit_ok(ckrm_get_mem_class(p)))
+ return NULL;
+
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
if (unlikely(zones[0] == NULL)) {
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2005-06-24 22:21 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-04-02 3:12 [PATCH 1/6] CKRM: Basic changes to the core kernel Chandra Seetharaman
2005-04-04 13:45 ` Dave Hansen
2005-04-05 17:25 ` Chandra Seetharaman
2005-04-05 17:54 ` Dave Hansen
2005-04-05 18:22 ` Chandra Seetharaman
2005-04-05 18:57 ` Dave Hansen
2005-04-05 19:38 ` Chandra Seetharaman
2005-05-19 0:31 Chandra Seetharaman
2005-06-24 22:21 Chandra Seetharaman
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox