* [Preview] [PATCH] radix tree based page cgroup [0/6]
From: KAMEZAWA Hiroyuki @ 2008-03-05 11:51 UTC
To: linux-mm; +Cc: balbir, xemul, hugh, yamamoto, taka
Hi, this is the latest version of the radix-tree based page cgroup patch.
I'm posting it now because the recent major changes are included in 2.6.25-rc4.
(I admit I should do more tests on this set.)
Almost all of it has been rewritten and adjusted to rc4's logic.
I feel this set is simpler than the previous one.
The patch series is as follows:
[1/6] page cgroup definition
[2/6] patch against charge/uncharge
[3/6] patch against move_list
[4/6] patch against migration
[5/6] radix tree based page_cgroup
[6/6] boost by per-cpu cache.
* force_empty patch is dropped because it's unnecessary.
* vmalloc patch is dropped. We always use kmalloc in this version.
TODO:
- add a page_cgroup freeing routine; it seems necessary sometimes.
(I have one and it will be added to this set in the next post.)
- check the logic again.
Thanks,
-Kame
* [Preview] [PATCH] radix tree based page cgroup [1/6]
From: KAMEZAWA Hiroyuki @ 2008-03-05 11:55 UTC
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka
* Export the page_cgroup definition.
* Remove the page_cgroup member from struct page.
* As a result, PAGE_CGROUP_LOCK_BIT and its assign/access functions are removed.
Other changes will appear in the following patches.
There is one change in the structure itself: a spinlock is added.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
include/linux/memcontrol.h | 11 --------
include/linux/mm_types.h | 3 --
include/linux/page_cgroup.h | 47 +++++++++++++++++++++++++++++++++++
mm/memcontrol.c | 59 --------------------------------------------
4 files changed, 48 insertions(+), 72 deletions(-)
Index: linux-2.6.25-rc4/include/linux/page_cgroup.h
===================================================================
--- /dev/null
+++ linux-2.6.25-rc4/include/linux/page_cgroup.h
@@ -0,0 +1,47 @@
+#ifndef __LINUX_PAGE_CGROUP_H
+#define __LINUX_PAGE_CGROUP_H
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/*
+ * page_cgroup is yet another mem_map-like structure for accounting usage,
+ * but, unlike mem_map, it is allocated on demand for accounted pages.
+ * See also memcontrol.h.
+ * By nature, this consumes a large amount of memory.
+ */
+
+struct mem_cgroup;
+
+struct page_cgroup {
+ struct page *page; /* the page this accounts for */
+ struct mem_cgroup *mem_cgroup; /* current cgroup subsys */
+ int flags; /* See below */
+ int refcnt; /* reference count */
+ spinlock_t lock; /* lock for all above members */
+ struct list_head lru; /* for per cgroup LRU */
+};
+
+/* flags */
+#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache. */
+#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* is on active list */
+
+/*
+ * Lookup and return page_cgroup struct.
+ * returns NULL when
+ * 1. Page Cgroup is not activated yet.
+ * 2. cannot lookup entry and allocate was false.
+ * return -ENOMEM if cannot allocate memory.
+ * If allocate==false, gfpmask will be ignored as a result.
+ */
+
+struct page_cgroup *
+get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate);
+
+#else
+
+static struct page_cgroup *
+get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
+{
+ return NULL;
+}
+#endif
+#endif
Index: linux-2.6.25-rc4/include/linux/mm_types.h
===================================================================
--- linux-2.6.25-rc4.orig/include/linux/mm_types.h
+++ linux-2.6.25-rc4/include/linux/mm_types.h
@@ -88,9 +88,6 @@ struct page {
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
- unsigned long page_cgroup;
-#endif
};
/*
Index: linux-2.6.25-rc4/mm/memcontrol.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/memcontrol.c
+++ linux-2.6.25-rc4/mm/memcontrol.c
@@ -30,6 +30,7 @@
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
+#include <linux/page_cgroup.h>
#include <asm/uaccess.h>
@@ -139,33 +140,6 @@ struct mem_cgroup {
};
static struct mem_cgroup init_mem_cgroup;
-/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock. We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin). But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT 0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK 0x0
-#endif
-
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
- struct list_head lru; /* per cgroup LRU list */
- struct page *page;
- struct mem_cgroup *mem_cgroup;
- int ref_cnt; /* cached, mapped, migrating */
- int flags;
-};
-#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
static int page_cgroup_nid(struct page_cgroup *pc)
{
@@ -256,37 +230,6 @@ void mm_free_cgroup(struct mm_struct *mm
css_put(&mm->mem_cgroup->css);
}
-static inline int page_cgroup_locked(struct page *page)
-{
- return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
-{
- VM_BUG_ON(!page_cgroup_locked(page));
- page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
-}
-
-struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
- return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
-}
-
-static void lock_page_cgroup(struct page *page)
-{
- bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static int try_lock_page_cgroup(struct page *page)
-{
- return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void unlock_page_cgroup(struct page *page)
-{
- bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
static void __mem_cgroup_remove_list(struct page_cgroup *pc)
{
int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
Index: linux-2.6.25-rc4/include/linux/memcontrol.h
===================================================================
--- linux-2.6.25-rc4.orig/include/linux/memcontrol.h
+++ linux-2.6.25-rc4/include/linux/memcontrol.h
@@ -30,9 +30,6 @@ struct mm_struct;
extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
extern void mm_free_cgroup(struct mm_struct *mm);
-#define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0)
-
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask);
extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -82,14 +79,6 @@ static inline void mm_free_cgroup(struct
{
}
-static inline void page_reset_bad_cgroup(struct page *page)
-{
-}
-
-static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
- return NULL;
-}
static inline int mem_cgroup_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
Index: linux-2.6.25-rc4/mm/page_alloc.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/page_alloc.c
+++ linux-2.6.25-rc4/mm/page_alloc.c
@@ -222,17 +222,11 @@ static inline int bad_range(struct zone
static void bad_page(struct page *page)
{
- void *pc = page_get_page_cgroup(page);
-
printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
current->comm, page, (int)(2*sizeof(unsigned long)),
(unsigned long)page->flags, page->mapping,
page_mapcount(page), page_count(page));
- if (pc) {
- printk(KERN_EMERG "cgroup:%p\n", pc);
- page_reset_bad_cgroup(page);
- }
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
KERN_EMERG "Backtrace:\n");
dump_stack();
@@ -460,7 +454,6 @@ static inline int free_pages_check(struc
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
- (page_get_page_cgroup(page) != NULL) |
(page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
@@ -610,7 +603,6 @@ static int prep_new_page(struct page *pa
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
- (page_get_page_cgroup(page) != NULL) |
(page_count(page) != 0) |
(page->flags & (
1 << PG_lru |
* [Preview] [PATCH] radix tree based page cgroup [2/6] charge and uncharge
From: KAMEZAWA Hiroyuki @ 2008-03-05 11:57 UTC
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka
Changes in the core logic: charge and uncharge.
Because the bit spin lock is removed and a spinlock is added to page_cgroup,
there is a fair amount of change.
This patch does the following:
- modifies charge/uncharge
- removes the add_list/remove_list functions; just adds stat functions
- adds simple lock-rule comments
The major change from the current (rc4) version is:
- pc->refcnt is set to "1" after the charge is done (a sketch of the new
fast path follows below).
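For reviewers, here is a minimal sketch of the new charge fast path.
charge_sketch() is a made-up name used only for illustration;
get_page_cgroup(), pc->lock and pc->refcnt are the interfaces introduced
by this series.

/*
 * Sketch only: the fast path of charging with the per-page_cgroup
 * spinlock.  If the page is already accounted, we only take pc->lock
 * and bump the reference count; the res_counter and LRU work is left
 * to the slow path.
 */
static int charge_sketch(struct page *page, gfp_t gfp_mask)
{
	struct page_cgroup *pc;
	unsigned long flags;

	pc = get_page_cgroup(page, gfp_mask, true);
	if (!pc || IS_ERR(pc))
		return PTR_ERR(pc);	/* 0 when accounting is inactive */

	spin_lock_irqsave(&pc->lock, flags);
	if (pc->refcnt > 0) {
		/* Already accounted: just take another reference. */
		pc->refcnt++;
		spin_unlock_irqrestore(&pc->lock, flags);
		return 0;
	}
	spin_unlock_irqrestore(&pc->lock, flags);

	/*
	 * Slow path (see the hunks below): charge the res_counter
	 * without holding pc->lock, then retake pc->lock, recheck
	 * refcnt and link the page_cgroup onto the per-zone LRU.
	 */
	return 0;
}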
Changelog
- Rebased to rc4
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
mm/memcontrol.c | 136 +++++++++++++++++++++++++-------------------------------
1 file changed, 62 insertions(+), 74 deletions(-)
Index: linux-2.6.25-rc4/mm/memcontrol.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/memcontrol.c
+++ linux-2.6.25-rc4/mm/memcontrol.c
@@ -34,6 +34,16 @@
#include <asm/uaccess.h>
+/*
+ * Lock Rule
+ * zone->lru_lock (global LRU)
+ * -> pc->lock (page_cgroup's lock)
+ * -> mz->lru_lock (mem_cgroup's per-zone lock)
+ *
+ * At least mz->lru_lock and pc->lock must be acquired with irqs off.
+ *
+ */
+
struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
@@ -476,33 +486,22 @@ static int mem_cgroup_charge_common(stru
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup_per_zone *mz;
+ pc = get_page_cgroup(page, gfp_mask, true);
+ if (!pc || IS_ERR(pc))
+ return PTR_ERR(pc);
+
+ spin_lock_irqsave(&pc->lock, flags);
/*
- * Should page_cgroup's go to their own slab?
- * One could optimize the performance of the charging routine
- * by saving a bit in the page_flags and using it as a lock
- * to see if the cgroup page already has a page_cgroup associated
- * with it
- */
-retry:
- lock_page_cgroup(page);
- pc = page_get_page_cgroup(page);
- /*
- * The page_cgroup exists and
- * the page has already been accounted.
+ * Has the page already been accounted ?
*/
- if (pc) {
- VM_BUG_ON(pc->page != page);
- VM_BUG_ON(pc->ref_cnt <= 0);
-
- pc->ref_cnt++;
- unlock_page_cgroup(page);
- goto done;
+ if (pc->refcnt > 0) {
+ pc->refcnt++;
+ spin_unlock_irqrestore(&pc->lock, flags);
+ goto success;
}
- unlock_page_cgroup(page);
+ spin_unlock_irqrestore(&pc->lock, flags);
- pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
- if (pc == NULL)
- goto err;
+ /* Note: pc->refcnt is still 0 here. */
/*
* We always charge the cgroup the mm_struct belongs to.
@@ -523,7 +522,7 @@ retry:
while (res_counter_charge(&mem->res, PAGE_SIZE)) {
if (!(gfp_mask & __GFP_WAIT))
- goto out;
+ goto nomem;
if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
continue;
@@ -540,45 +539,40 @@ retry:
if (!nr_retries--) {
mem_cgroup_out_of_memory(mem, gfp_mask);
- goto out;
+ goto nomem;
}
congestion_wait(WRITE, HZ/10);
}
-
- pc->ref_cnt = 1;
+ /*
+ * We have to acquire 2 spinlocks.
+ */
+ spin_lock_irqsave(&pc->lock, flags);
+ if (pc->refcnt) {
+ /* Someone charged this page while we released the lock */
+ ++pc->refcnt;
+ spin_unlock_irqrestore(&pc->lock, flags);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+ goto success;
+ }
+ /* No one else touches this. */
+ VM_BUG_ON(pc->mem_cgroup);
+ VM_BUG_ON(!list_empty(&pc->lru));
+ pc->refcnt = 1;
pc->mem_cgroup = mem;
- pc->page = page;
pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
pc->flags |= PAGE_CGROUP_FLAG_CACHE;
-
- lock_page_cgroup(page);
- if (page_get_page_cgroup(page)) {
- unlock_page_cgroup(page);
- /*
- * Another charge has been added to this page already.
- * We take lock_page_cgroup(page) again and read
- * page->cgroup, increment refcnt.... just retry is OK.
- */
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- css_put(&mem->css);
- kfree(pc);
- goto retry;
- }
- page_assign_page_cgroup(page, pc);
-
mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
+ spin_lock(&mz->lru_lock);
__mem_cgroup_add_list(pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
+ spin_unlock(&mz->lru_lock);
+ spin_unlock_irqrestore(&pc->lock, flags);
- unlock_page_cgroup(page);
-done:
+success:
return 0;
-out:
+nomem:
css_put(&mem->css);
- kfree(pc);
-err:
return -ENOMEM;
}
@@ -611,33 +605,27 @@ void mem_cgroup_uncharge_page(struct pag
/*
* Check if our page_cgroup is valid
*/
- lock_page_cgroup(page);
- pc = page_get_page_cgroup(page);
+ pc = get_page_cgroup(page, GFP_ATOMIC, false); /* No allocation */
if (!pc)
- goto unlock;
-
- VM_BUG_ON(pc->page != page);
- VM_BUG_ON(pc->ref_cnt <= 0);
-
- if (--(pc->ref_cnt) == 0) {
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
-
- page_assign_page_cgroup(page, NULL);
- unlock_page_cgroup(page);
-
- mem = pc->mem_cgroup;
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- css_put(&mem->css);
-
- kfree(pc);
+ return;
+ spin_lock_irqsave(&pc->lock, flags);
+ if (!pc->refcnt || --pc->refcnt > 0) {
+ spin_unlock_irqrestore(&pc->lock, flags);
return;
}
+ VM_BUG_ON(pc->page != page);
+ mz = page_cgroup_zoneinfo(pc);
+ mem = pc->mem_cgroup;
-unlock:
- unlock_page_cgroup(page);
+ spin_lock(&mz->lru_lock);
+ __mem_cgroup_remove_list(pc);
+ spin_unlock(&mz->lru_lock);
+
+ pc->flags = 0;
+ pc->mem_cgroup = NULL;
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+ spin_unlock_irqrestore(&pc->lock, flags);
}
/*
* [Preview] [PATCH] radix tree based page cgroup [3/6] move_lists
From: KAMEZAWA Hiroyuki @ 2008-03-05 11:57 UTC
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka
Modifies mem_cgroup_move_lists() to use get_page_cgroup().
No major algorithm changes.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
mm/memcontrol.c | 16 +++++++++-------
1 files changed, 9 insertions(+), 7 deletions(-)
Index: linux-2.6.25-rc4/mm/memcontrol.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/memcontrol.c
+++ linux-2.6.25-rc4/mm/memcontrol.c
@@ -309,6 +309,10 @@ void mem_cgroup_move_lists(struct page *
struct mem_cgroup_per_zone *mz;
unsigned long flags;
+ /* This GFP mask will be ignored. */
+ pc = get_page_cgroup(page, GFP_ATOMIC, false);
+ if (!pc)
+ return;
/*
* We cannot lock_page_cgroup while holding zone's lru_lock,
* because other holders of lock_page_cgroup can be interrupted
@@ -316,17 +320,15 @@ void mem_cgroup_move_lists(struct page *
* safely get to page_cgroup without it, so just try_lock it:
* mem_cgroup_isolate_pages allows for page left on wrong list.
*/
- if (!try_lock_page_cgroup(page))
+ if (!spin_trylock_irqsave(&pc->lock, flags))
return;
-
- pc = page_get_page_cgroup(page);
- if (pc) {
+ if (pc->refcnt) {
mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
+ spin_lock(&mz->lru_lock);
__mem_cgroup_move_lists(pc, active);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
+ spin_unlock(&mz->lru_lock);
}
- unlock_page_cgroup(page);
+ spin_unlock_irqrestore(&pc->lock, flags);
}
/*
* [Preview] [PATCH] radix tree based page cgroup [5/6] radix-tree-page-cgroup
From: KAMEZAWA Hiroyuki @ 2008-03-05 12:00 UTC
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka
A lookup routine for the page_cgroup struct.
Currently, page_cgroup is referenced from struct page's page_cgroup entry:
struct page {
...
unsigned long page_cgroup;
..
}
But some people dislike this because it increases sizeof(struct page).
To avoid that, we add a lookup routine for
pfn <-> page_cgroup,
implemented with a radix tree.
The new function is
struct page_cgroup *get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate);
If (allocate == true), look up and allocate a new one if necessary.
If (allocate == false), just look up and return NULL if it does not exist.
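For illustration, two typical call sites would look roughly like this
(a sketch only; the control flow around the calls is hypothetical, while
the return-value convention is the one described above):

	/* Charge side: allocate the covering chunk on demand. */
	pc = get_page_cgroup(page, gfp_mask, true);
	if (!pc)
		return 0;		/* accounting is not activated yet */
	if (IS_ERR(pc))
		return PTR_ERR(pc);	/* -ENOMEM */

	/*
	 * Uncharge/lookup side: allocate == false, so the gfp mask is
	 * ignored and NULL simply means "this page was never accounted".
	 */
	pc = get_page_cgroup(page, GFP_ATOMIC, false);
	if (!pc)
		return;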
Changes:
- added the 3rd argument 'allocate'
- made the page_cgroup chunk size configurable (for testing)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
init/Kconfig | 14 ++++
mm/Makefile | 2
mm/page_cgroup.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 178 insertions(+), 1 deletion(-)
Index: linux-2.6.25-rc4/mm/page_cgroup.c
===================================================================
--- /dev/null
+++ linux-2.6.25-rc4/mm/page_cgroup.c
@@ -0,0 +1,169 @@
+/*
+ * page_cgroup management code.
+ * page_cgroup is yet another mem_map, used when the cgroup memory resource
+ * controller is activated. It contains information which cannot be stored
+ * in the usual mem_map (it would be too big).
+ * This allows us to keep 'struct page' small when a user doesn't activate
+ * the memory resource controller.
+ *
+ * Note: everything is allocated on demand.
+ *
+ * We can translate: struct page <-> pfn -> page_cgroup -> struct page.
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+#include <linux/memcontrol.h>
+#include <linux/page_cgroup.h>
+#include <linux/err.h>
+
+
+
+#define PCGRP_SHIFT (CONFIG_CGROUP_PAGE_CGROUP_ORDER)
+#define PCGRP_SIZE (1 << PCGRP_SHIFT)
+
+struct page_cgroup_head {
+ struct page_cgroup pc[PCGRP_SIZE];
+};
+
+struct page_cgroup_root {
+ spinlock_t tree_lock;
+ struct radix_tree_root root_node;
+};
+
+static struct page_cgroup_root *root_dir[MAX_NUMNODES];
+
+static void init_page_cgroup(struct page_cgroup_head *head, unsigned long pfn)
+{
+ int i;
+ struct page_cgroup *pc;
+
+ memset(head, 0, sizeof(*head));
+ for (i = 0; i < PCGRP_SIZE; ++i) {
+ pc = &head->pc[i];
+ pc->page = pfn_to_page(pfn + i);
+ spin_lock_init(&pc->lock);
+ INIT_LIST_HEAD(&pc->lru);
+ }
+}
+
+
+struct kmem_cache *page_cgroup_cachep;
+
+static struct page_cgroup_head *
+alloc_init_page_cgroup(unsigned long pfn, int nid, gfp_t mask)
+{
+ struct page_cgroup_head *head;
+
+ head = kmem_cache_alloc_node(page_cgroup_cachep, mask, nid);
+ if (!head)
+ return NULL;
+
+ init_page_cgroup(head, pfn);
+
+ return head;
+}
+
+void free_page_cgroup(struct page_cgroup_head *head)
+{
+ kmem_cache_free(page_cgroup_cachep, head);
+}
+
+
+/*
+ * Look up page_cgroup struct for struct page (page's pfn)
+ * if (allocate == true), look up and allocate new one if necessary.
+ * if (allocate == false), look up and return NULL if it cannot be found.
+ */
+
+struct page_cgroup *
+get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
+{
+ struct page_cgroup_root *root;
+ struct page_cgroup_head *head;
+ struct page_cgroup *pc;
+ unsigned long pfn, idx;
+ int nid;
+ unsigned long base_pfn, flags;
+ int error;
+
+ if (!page)
+ return NULL;
+
+ pfn = page_to_pfn(page);
+ idx = pfn >> PCGRP_SHIFT;
+ nid = page_to_nid(page);
+
+ root = root_dir[nid];
+ /* Before Init ? */
+ if (unlikely(!root))
+ return NULL;
+
+ base_pfn = idx << PCGRP_SHIFT;
+retry:
+ error = 0;
+ rcu_read_lock();
+ head = radix_tree_lookup(&root->root_node, idx);
+ rcu_read_unlock();
+
+ if (likely(head))
+ return &head->pc[pfn - base_pfn];
+ if (allocate == false)
+ return NULL;
+
+ /* Very Slow Path. On demand allocation. */
+ gfpmask = gfpmask & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
+
+ head = alloc_init_page_cgroup(base_pfn, nid, gfpmask);
+ if (!head)
+ return ERR_PTR(-ENOMEM);
+ pc = NULL;
+ error = radix_tree_preload(gfpmask);
+ if (error)
+ goto out;
+ spin_lock_irqsave(&root->tree_lock, flags);
+ error = radix_tree_insert(&root->root_node, idx, head);
+
+ if (!error)
+ pc = &head->pc[pfn - base_pfn];
+ spin_unlock_irqrestore(&root->tree_lock, flags);
+ radix_tree_preload_end();
+out:
+ if (!pc) {
+ free_page_cgroup(head);
+ if (error == -EEXIST)
+ goto retry;
+ }
+ if (error)
+ pc = ERR_PTR(error);
+ return pc;
+}
+
+__init int page_cgroup_init(void)
+{
+ int nid;
+ struct page_cgroup_root *root;
+
+ page_cgroup_cachep = kmem_cache_create("page_cgroup",
+ sizeof(struct page_cgroup_head), 0,
+ SLAB_PANIC | SLAB_DESTROY_BY_RCU, NULL);
+ if (!page_cgroup_cachep) {
+ printk(KERN_ERR "page accouning setup failure\n");
+ printk(KERN_ERR "can't initialize slab memory\n");
+ /* FIX ME: should return some error code ? */
+ return 0;
+ }
+ for_each_node(nid) {
+ root = kmalloc_node(sizeof(struct page_cgroup_root),
+ GFP_KERNEL, nid);
+ INIT_RADIX_TREE(&root->root_node, GFP_ATOMIC);
+ spin_lock_init(&root->tree_lock);
+ smp_wmb();
+ root_dir[nid] = root;
+ }
+
+ printk(KERN_INFO "Page accounting is activated\n");
+ return 0;
+}
+late_initcall(page_cgroup_init);
Index: linux-2.6.25-rc4/mm/Makefile
===================================================================
--- linux-2.6.25-rc4.orig/mm/Makefile
+++ linux-2.6.25-rc4/mm/Makefile
@@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
Index: linux-2.6.25-rc4/init/Kconfig
===================================================================
--- linux-2.6.25-rc4.orig/init/Kconfig
+++ linux-2.6.25-rc4/init/Kconfig
@@ -407,6 +407,20 @@ config SYSFS_DEPRECATED_V2
If you are using a distro with the most recent userspace
packages, it should be safe to say N here.
+config CGROUP_PAGE_CGROUP_ORDER
+ int "Order of page accounting subsystem"
+ range 0 10
+ default 3 if HIGHMEM64G
+ default 10 if 64BIT
+ default 7
+ depends on CGROUP_MEM_RES_CTLR
+ help
+ By making this value small, the memory wasted by page accounting
+ can be kept small, while a bigger value is better for performance.
+ HIGHMEM64G users especially should keep this small, because
+ kernel memory tends to be scarce in that configuration.
+ If unsure, use the default.
+
config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
depends on CPUSETS
* [Preview] [PATCH] radix tree based page cgroup [6/6] boost by per-cpu
From: KAMEZAWA Hiroyuki @ 2008-03-05 12:01 UTC
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka
This patch adds a per-cpu lookup cache for get_page_cgroup().
It works well when nearby pages are accessed repeatedly; a sketch of the
indexing scheme follows below.
TODO: add a flush routine.
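As an illustration, this is roughly how a cache hit is resolved.
cache_lookup_sketch() is a made-up name; the per-cpu structure,
PCGRP_SHIFT and PAGE_CGROUP_NR_CACHE are from this patch, and the
128-page/512KB figures assume the default CONFIG_CGROUP_PAGE_CGROUP_ORDER
of 7 with 4KB pages.

/*
 * Sketch only.  Each of the PAGE_CGROUP_NR_CACHE (8) direct-mapped
 * slots remembers one chunk of 1 << PCGRP_SHIFT (128) pages, i.e.
 * 512KB with 4KB pages, so the whole cache covers up to ~4MB of
 * recently touched pfns.
 */
static struct page_cgroup *cache_lookup_sketch(unsigned long pfn)
{
	struct page_cgroup_cache *pcp;
	unsigned long idx = pfn >> PCGRP_SHIFT;		/* chunk number */
	int hnum = idx & (PAGE_CGROUP_NR_CACHE - 1);	/* slot 0..7 */
	struct page_cgroup *ret = NULL;

	preempt_disable();
	pcp = &__get_cpu_var(pcpu_page_cgroup_cache);
	if (pcp->ents[hnum].idx == idx && pcp->ents[hnum].base)
		ret = pcp->ents[hnum].base + (pfn - (idx << PCGRP_SHIFT));
	preempt_enable();

	return ret;	/* NULL means fall back to __get_page_cgroup() */
}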
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
include/linux/page_cgroup.h | 37 +++++++++++++++++++++++++++++++++++--
mm/page_cgroup.c | 23 ++++++++++++++++++-----
2 files changed, 53 insertions(+), 7 deletions(-)
Index: linux-2.6.25-rc4/mm/page_cgroup.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/page_cgroup.c
+++ linux-2.6.25-rc4/mm/page_cgroup.c
@@ -17,11 +17,10 @@
#include <linux/memcontrol.h>
#include <linux/page_cgroup.h>
#include <linux/err.h>
+#include <linux/interrupt.h>
-
-#define PCGRP_SHIFT (CONFIG_CGROUP_PAGE_CGROUP_ORDER)
-#define PCGRP_SIZE (1 << PCGRP_SHIFT)
+DEFINE_PER_CPU(struct page_cgroup_cache, pcpu_page_cgroup_cache);
struct page_cgroup_head {
struct page_cgroup pc[PCGRP_SIZE];
@@ -71,6 +70,19 @@ void free_page_cgroup(struct page_cgroup
}
+static void save_result(struct page_cgroup *base, unsigned long idx)
+{
+ int hash = idx & (PAGE_CGROUP_NR_CACHE - 1);
+ struct page_cgroup_cache *pcp;
+ /* The lookup is done under preempt_disable(); do not call
+ this from interrupt context. */
+ preempt_disable();
+ pcp = &__get_cpu_var(pcpu_page_cgroup_cache);
+ pcp->ents[hash].idx = idx;
+ pcp->ents[hash].base = base;
+ preempt_enable();
+}
+
/*
* Look up page_cgroup struct for struct page (page's pfn)
* if (allocate == true), look up and allocate new one if necessary.
@@ -78,7 +90,7 @@ void free_page_cgroup(struct page_cgroup
*/
struct page_cgroup *
-get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
+__get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
{
struct page_cgroup_root *root;
struct page_cgroup_head *head;
@@ -107,8 +119,12 @@ retry:
head = radix_tree_lookup(&root->root_node, idx);
rcu_read_unlock();
- if (likely(head))
+ if (likely(head)) {
+ if (!in_interrupt())
+ save_result(&head->pc[0], idx);
return &head->pc[pfn - base_pfn];
+ }
+
if (allocate == false)
return NULL;
Index: linux-2.6.25-rc4/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.25-rc4.orig/include/linux/page_cgroup.h
+++ linux-2.6.25-rc4/include/linux/page_cgroup.h
@@ -24,6 +24,20 @@ struct page_cgroup {
#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache. */
#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* is on active list */
+/* per-cpu caching for fast access */
+#define PAGE_CGROUP_NR_CACHE (0x8)
+struct page_cgroup_cache {
+ struct {
+ unsigned long idx;
+ struct page_cgroup *base;
+ } ents[PAGE_CGROUP_NR_CACHE];
+};
+
+DECLARE_PER_CPU(struct page_cgroup_cache, pcpu_page_cgroup_cache);
+
+#define PCGRP_SHIFT (CONFIG_CGROUP_PAGE_CGROUP_ORDER)
+#define PCGRP_SIZE (1 << PCGRP_SHIFT)
+
/*
* Lookup and return page_cgroup struct.
* returns NULL when
@@ -32,9 +46,28 @@ struct page_cgroup {
* return -ENOMEM if cannot allocate memory.
* If allocate==false, gfpmask will be ignored as a result.
*/
-
struct page_cgroup *
-get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate);
+__get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate);
+
+static inline struct page_cgroup *
+get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
+{
+ unsigned long pfn = page_to_pfn(page);
+ struct page_cgroup_cache *pcp;
+ struct page_cgroup *ret;
+ unsigned long idx = pfn >> PCGRP_SHIFT;
+ int hnum = (idx) & (PAGE_CGROUP_NR_CACHE - 1);
+
+ preempt_disable();
+ pcp = &__get_cpu_var(pcpu_page_cgroup_cache);
+ if (pcp->ents[hnum].idx == idx && pcp->ents[hnum].base)
+ ret = pcp->ents[hnum].base + (pfn - (idx << PCGRP_SHIFT));
+ else
+ ret = NULL;
+ preempt_enable();
+
+ return (ret)? ret : __get_page_cgroup(page, gfpmask, allocate);
+}
#else
* [Preview] [PATCH] radix tree based page cgroup [4/6] migration
From: KAMEZAWA Hiroyuki @ 2008-03-06 0:28 UTC
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka
Sorry, this one had been sleeping on my PC...
For page migration.
Changes from the current code:
- adds a new argument to mem_cgroup_charge_common() so that a mem_cgroup
can be passed directly. This is used when mm is NULL.
- ignores the pc->refcnt == 0 case in prepare_migration
- uncharges the old page and charges the new page in page migration.
There is an algorithm change, so please review carefully; a condensed
sketch follows below.
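Roughly, the new mem_cgroup_page_migration() flow is the following
(a condensed sketch of the hunks below, declarations and diff noise
omitted):

	pc = get_page_cgroup(page, GFP_ATOMIC, false);
	if (!pc)
		return;			/* old page was never accounted */

	spin_lock_irqsave(&pc->lock, flags);
	if (pc->refcnt) {		/* old page is charged */
		mem = pc->mem_cgroup;
		type = (pc->flags & PAGE_CGROUP_FLAG_CACHE) ?
			MEM_CGROUP_CHARGE_TYPE_CACHE :
			MEM_CGROUP_CHARGE_TYPE_MAPPED;
		css_get(&mem->css);	/* keep mem alive across the uncharge */
	}
	spin_unlock_irqrestore(&pc->lock, flags);
	if (!mem)
		return;

	/* 1. drop the old page's charge */
	mem_cgroup_uncharge_page(page);
	/* 2. recharge the new page against the same mem_cgroup */
	mem_cgroup_charge_common(newpage, NULL, GFP_ATOMIC, type, mem);
	/* 3. drop the temporary reference */
	css_put(&mem->css);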
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
mm/memcontrol.c | 91 +++++++++++++++++++++++++++++++-------------------------
1 files changed, 51 insertions(+), 40 deletions(-)
Index: linux-2.6.25-rc4/mm/memcontrol.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/memcontrol.c
+++ linux-2.6.25-rc4/mm/memcontrol.c
@@ -480,7 +480,8 @@ unsigned long mem_cgroup_isolate_pages(u
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype)
+ gfp_t gfp_mask, enum charge_type ctype,
+ struct mem_cgroup *memcgrp)
{
struct mem_cgroup *mem;
struct page_cgroup *pc;
@@ -511,16 +512,21 @@ static int mem_cgroup_charge_common(stru
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (!mm)
+ if (!mm && !memcgrp) {
mm = &init_mm;
-
- rcu_read_lock();
- mem = rcu_dereference(mm->mem_cgroup);
- /*
- * For every charge from the cgroup, increment reference count
- */
- css_get(&mem->css);
- rcu_read_unlock();
+ }
+ if (mm) {
+ rcu_read_lock();
+ mem = rcu_dereference(mm->mem_cgroup);
+ /*
+ * For every charge from the cgroup, increment reference count
+ */
+ css_get(&mem->css);
+ rcu_read_unlock();
+ } else {
+ mem = memcgrp;
+ css_get(&mem->css);
+ }
while (res_counter_charge(&mem->res, PAGE_SIZE)) {
if (!(gfp_mask & __GFP_WAIT))
@@ -581,7 +587,7 @@ nomem:
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED);
+ MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
}
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -590,7 +596,7 @@ int mem_cgroup_cache_charge(struct page
if (!mm)
mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_CACHE);
+ MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
}
/*
@@ -637,13 +643,20 @@ void mem_cgroup_uncharge_page(struct pag
int mem_cgroup_prepare_migration(struct page *page)
{
struct page_cgroup *pc;
-
- lock_page_cgroup(page);
- pc = page_get_page_cgroup(page);
- if (pc)
- pc->ref_cnt++;
- unlock_page_cgroup(page);
- return pc != NULL;
+ int ret = 0;
+ unsigned long flags;
+ /* returns NULL if it does not exist */
+ pc = get_page_cgroup(page, GFP_ATOMIC, false);
+ if (pc == NULL)
+ return ret;
+
+ spin_lock_irqsave(&pc->lock, flags);
+ if (pc->refcnt) {
+ pc->refcnt++;
+ ret = 1;
+ }
+ spin_unlock_irqrestore(&pc->lock, flags);
+ return ret;
}
void mem_cgroup_end_migration(struct page *page)
@@ -655,38 +668,36 @@ void mem_cgroup_end_migration(struct pag
* We know both *page* and *newpage* are now not-on-LRU and PG_locked.
* And no race with uncharge() routines because page_cgroup for *page*
* has extra one reference by mem_cgroup_prepare_migration.
+ *
+ * This drops the charge on the old page and adds a new charge to the new page.
+ * mem_cgroup is copied.
*/
void mem_cgroup_page_migration(struct page *page, struct page *newpage)
{
struct page_cgroup *pc;
- struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *mem = NULL;
unsigned long flags;
+ enum charge_type type;
- lock_page_cgroup(page);
- pc = page_get_page_cgroup(page);
- if (!pc) {
- unlock_page_cgroup(page);
+ pc = get_page_cgroup(page, GFP_ATOMIC, false);
+ if (!pc)
return;
+ spin_lock_irqsave(&pc->lock, flags);
+ if (pc->refcnt) {
+ VM_BUG_ON(!pc->mem_cgroup);
+ mem = pc->mem_cgroup;
+ type = (pc->flags & PAGE_CGROUP_FLAG_CACHE)?
+ MEM_CGROUP_CHARGE_TYPE_CACHE :
+ MEM_CGROUP_CHARGE_TYPE_MAPPED;
+ css_get(&mem->css);
}
+ spin_unlock_irqrestore(&pc->lock, flags);
+ if (!mem)
+ return;
+ mem_cgroup_uncharge_page(page);
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_remove_list(pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
-
- page_assign_page_cgroup(page, NULL);
- unlock_page_cgroup(page);
-
- pc->page = newpage;
- lock_page_cgroup(newpage);
- page_assign_page_cgroup(newpage, pc);
-
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_add_list(pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
-
- unlock_page_cgroup(newpage);
+ mem_cgroup_charge_common(newpage, NULL, GFP_ATOMIC, type, mem);
+ css_put(&mem->css);
}
/*
* Re: [Preview] [PATCH] radix tree based page cgroup [0/6]
From: Hirokazu Takahashi @ 2008-03-06 10:03 UTC
To: kamezawa.hiroyu; +Cc: linux-mm, balbir, xemul, hugh, yamamoto
Hi,
> Hi, this is the latest version of radix-tree based page cgroup patch.
>
> I post this now because recent major changes are included in 2.6.25-rc4.
> (I admit I should do more tests on this set.)
>
> Almost all are rewritten and adjusted to rc4's logic.
> I feel this set is simpler than previous one.
>
> Patch series is following.
> [1/6] page cgroup definition
> [2/6] patch against charge/uncharge
> [3/6] patch against move_list
> [4/6] patch against migration
> [5/6] radix tree based page_cgroup
> [6/6] boost by per-cpu cache.
>
> * force_empty patch is dropped because it's unnecessary.
> * vmalloc patch is dropped. we always use kmalloc in this version.
>
> TODO:
> - add freeing page_cgroup routine. it seems necessary sometimes.
> (I have one and will be added to this set in the next post.)
I doubt page_cgroups can be freed effectively, since by the time you
need more free memory most pages are in use and each of them already
has its corresponding page_cgroup.
In that case, right after some page_cgroups are freed because the
corresponding pages are released, those pages get reallocated and
page_cgroups are reallocated and assigned to them again. It will only
give us meaningless overhead.
And I think it doesn't make sense to free page_cgroups to gain more
free memory when there is already plenty of free memory.
I guess a page_cgroup freeing routine will be useful when making
hugetlb pages.
> - Logic check again.
>
> Thanks,
> -Kame
Thanks,
Hirokazu Takahashi.
* Re: [Preview] [PATCH] radix tree based page cgroup [0/6]
From: KAMEZAWA Hiroyuki @ 2008-03-06 11:26 UTC
To: Hirokazu Takahashi; +Cc: linux-mm, balbir, xemul, hugh, yamamoto
On Thu, 06 Mar 2008 19:03:04 +0900 (JST)
Hirokazu Takahashi <taka@valinux.co.jp> wrote:
> I doubt page_cgroups can be freed effectively since most of the pages
> are used and each of them has its corresponding page_cgroup when you
> need more free memory.
>
> In this case, right after some page_cgroup freed when the corresponding
> pages are released, these pages are reallocated and page_cgroups are
> also reallocated and assigned to them. It will only give us meaningless
> overhead.
>
> And I think it doesn't make sense to free page_cgroups to make much more
> free memory if there are a lot of free memory,
>
> I guess freeing page_cgroup routine will be fine when making hugetlb
> pages.
>
This is the current version. I feel this is a reasonable and flexible approach now.
But of course, I need more tests.
==
This patch frees page_cgroup chunks when a large chunk of pages is freed.
Now under test; it works well so far.
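For context, the trigger condition added to the page allocator amounts to
the following (a sketch only; the numbers assume the default
CONFIG_CGROUP_PAGE_CGROUP_ORDER of 7 and MAX_ORDER of 11):

/*
 * Sketch of the trigger added to __free_one_page() below.  With
 * PCGRP_SHIFT == 7, PCGRP_SHRINK_ORDER is the smaller of
 * PCGRP_SHIFT + 3 and MAX_ORDER - 1, i.e. 10 on a typical x86 config,
 * so shrinking is attempted only when a buddy of at least 2^10 pages
 * (4MB with 4KB pages, eight page_cgroup chunks) is formed.
 */
if (order >= PCGRP_SHRINK_ORDER)
	try_to_shrink_page_cgroup(page, order);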
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
include/linux/page_cgroup.h | 12 +++++++++
mm/page_alloc.c | 3 ++
mm/page_cgroup.c | 54 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 69 insertions(+)
Index: linux-2.6.25-rc4/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.25-rc4.orig/include/linux/page_cgroup.h
+++ linux-2.6.25-rc4/include/linux/page_cgroup.h
@@ -38,6 +38,12 @@ DECLARE_PER_CPU(struct page_cgroup_cache
#define PCGRP_SHIFT (CONFIG_CGROUP_PAGE_CGROUP_ORDER)
#define PCGRP_SIZE (1 << PCGRP_SHIFT)
+#if PCGRP_SHIFT + 3 >= MAX_ORDER
+#define PCGRP_SHRINK_ORDER (MAX_ORDER - 1)
+#else
+#define PCGRP_SHRINK_ORDER (PCGRP_SHIFT + 3)
+#endif
+
/*
* Lookup and return page_cgroup struct.
* returns NULL when
@@ -69,6 +75,8 @@ get_page_cgroup(struct page *page, gfp_t
return (ret)? ret : __get_page_cgroup(page, gfpmask, allocate);
}
+void try_to_shrink_page_cgroup(struct page *page, int order);
+
#else
static struct page_cgroup *
@@ -76,5 +84,9 @@ get_page_cgroup(struct page *page, gfp_t
{
return NULL;
}
+static void try_to_shrink_page_cgroup(struct page *page, int order)
+{
+ return;
+}
#endif
#endif
Index: linux-2.6.25-rc4/mm/page_cgroup.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/page_cgroup.c
+++ linux-2.6.25-rc4/mm/page_cgroup.c
@@ -12,6 +12,7 @@
*/
#include <linux/mm.h>
+#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/radix-tree.h>
#include <linux/memcontrol.h>
@@ -80,6 +81,7 @@ static void save_result(struct page_cgro
pcp = &__get_cpu_var(pcpu_page_cgroup_cache);
pcp->ents[hash].idx = idx;
pcp->ents[hash].base = base;
+ smp_wmb();
preempt_enable();
}
@@ -156,6 +158,59 @@ out:
return pc;
}
+/* Must be called under zone->lock */
+void try_to_shrink_page_cgroup(struct page *page, int order)
+{
+ unsigned long pfn = page_to_pfn(page);
+ int nid = page_to_nid(page);
+ int idx = pfn >> PCGRP_SHIFT;
+ int hnum = (PAGE_CGROUP_NR_CACHE - 1);
+ struct page_cgroup_cache *pcp;
+ struct page_cgroup_head *head;
+ struct page_cgroup_root *root;
+ unsigned long end_pfn;
+ int cpu;
+
+
+ root = root_dir[nid];
+ if (!root || in_interrupt() || (order < PCGRP_SHIFT))
+ return;
+
+ pfn = page_to_pfn(page);
+ end_pfn = pfn + (1 << order);
+
+ while (pfn != end_pfn) {
+ idx = pfn >> PCGRP_SHIFT;
+ /* Does this pfn have an entry? */
+ rcu_read_lock();
+ head = radix_tree_lookup(&root->root_node, idx);
+ rcu_read_unlock();
+ if (!head) {
+ pfn += (1 << PCGRP_SHIFT);
+ continue;
+ }
+ /* It's guaranteed that no one accesses this pfn/idx
+ because there is no reference to this page. */
+ hnum = (idx) & (PAGE_CGROUP_NR_CACHE - 1);
+ for_each_online_cpu(cpu) {
+ pcp = &per_cpu(pcpu_page_cgroup_cache, cpu);
+ smp_rmb();
+ if (pcp->ents[hnum].idx == idx)
+ pcp->ents[hnum].base = NULL;
+ }
+ if (spin_trylock(&root->tree_lock)) {
+ /* The radix tree is freed via RCU, so readers will not
+ see freed entries right away. */
+ radix_tree_delete(&root->root_node, idx);
+ spin_unlock(&root->tree_lock);
+ /* We can free this in a lazy fashion. */
+ free_page_cgroup(head);
+ printk("free %ld\n",pfn);
+ }
+ pfn += (1 << PCGRP_SHIFT);
+ }
+}
+
__init int page_cgroup_init(void)
{
int nid;
Index: linux-2.6.25-rc4/mm/page_alloc.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/page_alloc.c
+++ linux-2.6.25-rc4/mm/page_alloc.c
@@ -45,6 +45,7 @@
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/memcontrol.h>
+#include <linux/page_cgroup.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -445,6 +446,8 @@ static inline void __free_one_page(struc
order++;
}
set_page_order(page, order);
+ if (order >= PCGRP_SHRINK_ORDER)
+ try_to_shrink_page_cgroup(page, order);
list_add(&page->lru,
&zone->free_area[order].free_list[migratetype]);
zone->free_area[order].nr_free++;