linux-mm.kvack.org archive mirror
* [Preview] [PATCH] radix tree based page cgroup [0/6]
@ 2008-03-05 11:51 KAMEZAWA Hiroyuki
  2008-03-05 11:55 ` [Preview] [PATCH] radix tree based page cgroup [1/6] KAMEZAWA Hiroyuki
                   ` (5 more replies)
  0 siblings, 6 replies; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05 11:51 UTC (permalink / raw)
  To: linux-mm; +Cc: balbir, xemul, hugh, yamamoto, taka

Hi, this is the latest version of radix-tree based page cgroup patch.

I post this now because recent major changes are included in 2.6.25-rc4.
(I admit I should do more tests on this set.)

Almost all of the patches are rewritten and adjusted to rc4's logic.
I feel this set is simpler than the previous one.

The patch series is as follows.
[1/6] page cgroup definition
[2/6] patch against charge/uncharge 
[3/6] patch against move_list
[4/6] patch against migration
[5/6] radix tree based page_cgroup
[6/6] boost by per-cpu cache.

 * The force_empty patch is dropped because it's unnecessary.
 * The vmalloc patch is dropped; we always use kmalloc in this version.

TODO:
  - Add a routine for freeing page_cgroups; it seems necessary sometimes.
    (I have one, and it will be added to this set in the next post.)
  - Check the logic again.

Thanks,
-Kame


 


* [Preview] [PATCH] radix tree based page cgroup [1/6]
  2008-03-05 11:51 [Preview] [PATCH] radix tree based page cgroup [0/6] KAMEZAWA Hiroyuki
@ 2008-03-05 11:55 ` KAMEZAWA Hiroyuki
  2008-03-05 11:57 ` [Preview] [PATCH] radix tree based page cgroup [2/6] charge and uncharge KAMEZAWA Hiroyuki
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05 11:55 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka

 * Export the page_cgroup definition.
 * Remove the page_cgroup member from struct page.
 * As a result, PAGE_CGROUP_LOCK_BIT and the assign/access functions are removed.

Other changes will appear in the following patches.
There is one change in the structure itself: a spinlock is added.
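
For reference, here is a minimal caller sketch of the new locking (illustrative only, not part of this patch; get_page_cgroup() is introduced later in this series, and the helper name example_mark_active is made up):

#include <linux/mm.h>
#include <linux/err.h>
#include <linux/page_cgroup.h>

static void example_mark_active(struct page *page)
{
	struct page_cgroup *pc;
	unsigned long flags;

	/* Lookup only, no allocation; NULL means the page is not accounted. */
	pc = get_page_cgroup(page, GFP_ATOMIC, false);
	if (!pc || IS_ERR(pc))
		return;

	/* pc->lock replaces the old per-page bit spinlock. */
	spin_lock_irqsave(&pc->lock, flags);
	if (pc->refcnt)			/* page is currently accounted */
		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
	spin_unlock_irqrestore(&pc->lock, flags);
}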

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


 include/linux/memcontrol.h  |   11 --------
 include/linux/mm_types.h    |    3 --
 include/linux/page_cgroup.h |   47 +++++++++++++++++++++++++++++++++++
 mm/memcontrol.c             |   59 --------------------------------------------
 4 files changed, 48 insertions(+), 72 deletions(-)

Index: linux-2.6.25-rc4/include/linux/page_cgroup.h
===================================================================
--- /dev/null
+++ linux-2.6.25-rc4/include/linux/page_cgroup.h
@@ -0,0 +1,47 @@
+#ifndef __LINUX_PAGE_CGROUP_H
+#define __LINUX_PAGE_CGROUP_H
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/*
+ * page_cgroup is yet another mem_map-like structure for accounting usage.
+ * But, unlike mem_map, it is allocated on demand for accounted pages.
+ * See also memcontrol.h.
+ * By nature, this consumes a large amount of memory.
+ */
+
+struct mem_cgroup;
+
+struct page_cgroup {
+	struct page 		*page;       /* the page this accounts for */
+	struct mem_cgroup 	*mem_cgroup; /* current cgroup subsys */
+	int    			flags;	     /* See below */
+	int    			refcnt;      /* reference count */
+	spinlock_t		lock;        /* lock for all above members */
+	struct list_head 	lru;         /* for per cgroup LRU */
+};
+
+/* flags */
+#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache. */
+#define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* is on active list */
+
+/*
+ * Lookup and return page_cgroup struct.
+ * returns NULL when
+ * 1. Page Cgroup is not activated yet.
+ * 2. cannot lookup entry and allocate was false.
+ * return -ENOMEM if cannot allocate memory.
+ * If allocate==false, gfpmask will be ignored as a result.
+ */
+
+struct page_cgroup *
+get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate);
+
+#else
+
+static struct page_cgroup *
+get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
+{
+	return NULL;
+}
+#endif
+#endif
Index: linux-2.6.25-rc4/include/linux/mm_types.h
===================================================================
--- linux-2.6.25-rc4.orig/include/linux/mm_types.h
+++ linux-2.6.25-rc4/include/linux/mm_types.h
@@ -88,9 +88,6 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	unsigned long page_cgroup;
-#endif
 };
 
 /*
Index: linux-2.6.25-rc4/mm/memcontrol.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/memcontrol.c
+++ linux-2.6.25-rc4/mm/memcontrol.c
@@ -30,6 +30,7 @@
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/uaccess.h>
 
@@ -139,33 +140,6 @@ struct mem_cgroup {
 };
 static struct mem_cgroup init_mem_cgroup;
 
-/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock.  We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin).  But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT 	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-	struct list_head lru;		/* per cgroup LRU list */
-	struct page *page;
-	struct mem_cgroup *mem_cgroup;
-	int ref_cnt;			/* cached, mapped, migrating */
-	int flags;
-};
-#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
 
 static int page_cgroup_nid(struct page_cgroup *pc)
 {
@@ -256,37 +230,6 @@ void mm_free_cgroup(struct mm_struct *mm
 	css_put(&mm->mem_cgroup->css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
-{
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
-{
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
-}
-
-struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
-}
-
-static void lock_page_cgroup(struct page *page)
-{
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static int try_lock_page_cgroup(struct page *page)
-{
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void unlock_page_cgroup(struct page *page)
-{
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
 static void __mem_cgroup_remove_list(struct page_cgroup *pc)
 {
 	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
Index: linux-2.6.25-rc4/include/linux/memcontrol.h
===================================================================
--- linux-2.6.25-rc4.orig/include/linux/memcontrol.h
+++ linux-2.6.25-rc4/include/linux/memcontrol.h
@@ -30,9 +30,6 @@ struct mm_struct;
 extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
 extern void mm_free_cgroup(struct mm_struct *mm);
 
-#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
-
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -82,14 +79,6 @@ static inline void mm_free_cgroup(struct
 {
 }
 
-static inline void page_reset_bad_cgroup(struct page *page)
-{
-}
-
-static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return NULL;
-}
 
 static inline int mem_cgroup_charge(struct page *page,
 					struct mm_struct *mm, gfp_t gfp_mask)
Index: linux-2.6.25-rc4/mm/page_alloc.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/page_alloc.c
+++ linux-2.6.25-rc4/mm/page_alloc.c
@@ -222,17 +222,11 @@ static inline int bad_range(struct zone 
 
 static void bad_page(struct page *page)
 {
-	void *pc = page_get_page_cgroup(page);
-
 	printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
 		"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
 		current->comm, page, (int)(2*sizeof(unsigned long)),
 		(unsigned long)page->flags, page->mapping,
 		page_mapcount(page), page_count(page));
-	if (pc) {
-		printk(KERN_EMERG "cgroup:%p\n", pc);
-		page_reset_bad_cgroup(page);
-	}
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
 		KERN_EMERG "Backtrace:\n");
 	dump_stack();
@@ -460,7 +454,6 @@ static inline int free_pages_check(struc
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
-		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
 		(page->flags & (
 			1 << PG_lru	|
@@ -610,7 +603,6 @@ static int prep_new_page(struct page *pa
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
-		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
 		(page->flags & (
 			1 << PG_lru	|


* [Preview] [PATCH] radix tree based page cgroup [2/6] charge and uncharge
  2008-03-05 11:51 [Preview] [PATCH] radix tree based page cgroup [0/6] KAMEZAWA Hiroyuki
  2008-03-05 11:55 ` [Preview] [PATCH] radix tree based page cgroup [1/6] KAMEZAWA Hiroyuki
@ 2008-03-05 11:57 ` KAMEZAWA Hiroyuki
  2008-03-05 11:57 ` [Preview] [PATCH] radix tree based page cgroup [3/6] move_lists KAMEZAWA Hiroyuki
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05 11:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka

Changes in core logic: charge and uncharge.

Because the bit spinlock is removed and a spinlock is added to page_cgroup,
there is a fair amount of change.

This patch:
	- modifies charge/uncharge
	- removes the add_list/remove_list functions; just adds stat functions
	- adds simple lock rule comments.

The major change from the current (rc4) version is:
	- pc->refcnt is set to 1 after the charge is done.
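
For readability, here is the new charge path boiled down to its locking skeleton (a sketch distilled from the diff below; variable declarations, the reclaim loop, and most error handling are trimmed):

	pc = get_page_cgroup(page, gfp_mask, true);	/* may allocate on demand */
	if (!pc || IS_ERR(pc))
		return PTR_ERR(pc);

	spin_lock_irqsave(&pc->lock, flags);
	if (pc->refcnt > 0) {			/* already accounted: just add a reference */
		pc->refcnt++;
		spin_unlock_irqrestore(&pc->lock, flags);
		return 0;
	}
	spin_unlock_irqrestore(&pc->lock, flags);

	/* ... charge mem->res here; this may sleep/reclaim, pc->lock is dropped ... */

	spin_lock_irqsave(&pc->lock, flags);
	if (pc->refcnt) {			/* lost the race: someone charged meanwhile */
		pc->refcnt++;
		spin_unlock_irqrestore(&pc->lock, flags);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		return 0;
	}
	pc->refcnt = 1;				/* we own the first charge */
	pc->mem_cgroup = mem;
	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
	mz = page_cgroup_zoneinfo(pc);
	spin_lock(&mz->lru_lock);		/* nested inside pc->lock, irqs already off */
	__mem_cgroup_add_list(pc);
	spin_unlock(&mz->lru_lock);
	spin_unlock_irqrestore(&pc->lock, flags);
	return 0;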

Changelog
  - Rebased to rc4

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


 mm/memcontrol.c |  136 +++++++++++++++++++++++++-------------------------------
 1 file changed, 62 insertions(+), 74 deletions(-)

Index: linux-2.6.25-rc4/mm/memcontrol.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/memcontrol.c
+++ linux-2.6.25-rc4/mm/memcontrol.c
@@ -34,6 +34,16 @@
 
 #include <asm/uaccess.h>
 
+/*
+ * Lock Rule
+ * zone->lru_lock (global LRU)
+ *	-> pc->lock (page_cgroup's lock)
+ *		-> mz->lru_lock (mem_cgroup's per-zone lock)
+ *
+ * At least, mz->lru_lock and pc->lock must be acquired with irqs off.
+ *
+ */
+
 struct cgroup_subsys mem_cgroup_subsys;
 static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
 
@@ -476,33 +486,22 @@ static int mem_cgroup_charge_common(stru
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
 
+	pc = get_page_cgroup(page, gfp_mask, true);
+	if (!pc || IS_ERR(pc))
+		return PTR_ERR(pc);
+
+	spin_lock_irqsave(&pc->lock, flags);
 	/*
-	 * Should page_cgroup's go to their own slab?
-	 * One could optimize the performance of the charging routine
-	 * by saving a bit in the page_flags and using it as a lock
-	 * to see if the cgroup page already has a page_cgroup associated
-	 * with it
-	 */
-retry:
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	/*
-	 * The page_cgroup exists and
-	 * the page has already been accounted.
+	 * Has the page already been accounted ?
 	 */
-	if (pc) {
-		VM_BUG_ON(pc->page != page);
-		VM_BUG_ON(pc->ref_cnt <= 0);
-
-		pc->ref_cnt++;
-		unlock_page_cgroup(page);
-		goto done;
+	if (pc->refcnt > 0) {
+		pc->refcnt++;
+		spin_unlock_irqrestore(&pc->lock, flags);
+		goto success;
 	}
-	unlock_page_cgroup(page);
+	spin_unlock_irqrestore(&pc->lock, flags);
 
-	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
-	if (pc == NULL)
-		goto err;
+	/* Note: pc->refcnt is still 0 here. */
 
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
@@ -523,7 +522,7 @@ retry:
 
 	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
 		if (!(gfp_mask & __GFP_WAIT))
-			goto out;
+			goto nomem;
 
 		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
 			continue;
@@ -540,45 +539,40 @@ retry:
 
 		if (!nr_retries--) {
 			mem_cgroup_out_of_memory(mem, gfp_mask);
-			goto out;
+			goto nomem;
 		}
 		congestion_wait(WRITE, HZ/10);
 	}
-
-	pc->ref_cnt = 1;
+	/*
+	 * We have to acquire two spinlocks here.
+	 */
+	spin_lock_irqsave(&pc->lock, flags);
+	if (pc->refcnt) {
+		/* Someone charged this page while we released the lock */
+		++pc->refcnt;
+		spin_unlock_irqrestore(&pc->lock, flags);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		css_put(&mem->css);
+		goto success;
+	}
+	/* No one else touches this. */
+	VM_BUG_ON(pc->mem_cgroup);
+	VM_BUG_ON(!list_empty(&pc->lru));
+	pc->refcnt = 1;
 	pc->mem_cgroup = mem;
-	pc->page = page;
 	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
 		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
-
-	lock_page_cgroup(page);
-	if (page_get_page_cgroup(page)) {
-		unlock_page_cgroup(page);
-		/*
-		 * Another charge has been added to this page already.
-		 * We take lock_page_cgroup(page) again and read
-		 * page->cgroup, increment refcnt.... just retry is OK.
-		 */
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		css_put(&mem->css);
-		kfree(pc);
-		goto retry;
-	}
-	page_assign_page_cgroup(page, pc);
-
 	mz = page_cgroup_zoneinfo(pc);
-	spin_lock_irqsave(&mz->lru_lock, flags);
+	spin_lock(&mz->lru_lock);
 	__mem_cgroup_add_list(pc);
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	spin_unlock(&mz->lru_lock);
+	spin_unlock_irqrestore(&pc->lock, flags);
 
-	unlock_page_cgroup(page);
-done:
+success:
 	return 0;
-out:
+nomem:
 	css_put(&mem->css);
-	kfree(pc);
-err:
 	return -ENOMEM;
 }
 
@@ -611,33 +605,27 @@ void mem_cgroup_uncharge_page(struct pag
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
+	pc = get_page_cgroup(page, GFP_ATOMIC, false); /* No allocation */
 	if (!pc)
-		goto unlock;
-
-	VM_BUG_ON(pc->page != page);
-	VM_BUG_ON(pc->ref_cnt <= 0);
-
-	if (--(pc->ref_cnt) == 0) {
-		mz = page_cgroup_zoneinfo(pc);
-		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_remove_list(pc);
-		spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-		page_assign_page_cgroup(page, NULL);
-		unlock_page_cgroup(page);
-
-		mem = pc->mem_cgroup;
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		css_put(&mem->css);
-
-		kfree(pc);
+		return;
+	spin_lock_irqsave(&pc->lock, flags);
+	if (!pc->refcnt || --pc->refcnt > 0) {
+		spin_unlock_irqrestore(&pc->lock, flags);
 		return;
 	}
+	VM_BUG_ON(pc->page != page);
+	mz = page_cgroup_zoneinfo(pc);
+	mem = pc->mem_cgroup;
 
-unlock:
-	unlock_page_cgroup(page);
+	spin_lock(&mz->lru_lock);
+	__mem_cgroup_remove_list(pc);
+	spin_unlock(&mz->lru_lock);
+
+	pc->flags = 0;
+	pc->mem_cgroup = NULL;
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	css_put(&mem->css);
+	spin_unlock_irqrestore(&pc->lock, flags);
 }
 
 /*


* [Preview] [PATCH] radix tree based page cgroup [3/6] move_lists
  2008-03-05 11:51 [Preview] [PATCH] radix tree based page cgroup [0/6] KAMEZAWA Hiroyuki
  2008-03-05 11:55 ` [Preview] [PATCH] radix tree based page cgroup [1/6] KAMEZAWA Hiroyuki
  2008-03-05 11:57 ` [Preview] [PATCH] radix tree based page cgroup [2/6] charge and uncharge KAMEZAWA Hiroyuki
@ 2008-03-05 11:57 ` KAMEZAWA Hiroyuki
  2008-03-05 12:01   ` [Preview] [PATCH] radix tree based page cgroup [6/6] boost by per-cpu KAMEZAWA Hiroyuki
  2008-03-05 12:00 ` [Preview] [PATCH] radix tree based page cgroup [5/6] radix-tree-page-cgroup KAMEZAWA Hiroyuki
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05 11:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka

Modifies mem_cgroup_move_lists() to use get_page_cgroup().
No major algorithm changes.
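
Roughly, the function ends up looking like this (a sketch reconstructed from the diff below, not the literal post-patch source):

void mem_cgroup_move_lists(struct page *page, bool active)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;
	unsigned long flags;

	/* Lookup only; the GFP mask is ignored when allocate == false. */
	pc = get_page_cgroup(page, GFP_ATOMIC, false);
	if (!pc)
		return;
	/* Called under zone's lru_lock, so only trylock pc->lock. */
	if (!spin_trylock_irqsave(&pc->lock, flags))
		return;
	if (pc->refcnt) {			/* still accounted to some cgroup */
		mz = page_cgroup_zoneinfo(pc);
		spin_lock(&mz->lru_lock);
		__mem_cgroup_move_lists(pc, active);
		spin_unlock(&mz->lru_lock);
	}
	spin_unlock_irqrestore(&pc->lock, flags);
}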

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

 mm/memcontrol.c |   16 +++++++++-------
 1 files changed, 9 insertions(+), 7 deletions(-)

Index: linux-2.6.25-rc4/mm/memcontrol.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/memcontrol.c
+++ linux-2.6.25-rc4/mm/memcontrol.c
@@ -309,6 +309,10 @@ void mem_cgroup_move_lists(struct page *
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags;
 
+	/* This GFP mask will be ignored (allocate == false). */
+	pc = get_page_cgroup(page, GFP_ATOMIC, false);
+	if (!pc)
+		return;
 	/*
 	 * We cannot lock_page_cgroup while holding zone's lru_lock,
 	 * because other holders of lock_page_cgroup can be interrupted
@@ -316,17 +320,15 @@ void mem_cgroup_move_lists(struct page *
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	if (!spin_trylock_irqsave(&pc->lock, flags))
 		return;
-
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	if (pc->refcnt) {
 		mz = page_cgroup_zoneinfo(pc);
-		spin_lock_irqsave(&mz->lru_lock, flags);
+		spin_lock(&mz->lru_lock);
 		__mem_cgroup_move_lists(pc, active);
-		spin_unlock_irqrestore(&mz->lru_lock, flags);
+		spin_unlock(&mz->lru_lock);
 	}
-	unlock_page_cgroup(page);
+	spin_unlock_irqrestore(&pc->lock, flags);
 }
 
 /*


* [Preview] [PATCH] radix tree based page cgroup [5/6] radix-tree-page-cgroup
  2008-03-05 11:51 [Preview] [PATCH] radix tree based page cgroup [0/6] KAMEZAWA Hiroyuki
                   ` (2 preceding siblings ...)
  2008-03-05 11:57 ` [Preview] [PATCH] radix tree based page cgroup [3/6] move_lists KAMEZAWA Hiroyuki
@ 2008-03-05 12:00 ` KAMEZAWA Hiroyuki
  2008-03-06  0:28 ` [Preview] [PATCH] radix tree based page cgroup [4/6] migration KAMEZAWA Hiroyuki
  2008-03-06 10:03 ` [Preview] [PATCH] radix tree based page cgroup [0/6] Hirokazu Takahashi
  5 siblings, 0 replies; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05 12:00 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka

A lookup routine for page_cgroup struct.

Now, page_cgroup is pointed to by struct page's page_cgroup member:

struct page {
	...
	struct page_cgroup *page_cgroup;
	..
}

But some people dislike this because it increases sizeof(struct page).

To avoid that, we'll have to add a lookup routine for
	pfn <-> page_cgroup
using a radix tree.

The new function is

struct page_cgroup *get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate);

If allocate == true, look up and allocate a new one if necessary.
If allocate == false, just look up and return NULL if it doesn't exist.
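
For illustration, a typical caller would look roughly like this (hypothetical sketch, not taken from the patch):

	/* Charge path: allocate the page_cgroup chunk on demand if needed. */
	pc = get_page_cgroup(page, gfp_mask, true);
	if (IS_ERR(pc))
		return PTR_ERR(pc);	/* -ENOMEM */
	if (!pc)
		return 0;		/* accounting is not activated yet */

	/* Uncharge/lookup-only path: never allocates, the gfp mask is ignored. */
	pc = get_page_cgroup(page, GFP_ATOMIC, false);
	if (!pc)
		return;			/* this page was never accounted */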

Changes:
  - added the 3rd argument, 'allocate'
  - made the page_cgroup chunk size configurable (for testing)


Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

 init/Kconfig     |   14 ++++
 mm/Makefile      |    2 
 mm/page_cgroup.c |  163 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 178 insertions(+), 1 deletion(-)

Index: linux-2.6.25-rc4/mm/page_cgroup.c
===================================================================
--- /dev/null
+++ linux-2.6.25-rc4/mm/page_cgroup.c
@@ -0,0 +1,169 @@
+/*
+ * page_cgroup management code.
+ * page_cgroup is yet another mem_map when the cgroup memory resource
+ * controller is activated. It contains information which cannot be stored
+ * in the usual mem_map (it would be too big).
+ * This allows us to keep 'struct page' small when a user doesn't activate
+ * the memory resource controller.
+ *
+ * Note: everything is allocated on demand.
+ *
+ * We can translate: struct page <-> pfn -> page_cgroup -> struct page.
+ */
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/radix-tree.h>
+#include <linux/memcontrol.h>
+#include <linux/page_cgroup.h>
+#include <linux/err.h>
+
+
+
+#define PCGRP_SHIFT	(CONFIG_CGROUP_PAGE_CGROUP_ORDER)
+#define PCGRP_SIZE	(1 << PCGRP_SHIFT)
+
+struct page_cgroup_head {
+	struct page_cgroup pc[PCGRP_SIZE];
+};
+
+struct page_cgroup_root {
+	spinlock_t	       tree_lock;
+	struct radix_tree_root root_node;
+};
+
+static struct page_cgroup_root *root_dir[MAX_NUMNODES];
+
+static void init_page_cgroup(struct page_cgroup_head *head, unsigned long pfn)
+{
+	int i;
+	struct page_cgroup *pc;
+
+	memset(head, 0, sizeof(*head));
+	for (i = 0; i < PCGRP_SIZE; ++i) {
+		pc = &head->pc[i];
+		pc->page = pfn_to_page(pfn + i);
+		spin_lock_init(&pc->lock);
+		INIT_LIST_HEAD(&pc->lru);
+	}
+}
+
+
+struct kmem_cache *page_cgroup_cachep;
+
+static struct page_cgroup_head *
+alloc_init_page_cgroup(unsigned long pfn, int nid, gfp_t mask)
+{
+	struct page_cgroup_head *head;
+
+	head = kmem_cache_alloc_node(page_cgroup_cachep, mask, nid);
+	if (!head)
+		return NULL;
+
+	init_page_cgroup(head, pfn);
+
+	return head;
+}
+
+void free_page_cgroup(struct page_cgroup_head *head)
+{
+	kmem_cache_free(page_cgroup_cachep, head);
+}
+
+
+/*
+ * Look up page_cgroup struct for struct page (page's pfn)
+ * if (allocate == true), look up and allocate new one if necessary.
+ * if (allocate == false), look up and return NULL if it cannot be found.
+ */
+
+struct page_cgroup *
+get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
+{
+	struct page_cgroup_root *root;
+	struct page_cgroup_head *head;
+	struct page_cgroup *pc;
+	unsigned long pfn, idx;
+	int nid;
+	unsigned long base_pfn, flags;
+	int error;
+	
+	if (!page)
+		return NULL;
+
+	pfn = page_to_pfn(page);
+	idx = pfn >> PCGRP_SHIFT;
+	nid = page_to_nid(page);
+
+	root = root_dir[nid];
+	/* Before Init ? */
+	if (unlikely(!root))
+		return NULL;
+
+	base_pfn = idx << PCGRP_SHIFT;
+retry:
+	error = 0;
+	rcu_read_lock();
+	head = radix_tree_lookup(&root->root_node, idx);
+	rcu_read_unlock();
+
+	if (likely(head))
+		return &head->pc[pfn - base_pfn];
+	if (allocate == false)
+		return NULL;
+
+	/* Very Slow Path. On demand allocation. */
+	gfpmask = gfpmask & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
+
+	head = alloc_init_page_cgroup(base_pfn, nid, gfpmask);
+	if (!head)
+		return ERR_PTR(-ENOMEM);
+	pc = NULL;
+	error = radix_tree_preload(gfpmask);
+	if (error)
+		goto out;
+	spin_lock_irqsave(&root->tree_lock, flags);
+	error = radix_tree_insert(&root->root_node, idx, head);
+
+	if (!error)
+		pc = &head->pc[pfn - base_pfn];
+	spin_unlock_irqrestore(&root->tree_lock, flags);
+	radix_tree_preload_end();
+out:
+	if (!pc) {
+		free_page_cgroup(head);
+		if (error == -EEXIST)
+			goto retry;
+	}
+	if (error)
+		pc = ERR_PTR(error);
+	return pc;
+}
+
+__init int page_cgroup_init(void)
+{
+	int nid;
+	struct page_cgroup_root *root;
+
+	page_cgroup_cachep = kmem_cache_create("page_cgroup",
+				sizeof(struct page_cgroup_head), 0,
+				SLAB_PANIC | SLAB_DESTROY_BY_RCU, NULL);
+	if (!page_cgroup_cachep) {
+		printk(KERN_ERR "page accounting setup failure\n");
+		printk(KERN_ERR "can't initialize slab memory\n");
+		/* FIX ME: should return some error code ? */
+		return 0;
+	}
+	for_each_node(nid) {
+		root = kmalloc_node(sizeof(struct page_cgroup_root),
+					GFP_KERNEL, nid);
+		INIT_RADIX_TREE(&root->root_node, GFP_ATOMIC);
+		spin_lock_init(&root->tree_lock);
+		smp_wmb();
+		root_dir[nid] = root;
+	}
+
+	printk(KERN_INFO "Page accounting is activated\n");
+	return 0;
+}
+late_initcall(page_cgroup_init);
Index: linux-2.6.25-rc4/mm/Makefile
===================================================================
--- linux-2.6.25-rc4.orig/mm/Makefile
+++ linux-2.6.25-rc4/mm/Makefile
@@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 
Index: linux-2.6.25-rc4/init/Kconfig
===================================================================
--- linux-2.6.25-rc4.orig/init/Kconfig
+++ linux-2.6.25-rc4/init/Kconfig
@@ -407,6 +407,20 @@ config SYSFS_DEPRECATED_V2
 	  If you are using a distro with the most recent userspace
 	  packages, it should be safe to say N here.
 
+config CGROUP_PAGE_CGROUP_ORDER
+	int "Order of page accounting subsystem"
+	range 0 10
+	default 3 if HIGHMEM64G
+	default 10 if 64BIT
+	default 7
+	depends on CGROUP_MEM_RES_CTLR
+	help
+	  By making this value small, the memory wasted by page accounting
+	  can be kept small, but a bigger number is better for performance.
+	  HIGHMEM64G users especially should keep this small because kernel
+	  memory tends to be limited in that configuration.
+	  If unsure, use the default.
+
 config PROC_PID_CPUSET
 	bool "Include legacy /proc/<pid>/cpuset file"
 	depends on CPUSETS


* [Preview] [PATCH] radix tree based page cgroup [6/6] boost by per-cpu
  2008-03-05 11:57 ` [Preview] [PATCH] radix tree based page cgroup [3/6] move_lists KAMEZAWA Hiroyuki
@ 2008-03-05 12:01   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05 12:01 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka

This patch adds a per-cpu lookup cache for get_page_cgroup().
It works well when nearby pages are accessed consecutively.
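
In effect the cache is a small, direct-mapped, per-cpu table keyed by chunk index. A sketch of the fast path (mirroring the inline helper below, assuming CONFIG_CGROUP_PAGE_CGROUP_ORDER=7, i.e. 128 pages per chunk):

	/* pcp = this cpu's struct page_cgroup_cache, under preempt_disable() */
	pfn  = page_to_pfn(page);
	idx  = pfn >> PCGRP_SHIFT;			/* chunk index, e.g. pfn >> 7 */
	hnum = idx & (PAGE_CGROUP_NR_CACHE - 1);	/* one of 8 per-cpu slots */

	/* Hit: the slot caches this chunk's first page_cgroup. */
	if (pcp->ents[hnum].idx == idx && pcp->ents[hnum].base)
		return pcp->ents[hnum].base + (pfn - (idx << PCGRP_SHIFT));

	/* Miss: fall back to the radix-tree lookup. */
	return __get_page_cgroup(page, gfpmask, allocate);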

TODO: add flush routine.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


 include/linux/page_cgroup.h |   37 +++++++++++++++++++++++++++++++++++--
 mm/page_cgroup.c            |   23 ++++++++++++++++++-----
 2 files changed, 53 insertions(+), 7 deletions(-)

Index: linux-2.6.25-rc4/mm/page_cgroup.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/page_cgroup.c
+++ linux-2.6.25-rc4/mm/page_cgroup.c
@@ -17,11 +17,10 @@
 #include <linux/memcontrol.h>
 #include <linux/page_cgroup.h>
 #include <linux/err.h>
+#include <linux/interrupt.h>
 
 
-
-#define PCGRP_SHIFT	(CONFIG_CGROUP_PAGE_CGROUP_ORDER)
-#define PCGRP_SIZE	(1 << PCGRP_SHIFT)
+DEFINE_PER_CPU(struct page_cgroup_cache, pcpu_page_cgroup_cache);
 
 struct page_cgroup_head {
 	struct page_cgroup pc[PCGRP_SIZE];
@@ -71,6 +70,19 @@ void free_page_cgroup(struct page_cgroup
 }
 
 
+static void save_result(struct page_cgroup  *base, unsigned long idx)
+{
+	int hash = idx & (PAGE_CGROUP_NR_CACHE - 1);
+	struct page_cgroup_cache *pcp;
+	/* Lookups are done under preempt_disable(), so don't call
+	   this from interrupt context. */
+	preempt_disable();
+	pcp = &__get_cpu_var(pcpu_page_cgroup_cache);
+	pcp->ents[hash].idx = idx;
+	pcp->ents[hash].base = base;
+	preempt_enable();
+}
+
 /*
  * Look up page_cgroup struct for struct page (page's pfn)
  * if (allocate == true), look up and allocate new one if necessary.
@@ -78,7 +90,7 @@ void free_page_cgroup(struct page_cgroup
  */
 
 struct page_cgroup *
-get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
+__get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
 {
 	struct page_cgroup_root *root;
 	struct page_cgroup_head *head;
@@ -107,8 +119,12 @@ retry:
 	head = radix_tree_lookup(&root->root_node, idx);
 	rcu_read_unlock();
 
-	if (likely(head))
+	if (likely(head)) {
+		if (!in_interrupt())
+			save_result(&head->pc[0], idx);
 		return &head->pc[pfn - base_pfn];
+	}
+
 	if (allocate == false)
 		return NULL;
 
Index: linux-2.6.25-rc4/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.25-rc4.orig/include/linux/page_cgroup.h
+++ linux-2.6.25-rc4/include/linux/page_cgroup.h
@@ -24,6 +24,20 @@ struct page_cgroup {
 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache. */
 #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* is on active list */
 
+/* per-cpu caching for fast access */
+#define PAGE_CGROUP_NR_CACHE	(0x8)
+struct page_cgroup_cache {
+	struct {
+		unsigned long idx;
+		struct page_cgroup *base;
+	} ents[PAGE_CGROUP_NR_CACHE];
+};
+
+DECLARE_PER_CPU(struct page_cgroup_cache, pcpu_page_cgroup_cache);
+
+#define PCGRP_SHIFT	(CONFIG_CGROUP_PAGE_CGROUP_ORDER)
+#define PCGRP_SIZE	(1 << PCGRP_SHIFT)
+
 /*
  * Lookup and return page_cgroup struct.
  * returns NULL when
@@ -32,9 +46,28 @@ struct page_cgroup {
  * return -ENOMEM if cannot allocate memory.
  * If allocate==false, gfpmask will be ignored as a result.
  */
-
 struct page_cgroup *
-get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate);
+__get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate);
+
+static inline struct page_cgroup *
+get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
+{
+	unsigned long pfn = page_to_pfn(page);
+	struct page_cgroup_cache *pcp;
+	struct page_cgroup *ret;
+	unsigned long idx = pfn >> PCGRP_SHIFT;
+	int hnum = (idx) & (PAGE_CGROUP_NR_CACHE - 1);
+
+	preempt_disable();
+	pcp = &__get_cpu_var(pcpu_page_cgroup_cache);
+	if (pcp->ents[hnum].idx == idx && pcp->ents[hnum].base)
+		ret = pcp->ents[hnum].base + (pfn - (idx << PCGRP_SHIFT));
+	else
+		ret = NULL;
+	preempt_enable();
+
+	return (ret)? ret : __get_page_cgroup(page, gfpmask, allocate);
+}
 
 #else
 


* [Preview] [PATCH] radix tree based page cgroup [4/6] migration
  2008-03-05 11:51 [Preview] [PATCH] radix tree based page cgroup [0/6] KAMEZAWA Hiroyuki
                   ` (3 preceding siblings ...)
  2008-03-05 12:00 ` [Preview] [PATCH] radix tree based page cgroup [5/6] radix-tree-page-cgroup KAMEZAWA Hiroyuki
@ 2008-03-06  0:28 ` KAMEZAWA Hiroyuki
  2008-03-06 10:03 ` [Preview] [PATCH] radix tree based page cgroup [0/6] Hirokazu Takahashi
  5 siblings, 0 replies; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-06  0:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, xemul, hugh, yamamoto, taka

Sorry, this one has been sleeping on my PC...

For page migration.

Changes from the current code:
  - adds a new argument to mem_cgroup_charge_common() to pass the mem_cgroup
    itself. This is used when mm is NULL.
  - ignores the pc->refcnt == 0 case in prepare_migration
  - uncharges the old page and adds a new charge to the new page during
    page migration.

There is an algorithm change, so please review this one carefully.
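
In outline, the new mem_cgroup_page_migration() does this (a sketch distilled from the diff below; error paths trimmed):

	struct mem_cgroup *mem = NULL;	/* stays NULL if the page is not charged */

	pc = get_page_cgroup(page, GFP_ATOMIC, false);	/* lookup only */
	if (!pc)
		return;

	spin_lock_irqsave(&pc->lock, flags);
	if (pc->refcnt) {		/* still charged; prepare_migration holds a reference */
		mem  = pc->mem_cgroup;
		type = (pc->flags & PAGE_CGROUP_FLAG_CACHE) ?
			MEM_CGROUP_CHARGE_TYPE_CACHE : MEM_CGROUP_CHARGE_TYPE_MAPPED;
		css_get(&mem->css);	/* keep the mem_cgroup alive across the uncharge */
	}
	spin_unlock_irqrestore(&pc->lock, flags);
	if (!mem)
		return;

	mem_cgroup_uncharge_page(page);		/* drop the old page's charge */
	/* charge the new page against the same cgroup, passing mem explicitly */
	mem_cgroup_charge_common(newpage, NULL, GFP_ATOMIC, type, mem);
	css_put(&mem->css);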

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


 mm/memcontrol.c |   91 +++++++++++++++++++++++++++++++-------------------------
 1 files changed, 51 insertions(+), 40 deletions(-)

Index: linux-2.6.25-rc4/mm/memcontrol.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/memcontrol.c
+++ linux-2.6.25-rc4/mm/memcontrol.c
@@ -480,7 +480,8 @@ unsigned long mem_cgroup_isolate_pages(u
  * < 0 if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype)
+				gfp_t gfp_mask, enum charge_type ctype,
+				struct mem_cgroup *memcgrp)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
@@ -511,16 +512,21 @@ static int mem_cgroup_charge_common(stru
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	if (!mm)
+	if (!mm && !memcgrp) {
 		mm = &init_mm;
-
-	rcu_read_lock();
-	mem = rcu_dereference(mm->mem_cgroup);
-	/*
-	 * For every charge from the cgroup, increment reference count
-	 */
-	css_get(&mem->css);
-	rcu_read_unlock();
+	}
+	if (mm) {
+		rcu_read_lock();
+		mem = rcu_dereference(mm->mem_cgroup);
+		/*
+		 * For every charge from the cgroup, increment reference count
+		 */
+		css_get(&mem->css);
+		rcu_read_unlock();
+	} else {
+		mem = memcgrp;
+		css_get(&mem->css);
+	}
 
 	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
 		if (!(gfp_mask & __GFP_WAIT))
@@ -581,7 +587,7 @@ nomem:
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_MAPPED);
+				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
 }
 
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
@@ -590,7 +596,7 @@ int mem_cgroup_cache_charge(struct page 
 	if (!mm)
 		mm = &init_mm;
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_CACHE);
+				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
 }
 
 /*
@@ -637,13 +643,20 @@ void mem_cgroup_uncharge_page(struct pag
 int mem_cgroup_prepare_migration(struct page *page)
 {
 	struct page_cgroup *pc;
-
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (pc)
-		pc->ref_cnt++;
-	unlock_page_cgroup(page);
-	return pc != NULL;
+	int ret = 0;
+	unsigned long flags;
+	/* returns NULL if not exist */
+	pc = get_page_cgroup(page, GFP_ATOMIC, false);
+	if (pc == NULL)
+		return ret;
+	
+	spin_lock_irqsave(&pc->lock, flags);
+	if (pc->refcnt) {
+		pc->refcnt++;
+		ret = 1;
+	}
+	spin_unlock_irqrestore(&pc->lock, flags);
+	return ret;
 }
 
 void mem_cgroup_end_migration(struct page *page)
@@ -655,38 +668,36 @@ void mem_cgroup_end_migration(struct pag
  * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
  * And no race with uncharge() routines because page_cgroup for *page*
  * has extra one reference by mem_cgroup_prepare_migration.
+ *
+ * This drops charge on old page and add new charge to new page.
+ * mem_cgroup is copied.
  */
 void mem_cgroup_page_migration(struct page *page, struct page *newpage)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup *mem = NULL;
 	unsigned long flags;
+	enum charge_type type;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (!pc) {
-		unlock_page_cgroup(page);
+	pc = get_page_cgroup(page, GFP_ATOMIC, false);
+	if (!pc)
 		return;
+	spin_lock_irqsave(&pc->lock, flags);
+	if (pc->refcnt) {
+		VM_BUG_ON(!pc->mem_cgroup);
+		mem = pc->mem_cgroup;
+		type = (pc->flags & PAGE_CGROUP_FLAG_CACHE)?
+			MEM_CGROUP_CHARGE_TYPE_CACHE :
+			MEM_CGROUP_CHARGE_TYPE_MAPPED;
+		css_get(&mem->css);
 	}
+	spin_unlock_irqrestore(&pc->lock, flags);
+	if (!mem)
+		return;
+	mem_cgroup_uncharge_page(page);
 
-	mz = page_cgroup_zoneinfo(pc);
-	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_remove_list(pc);
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
-
-	pc->page = newpage;
-	lock_page_cgroup(newpage);
-	page_assign_page_cgroup(newpage, pc);
-
-	mz = page_cgroup_zoneinfo(pc);
-	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_add_list(pc);
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
-
-	unlock_page_cgroup(newpage);
+	mem_cgroup_charge_common(newpage, NULL, GFP_ATOMIC, type, mem);
+	css_put(&mem->css);
 }
 
 /*


* Re: [Preview] [PATCH] radix tree based page cgroup [0/6]
  2008-03-05 11:51 [Preview] [PATCH] radix tree based page cgroup [0/6] KAMEZAWA Hiroyuki
                   ` (4 preceding siblings ...)
  2008-03-06  0:28 ` [Preview] [PATCH] radix tree based page cgroup [4/6] migration KAMEZAWA Hiroyuki
@ 2008-03-06 10:03 ` Hirokazu Takahashi
  2008-03-06 11:26   ` KAMEZAWA Hiroyuki
  5 siblings, 1 reply; 9+ messages in thread
From: Hirokazu Takahashi @ 2008-03-06 10:03 UTC (permalink / raw)
  To: kamezawa.hiroyu; +Cc: linux-mm, balbir, xemul, hugh, yamamoto

Hi,

> Hi, this is the latest version of radix-tree based page cgroup patch.
> 
> I post this now because recent major changes are included in 2.6.25-rc4.
> (I admit I should do more tests on this set.)
> 
> Almost all of the patches are rewritten and adjusted to rc4's logic.
> I feel this set is simpler than the previous one.
> 
> The patch series is as follows.
> [1/6] page cgroup definition
> [2/6] patch against charge/uncharge 
> [3/6] patch against move_list
> [4/6] patch against migration
> [5/6] radix tree based page_cgroup
> [6/6] boost by per-cpu cache.
> 
>  * The force_empty patch is dropped because it's unnecessary.
>  * The vmalloc patch is dropped; we always use kmalloc in this version.
> 
> TODO:
>   - Add a routine for freeing page_cgroups; it seems necessary sometimes.
>     (I have one, and it will be added to this set in the next post.)

I doubt page_cgroups can be freed effectively, since most of the pages
are in use and each of them has its corresponding page_cgroup when you
need more free memory.

In this case, right after some page_cgroups are freed when the corresponding
pages are released, those pages are reallocated and page_cgroups are
also reallocated and assigned to them. It will only give us meaningless
overhead.

And I think it doesn't make sense to free page_cgroups to gain more
free memory if there is already a lot of free memory.

I guess a page_cgroup freeing routine will be fine when making hugetlb
pages.

>   - Check the logic again.
> 
> Thanks,
> -Kame


Thanks,
Hirokazu Takahashi.


* Re: [Preview] [PATCH] radix tree based page cgroup [0/6]
  2008-03-06 10:03 ` [Preview] [PATCH] radix tree based page cgroup [0/6] Hirokazu Takahashi
@ 2008-03-06 11:26   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 9+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-06 11:26 UTC (permalink / raw)
  To: Hirokazu Takahashi; +Cc: linux-mm, balbir, xemul, hugh, yamamoto

On Thu, 06 Mar 2008 19:03:04 +0900 (JST)
Hirokazu Takahashi <taka@valinux.co.jp> wrote:

> I doubt page_cgroups can be freed effectively, since most of the pages
> are in use and each of them has its corresponding page_cgroup when you
> need more free memory.
> 
> In this case, right after some page_cgroups are freed when the corresponding
> pages are released, those pages are reallocated and page_cgroups are
> also reallocated and assigned to them. It will only give us meaningless
> overhead.
> 
> And I think it doesn't make sense to free page_cgroups to gain more
> free memory if there is already a lot of free memory.
> 
> I guess a page_cgroup freeing routine will be fine when making hugetlb
> pages.
> 
This is the current version. I feel this is a reasonable and flexible approach now.
But of course, I need more tests.

==
This patch frees page_cgroups when a chunk of pages is freed.

Now under test; this works well now.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


 include/linux/page_cgroup.h |   12 +++++++++
 mm/page_alloc.c             |    3 ++
 mm/page_cgroup.c            |   54 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

Index: linux-2.6.25-rc4/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.25-rc4.orig/include/linux/page_cgroup.h
+++ linux-2.6.25-rc4/include/linux/page_cgroup.h
@@ -38,6 +38,12 @@ DECLARE_PER_CPU(struct page_cgroup_cache
 #define PCGRP_SHIFT	(CONFIG_CGROUP_PAGE_CGROUP_ORDER)
 #define PCGRP_SIZE	(1 << PCGRP_SHIFT)
 
+#if PCGRP_SHIFT + 3 >= MAX_ORDER
+#define PCGRP_SHRINK_ORDER	(MAX_ORDER - 1)
+#else
+#define PCGRP_SHRINK_ORDER	(PCGRP_SHIFT + 3)
+#endif
+
 /*
  * Lookup and return page_cgroup struct.
  * returns NULL when
@@ -69,6 +75,8 @@ get_page_cgroup(struct page *page, gfp_t
 	return (ret)? ret : __get_page_cgroup(page, gfpmask, allocate);
 }
 
+void try_to_shrink_page_cgroup(struct page *page, int order);
+
 #else
 
 static struct page_cgroup *
@@ -76,5 +84,9 @@ get_page_cgroup(struct page *page, gfp_t
 {
 	return NULL;
 }
+static void try_to_shrink_page_cgroup(struct page *page, int order)
+{
+	return;
+}
 #endif
 #endif
Index: linux-2.6.25-rc4/mm/page_cgroup.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/page_cgroup.c
+++ linux-2.6.25-rc4/mm/page_cgroup.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/mmzone.h>
 #include <linux/slab.h>
 #include <linux/radix-tree.h>
 #include <linux/memcontrol.h>
@@ -80,6 +81,7 @@ static void save_result(struct page_cgro
 	pcp = &__get_cpu_var(pcpu_page_cgroup_cache);
 	pcp->ents[hash].idx = idx;
 	pcp->ents[hash].base = base;
+	smp_wmb();
 	preempt_enable();
 }
 
@@ -156,6 +158,59 @@ out:
 	return pc;
 }
 
+/* Must be called under zone->lock */
+void try_to_shrink_page_cgroup(struct page *page, int order)
+{
+	unsigned long pfn = page_to_pfn(page);
+	int nid = page_to_nid(page);
+	int idx = pfn >> PCGRP_SHIFT;
+	int hnum = (PAGE_CGROUP_NR_CACHE - 1);
+	struct page_cgroup_cache *pcp;
+	struct page_cgroup_head *head;
+	struct page_cgroup_root *root;
+	unsigned long end_pfn;
+	int cpu;
+
+
+	root = root_dir[nid];
+	if (!root || in_interrupt() || (order < PCGRP_SHIFT))
+		return;
+
+	pfn = page_to_pfn(page);
+	end_pfn = pfn + (1 << order);
+
+	while (pfn != end_pfn) {
+		idx = pfn >> PCGRP_SHIFT;
+		/* Is this pfn has entry ? */
+		rcu_read_lock();
+		head = radix_tree_lookup(&root->root_node, idx);
+		rcu_read_unlock();
+		if (!head) {
+			pfn += (1 << PCGRP_SHIFT);
+			continue;
+		}
+		/* It's guaranteed that no one accesses this pfn/idx
+		   because there are no references to this page. */
+		hnum = (idx) & (PAGE_CGROUP_NR_CACHE - 1);
+		for_each_online_cpu(cpu) {
+			pcp = &per_cpu(pcpu_page_cgroup_cache, cpu);
+			smp_rmb();
+			if (pcp->ents[hnum].idx == idx)
+				pcp->ents[hnum].base = NULL;
+		}
+		if (spin_trylock(&root->tree_lock)) {
+			/* radix tree nodes are freed by RCU, so they will not
+			   call free_pages() right now. */
+			radix_tree_delete(&root->root_node, idx);
+			spin_unlock(&root->tree_lock);
+			/* We can free this in lazy fashion .*/
+			free_page_cgroup(head);
+			printk("free %ld\n",pfn);
+		}
+		pfn += (1 << PCGRP_SHIFT);
+	}
+}
+
 __init int page_cgroup_init(void)
 {
 	int nid;
Index: linux-2.6.25-rc4/mm/page_alloc.c
===================================================================
--- linux-2.6.25-rc4.orig/mm/page_alloc.c
+++ linux-2.6.25-rc4/mm/page_alloc.c
@@ -45,6 +45,7 @@
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
 #include <linux/memcontrol.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -445,6 +446,8 @@ static inline void __free_one_page(struc
 		order++;
 	}
 	set_page_order(page, order);
+	if (order >= PCGRP_SHRINK_ORDER)
+		try_to_shrink_page_cgroup(page, order);
 	list_add(&page->lru,
 		&zone->free_area[order].free_list[migratetype]);
 	zone->free_area[order].nr_free++;



