From mboxrd@z Thu Jan 1 00:00:00 1970 Message-ID: <47EC41E5.7090901@cn.fujitsu.com> Date: Fri, 28 Mar 2008 08:55:01 +0800 From: Li Zefan MIME-Version: 1.0 Subject: Re: [-mm] [PATCH 1/4] memcg : radix-tree page_cgroup References: <20080327174435.e69f5b45.kamezawa.hiroyu@jp.fujitsu.com> <20080327174757.6ea57ea4.kamezawa.hiroyu@jp.fujitsu.com> In-Reply-To: <20080327174757.6ea57ea4.kamezawa.hiroyu@jp.fujitsu.com> Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org Return-Path: To: KAMEZAWA Hiroyuki Cc: "linux-mm@kvack.org" , "balbir@linux.vnet.ibm.com" , a.p.zijlstra@chello.nl List-ID: KAMEZAWA Hiroyuki wrote: > This patch implements radixt-tree based page cgroup. > > This patch does > * add radix-tree based page_cgroup look up subsystem.page_cgroup_init > * remove bit_spin_lock used by page_cgroup. > > changes: > > Before patch > * struct page had pointer to page_cgroup. Then, relationship between objects > was pfn <-> struct page <-> struct page_cgroup > * (spin) lock for page_cgroup was in struct page. > * page_cgroup->refcnt is incremented before charge is done. > * page migration does complicated page_cgroup migration under locks. > > After patch > * struct page has no pointer to page_cgroup. Relationship between objects > is struct page <-> pfn <-> struct page_cgroup -> struct page, > * page_cgroup has its own spin lock. > * page_cgroup->refcnt is incremented after charge is done. > * page migration accounts a new page before migration. By this, we can > avoid complicated locks. > > tested on ia64/NUMA, x86_64/SMP. > > Changelog v1 -> v2: > * create a folded patch. maybe good for bysect. > * removed special handling codes for new pages under migration > Added PG_LRU check to force_empty. > * reflected comments. > * Added comments in the head of page_cgroup.c > * order of page_cgroup is automatically calculated. > * fixed handling of root_node[] entries in page_cgroup_init(). > * rewrite init_page_cgroup_head() to do minimum work. > * fixed N_NORMAL_MEMORY handling. > > Signed-off-by: KAMEZAWA Hiroyuki > At a first glance, some typos and small things, see below ;) > > include/linux/memcontrol.h | 17 -- > include/linux/mm_types.h | 3 > include/linux/page_cgroup.h | 56 +++++++ > mm/Makefile | 2 > mm/memcontrol.c | 332 ++++++++++++++++---------------------------- > mm/migrate.c | 22 +- > mm/page_alloc.c | 8 - > mm/page_cgroup.c | 260 ++++++++++++++++++++++++++++++++++ > 8 files changed, 463 insertions(+), 237 deletions(-) > > Index: linux-2.6.25-rc5-mm1-k/include/linux/page_cgroup.h > =================================================================== > --- /dev/null > +++ linux-2.6.25-rc5-mm1-k/include/linux/page_cgroup.h > @@ -0,0 +1,56 @@ > +#ifndef __LINUX_PAGE_CGROUP_H > +#define __LINUX_PAGE_CGROUP_H > + > +#ifdef CONFIG_CGROUP_MEM_RES_CTLR > +/* > + * page_cgroup is yet another mem_map structure for accounting usage. Not fix yes: extra space ^^ > + * but, unlike mem_map, allocated on demand for accounted pages. > + * see also memcontrol.h > + * In nature, this consumes much amount of memory. > + */ > + > +struct mem_cgroup; > + > +struct page_cgroup { > + spinlock_t lock; /* lock for all members */ > + int refcnt; /* reference count */ > + struct mem_cgroup *mem_cgroup; /* current cgroup subsys */ > + struct list_head lru; /* for per cgroup LRU */ > + int flags; /* See below */ > + struct page *page; /* the page this accounts for*/ > +}; > + > +/* flags */ > +#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache. 
*/ > +#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* is on active list */ > + > +/* > + * look up page_cgroup. returns NULL if not exists. > + */ > +extern struct page_cgroup *get_page_cgroup(struct page *page); > + > + > +/* > + * look up page_cgroup, allocate new one if it doesn't exist. > + * Return value is > + * 1. page_cgroup, at success. > + * 2. -EXXXXX, at failure. > + * 3. NULL, at boot. > + */ > +extern struct page_cgroup * > +get_alloc_page_cgroup(struct page *page, gfp_t gfpmask); > + > +#else > + > +static inline struct page_cgroup *get_page_cgroup(struct page *page) > +{ > + return NULL; > +} > + > +static inline struct page_cgroup * > +get_alloc_page_cgroup(struct page *page, gfp_t gfpmask) > +{ > + return NULL; > +} > +#endif > +#endif > Index: linux-2.6.25-rc5-mm1-k/mm/memcontrol.c > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/mm/memcontrol.c > +++ linux-2.6.25-rc5-mm1-k/mm/memcontrol.c > @@ -30,6 +30,7 @@ > #include > #include > #include > +#include > > #include > > @@ -92,7 +93,7 @@ struct mem_cgroup_per_zone { > /* > * spin_lock to protect the per cgroup LRU > */ > - spinlock_t lru_lock; > + spinlock_t lru_lock; /* irq should be off. */ > struct list_head active_list; > struct list_head inactive_list; > unsigned long count[NR_MEM_CGROUP_ZSTAT]; > @@ -139,33 +140,6 @@ struct mem_cgroup { > }; > static struct mem_cgroup init_mem_cgroup; > > -/* > - * We use the lower bit of the page->page_cgroup pointer as a bit spin > - * lock. We need to ensure that page->page_cgroup is at least two > - * byte aligned (based on comments from Nick Piggin). But since > - * bit_spin_lock doesn't actually set that lock bit in a non-debug > - * uniprocessor kernel, we should avoid setting it here too. > - */ > -#define PAGE_CGROUP_LOCK_BIT 0x0 > -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) > -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) > -#else > -#define PAGE_CGROUP_LOCK 0x0 > -#endif > - > -/* > - * A page_cgroup page is associated with every page descriptor. 
The > - * page_cgroup helps us identify information about the cgroup > - */ > -struct page_cgroup { > - struct list_head lru; /* per cgroup LRU list */ > - struct page *page; > - struct mem_cgroup *mem_cgroup; > - int ref_cnt; /* cached, mapped, migrating */ > - int flags; > -}; > -#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ > -#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ > > static int page_cgroup_nid(struct page_cgroup *pc) > { > @@ -256,37 +230,6 @@ void mm_free_cgroup(struct mm_struct *mm > css_put(&mm->mem_cgroup->css); > } > > -static inline int page_cgroup_locked(struct page *page) > -{ > - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); > -} > - > -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) > -{ > - VM_BUG_ON(!page_cgroup_locked(page)); > - page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); > -} > - > -struct page_cgroup *page_get_page_cgroup(struct page *page) > -{ > - return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); > -} > - > -static void lock_page_cgroup(struct page *page) > -{ > - bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); > -} > - > -static int try_lock_page_cgroup(struct page *page) > -{ > - return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); > -} > - > -static void unlock_page_cgroup(struct page *page) > -{ > - bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); > -} > - > static void __mem_cgroup_remove_list(struct page_cgroup *pc) > { > int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; > @@ -356,6 +299,10 @@ void mem_cgroup_move_lists(struct page * > struct mem_cgroup_per_zone *mz; > unsigned long flags; > > + /* This GFP will be ignored..*/ > + pc = get_page_cgroup(page); > + if (!pc) > + return; > /* > * We cannot lock_page_cgroup while holding zone's lru_lock, > * because other holders of lock_page_cgroup can be interrupted > @@ -363,17 +310,15 @@ void mem_cgroup_move_lists(struct page * > * safely get to page_cgroup without it, so just try_lock it: > * mem_cgroup_isolate_pages allows for page left on wrong list. > */ > - if (!try_lock_page_cgroup(page)) > + if (!spin_trylock_irqsave(&pc->lock, flags)) > return; > - > - pc = page_get_page_cgroup(page); > - if (pc) { > + if (pc->refcnt) { > mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > + spin_lock(&mz->lru_lock); > __mem_cgroup_move_lists(pc, active); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > + spin_unlock(&mz->lru_lock); > } > - unlock_page_cgroup(page); > + spin_unlock_irqrestore(&pc->lock, flags); > } > > /* > @@ -525,7 +470,8 @@ unsigned long mem_cgroup_isolate_pages(u > * < 0 if the cgroup is over its limit > */ > static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, > - gfp_t gfp_mask, enum charge_type ctype) > + gfp_t gfp_mask, enum charge_type ctype, > + struct mem_cgroup *memcg) > { > struct mem_cgroup *mem; > struct page_cgroup *pc; > @@ -536,33 +482,23 @@ static int mem_cgroup_charge_common(stru > if (mem_cgroup_subsys.disabled) > return 0; > > + pc = get_alloc_page_cgroup(page, gfp_mask); > + /* Before kamalloc initialization, get_page_cgroup can retrun NULL */ kmalloc return > + if (unlikely(!pc || IS_ERR(pc))) > + return PTR_ERR(pc); > + > + spin_lock_irqsave(&pc->lock, flags); > /* > - * Should page_cgroup's go to their own slab? 
> - * One could optimize the performance of the charging routine > - * by saving a bit in the page_flags and using it as a lock > - * to see if the cgroup page already has a page_cgroup associated > - * with it > - */ > -retry: > - lock_page_cgroup(page); > - pc = page_get_page_cgroup(page); > - /* > - * The page_cgroup exists and > - * the page has already been accounted. > + * Has the page already been accounted ? > */ > - if (pc) { > - VM_BUG_ON(pc->page != page); > - VM_BUG_ON(pc->ref_cnt <= 0); > - > - pc->ref_cnt++; > - unlock_page_cgroup(page); > - goto done; > + if (pc->refcnt > 0) { > + pc->refcnt++; > + spin_unlock_irqrestore(&pc->lock, flags); > + goto success; > } > - unlock_page_cgroup(page); > + spin_unlock_irqrestore(&pc->lock, flags); > > - pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); > - if (pc == NULL) > - goto err; > + /* Note: *new* pc's refcnt is still 0 here. */ > > /* > * We always charge the cgroup the mm_struct belongs to. > @@ -570,20 +506,24 @@ retry: > * thread group leader migrates. It's possible that mm is not > * set, if so charge the init_mm (happens for pagecache usage). > */ > - if (!mm) > - mm = &init_mm; > - > - rcu_read_lock(); > - mem = rcu_dereference(mm->mem_cgroup); > - /* > - * For every charge from the cgroup, increment reference count > - */ > - css_get(&mem->css); > - rcu_read_unlock(); > + if (memcg) { > + mem = memcg; > + css_get(&mem->css); > + } else { > + if (!mm) > + mm = &init_mm; > + rcu_read_lock(); > + mem = rcu_dereference(mm->mem_cgroup); > + /* > + * For every charge from the cgroup, increment reference count > + */ > + css_get(&mem->css); > + rcu_read_unlock(); > + } > > while (res_counter_charge(&mem->res, PAGE_SIZE)) { > if (!(gfp_mask & __GFP_WAIT)) > - goto out; > + goto nomem; > > if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) > continue; > @@ -600,52 +540,51 @@ retry: > > if (!nr_retries--) { > mem_cgroup_out_of_memory(mem, gfp_mask); > - goto out; > + goto nomem; > } > congestion_wait(WRITE, HZ/10); > } > - > - pc->ref_cnt = 1; > - pc->mem_cgroup = mem; > - pc->page = page; > - pc->flags = PAGE_CGROUP_FLAG_ACTIVE; > - if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) > - pc->flags |= PAGE_CGROUP_FLAG_CACHE; > - > - lock_page_cgroup(page); > - if (page_get_page_cgroup(page)) { > - unlock_page_cgroup(page); > - /* > - * Another charge has been added to this page already. > - * We take lock_page_cgroup(page) again and read > - * page->cgroup, increment refcnt.... just retry is OK. > - */ > + /* > + * We have to acquire 2 spinlocks. > + */ > + spin_lock_irqsave(&pc->lock, flags); > + /* Is anyone charged ? */ > + if (unlikely(pc->refcnt)) { > + /* Someone charged this page while we released the lock */ > + pc->refcnt++; > + spin_unlock_irqrestore(&pc->lock, flags); > res_counter_uncharge(&mem->res, PAGE_SIZE); > css_put(&mem->css); > - kfree(pc); > - goto retry; > + goto success; > } > - page_assign_page_cgroup(page, pc); > + /* Anyone doesn't touch this. 
*/ > + VM_BUG_ON(pc->mem_cgroup); > + > + if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) > + pc->flags = PAGE_CGROUP_FLAG_ACTIVE | PAGE_CGROUP_FLAG_CACHE; > + else > + pc->flags = PAGE_CGROUP_FLAG_ACTIVE; > + pc->refcnt = 1; > + pc->mem_cgroup = mem; > > mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > + > + spin_lock(&mz->lru_lock); > __mem_cgroup_add_list(pc); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > + spin_unlock(&mz->lru_lock); > + spin_unlock_irqrestore(&pc->lock, flags); > > - unlock_page_cgroup(page); > -done: > +success: > return 0; > -out: > +nomem: > css_put(&mem->css); > - kfree(pc); > -err: > return -ENOMEM; > } > > int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) > { > return mem_cgroup_charge_common(page, mm, gfp_mask, > - MEM_CGROUP_CHARGE_TYPE_MAPPED); > + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); > } > > int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, > @@ -654,7 +593,7 @@ int mem_cgroup_cache_charge(struct page > if (!mm) > mm = &init_mm; > return mem_cgroup_charge_common(page, mm, gfp_mask, > - MEM_CGROUP_CHARGE_TYPE_CACHE); > + MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); > } > > /* > @@ -664,105 +603,87 @@ int mem_cgroup_cache_charge(struct page > void mem_cgroup_uncharge_page(struct page *page) > { > struct page_cgroup *pc; > - struct mem_cgroup *mem; > - struct mem_cgroup_per_zone *mz; > - unsigned long flags; > > if (mem_cgroup_subsys.disabled) > return; > - > /* > * Check if our page_cgroup is valid > */ > - lock_page_cgroup(page); > - pc = page_get_page_cgroup(page); > - if (!pc) > - goto unlock; > - > - VM_BUG_ON(pc->page != page); > - VM_BUG_ON(pc->ref_cnt <= 0); > + pc = get_page_cgroup(page); > + if (likely(pc)) { > + unsigned long flags; > + struct mem_cgroup *mem; > + struct mem_cgroup_per_zone *mz; > > - if (--(pc->ref_cnt) == 0) { > + spin_lock_irqsave(&pc->lock, flags); > + if (!pc->refcnt || --pc->refcnt > 0) { > + spin_unlock_irqrestore(&pc->lock, flags); > + return; > + } > + mem = pc->mem_cgroup; > mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > + spin_lock(&mz->lru_lock); > __mem_cgroup_remove_list(pc); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > - > - page_assign_page_cgroup(page, NULL); > - unlock_page_cgroup(page); > + spin_unlock(&mz->lru_lock); > + pc->flags = 0; > + pc->mem_cgroup = 0; > + spin_unlock_irqrestore(&pc->lock, flags); > > - mem = pc->mem_cgroup; > res_counter_uncharge(&mem->res, PAGE_SIZE); > css_put(&mem->css); > - > - kfree(pc); > - return; > } > - > -unlock: > - unlock_page_cgroup(page); > } > > /* > - * Returns non-zero if a page (under migration) has valid page_cgroup member. > - * Refcnt of page_cgroup is incremented. > + * Pre-charge against newpage while moving a page. > + * This function is called before taking page locks. > */ > -int mem_cgroup_prepare_migration(struct page *page) > +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) > { > struct page_cgroup *pc; > + struct mem_cgroup *mem = NULL; > + int ret = 0; > + enum charge_type type = 0; don't initialize an enum variable with integer value. 
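Since 'type' is only read in the 'if (mem)' branch below, and 'mem' is only set on the path that also assigns 'type', the initializer looks like it can simply be dropped. If gcc then warns about a possibly-uninitialised use, initialising with one of the enum's own constants keeps it quiet without mixing an integer into an enum, e.g. (illustration only, the value is overwritten before it is ever used):

	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_MAPPED;	/* overwritten before use */
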
> + unsigned long flags; > > if (mem_cgroup_subsys.disabled) > - return 0; > + return ret; > > - lock_page_cgroup(page); > - pc = page_get_page_cgroup(page); > - if (pc) > - pc->ref_cnt++; > - unlock_page_cgroup(page); > - return pc != NULL; > -} > + /* check newpage isn't under memory resource control */ > + pc = get_page_cgroup(newpage); > + VM_BUG_ON(pc && pc->refcnt); > > -void mem_cgroup_end_migration(struct page *page) > -{ > - mem_cgroup_uncharge_page(page); > -} > + pc = get_page_cgroup(page); > > + if (pc) { > + spin_lock_irqsave(&pc->lock, flags); > + if (pc->refcnt) { > + mem = pc->mem_cgroup; > + css_get(&mem->css); > + if (pc->flags & PAGE_CGROUP_FLAG_CACHE) > + type = MEM_CGROUP_CHARGE_TYPE_CACHE; > + else > + type = MEM_CGROUP_CHARGE_TYPE_MAPPED; > + } > + spin_unlock_irqrestore(&pc->lock, flags); > + if (mem) { > + ret = mem_cgroup_charge_common(newpage, NULL, > + GFP_KERNEL, type, mem); > + css_put(&mem->css); > + } > + } > + return ret; > +} > /* > - * We know both *page* and *newpage* are now not-on-LRU and PG_locked. > - * And no race with uncharge() routines because page_cgroup for *page* > - * has extra one reference by mem_cgroup_prepare_migration. > + * At the end of migration, we'll push newpage to LRU and > + * drop one refcnt which added at prepare_migration. > */ > -void mem_cgroup_page_migration(struct page *page, struct page *newpage) > +void mem_cgroup_end_migration(struct page *newpage) > { > - struct page_cgroup *pc; > - struct mem_cgroup_per_zone *mz; > - unsigned long flags; > - > - lock_page_cgroup(page); > - pc = page_get_page_cgroup(page); > - if (!pc) { > - unlock_page_cgroup(page); > + if (mem_cgroup_subsys.disabled) > return; > - } > > - mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > - __mem_cgroup_remove_list(pc); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > - > - page_assign_page_cgroup(page, NULL); > - unlock_page_cgroup(page); > - > - pc->page = newpage; > - lock_page_cgroup(newpage); > - page_assign_page_cgroup(newpage, pc); > - > - mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > - __mem_cgroup_add_list(pc); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > - > - unlock_page_cgroup(newpage); > + mem_cgroup_uncharge_page(newpage); > } > > /* > @@ -790,10 +711,13 @@ static void mem_cgroup_force_empty_list( > while (!list_empty(list)) { > pc = list_entry(list->prev, struct page_cgroup, lru); > page = pc->page; > - get_page(page); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > - mem_cgroup_uncharge_page(page); > - put_page(page); > + if (PageLRU(page)) { > + get_page(page); > + spin_unlock_irqrestore(&mz->lru_lock, flags); > + mem_cgroup_uncharge_page(page); > + put_page(page); > + } else > + count = 0; > if (--count <= 0) { > count = FORCE_UNCHARGE_BATCH; > cond_resched(); > Index: linux-2.6.25-rc5-mm1-k/include/linux/memcontrol.h > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/include/linux/memcontrol.h > +++ linux-2.6.25-rc5-mm1-k/include/linux/memcontrol.h > @@ -19,6 +19,7 @@ > > #ifndef _LINUX_MEMCONTROL_H > #define _LINUX_MEMCONTROL_H > +#include > > struct mem_cgroup; > struct page_cgroup; > @@ -30,9 +31,6 @@ struct mm_struct; > extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p); > extern void mm_free_cgroup(struct mm_struct *mm); > > -#define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0) > - > -extern struct page_cgroup *page_get_page_cgroup(struct page *page); > extern int 
mem_cgroup_charge(struct page *page, struct mm_struct *mm, > gfp_t gfp_mask); > extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, > @@ -51,9 +49,8 @@ int task_in_mem_cgroup(struct task_struc > #define mm_match_cgroup(mm, cgroup) \ > ((cgroup) == rcu_dereference((mm)->mem_cgroup)) > > -extern int mem_cgroup_prepare_migration(struct page *page); > -extern void mem_cgroup_end_migration(struct page *page); > -extern void mem_cgroup_page_migration(struct page *page, struct page *newpage); > +extern int mem_cgroup_prepare_migration(struct page *, struct page *); > +extern void mem_cgroup_end_migration(struct page *); > > /* > * For memory reclaim. > @@ -82,14 +79,6 @@ static inline void mm_free_cgroup(struct > { > } > > -static inline void page_reset_bad_cgroup(struct page *page) > -{ > -} > - > -static inline struct page_cgroup *page_get_page_cgroup(struct page *page) > -{ > - return NULL; > -} > > static inline int mem_cgroup_charge(struct page *page, > struct mm_struct *mm, gfp_t gfp_mask) > @@ -122,7 +111,8 @@ static inline int task_in_mem_cgroup(str > return 1; > } > > -static inline int mem_cgroup_prepare_migration(struct page *page) > +static inline int > +mem_cgroup_prepare_migration(struct page *page , struct page *newpage) > { > return 0; > } > Index: linux-2.6.25-rc5-mm1-k/mm/page_alloc.c > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/mm/page_alloc.c > +++ linux-2.6.25-rc5-mm1-k/mm/page_alloc.c > @@ -222,17 +222,11 @@ static inline int bad_range(struct zone > > static void bad_page(struct page *page) > { > - void *pc = page_get_page_cgroup(page); > - > printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG > "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", > current->comm, page, (int)(2*sizeof(unsigned long)), > (unsigned long)page->flags, page->mapping, > page_mapcount(page), page_count(page)); > - if (pc) { > - printk(KERN_EMERG "cgroup:%p\n", pc); > - page_reset_bad_cgroup(page); > - } > printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" > KERN_EMERG "Backtrace:\n"); > dump_stack(); > @@ -478,7 +472,6 @@ static inline int free_pages_check(struc > { > if (unlikely(page_mapcount(page) | > (page->mapping != NULL) | > - (page_get_page_cgroup(page) != NULL) | > (page_count(page) != 0) | > (page->flags & ( > 1 << PG_lru | > @@ -628,7 +621,6 @@ static int prep_new_page(struct page *pa > { > if (unlikely(page_mapcount(page) | > (page->mapping != NULL) | > - (page_get_page_cgroup(page) != NULL) | > (page_count(page) != 0) | > (page->flags & ( > 1 << PG_lru | > Index: linux-2.6.25-rc5-mm1-k/include/linux/mm_types.h > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/include/linux/mm_types.h > +++ linux-2.6.25-rc5-mm1-k/include/linux/mm_types.h > @@ -88,9 +88,6 @@ struct page { > void *virtual; /* Kernel virtual address (NULL if > not kmapped, ie. highmem) */ > #endif /* WANT_PAGE_VIRTUAL */ > -#ifdef CONFIG_CGROUP_MEM_RES_CTLR > - unsigned long page_cgroup; > -#endif > #ifdef CONFIG_PAGE_OWNER > int order; > unsigned int gfp_mask; > Index: linux-2.6.25-rc5-mm1-k/mm/migrate.c > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/mm/migrate.c > +++ linux-2.6.25-rc5-mm1-k/mm/migrate.c > @@ -358,6 +358,12 @@ static int migrate_page_move_mapping(str > > write_unlock_irq(&mapping->tree_lock); > > + /* by mem_cgroup_prepare_migration, newpage is already > + assigned to valid cgroup. 
and current->mm and GFP_ATOMIC > + will not be used...*/ > + mem_cgroup_uncharge_page(page); > + mem_cgroup_cache_charge(newpage, current->mm, GFP_ATOMIC); > + > return 0; > } > > @@ -603,7 +609,6 @@ static int move_to_new_page(struct page > rc = fallback_migrate_page(mapping, newpage, page); > > if (!rc) { > - mem_cgroup_page_migration(page, newpage); > remove_migration_ptes(page, newpage); > } else > newpage->mapping = NULL; > @@ -633,6 +638,12 @@ static int unmap_and_move(new_page_t get > /* page was freed from under us. So we are done. */ > goto move_newpage; > > + charge = mem_cgroup_prepare_migration(page, newpage); > + if (charge == -ENOMEM) { > + rc = -ENOMEM; > + goto move_newpage; > + } > + > rc = -EAGAIN; > if (TestSetPageLocked(page)) { > if (!force) > @@ -684,19 +695,14 @@ static int unmap_and_move(new_page_t get > goto rcu_unlock; > } > > - charge = mem_cgroup_prepare_migration(page); > /* Establish migration ptes or remove ptes */ > try_to_unmap(page, 1); > > if (!page_mapped(page)) > rc = move_to_new_page(newpage, page); > > - if (rc) { > + if (rc) > remove_migration_ptes(page, page); > - if (charge) > - mem_cgroup_end_migration(page); > - } else if (charge) > - mem_cgroup_end_migration(newpage); > rcu_unlock: > if (rcu_locked) > rcu_read_unlock(); > @@ -717,6 +723,8 @@ unlock: > } > > move_newpage: > + if (!charge) > + mem_cgroup_end_migration(newpage); > /* > * Move the new page to the LRU. If migration was not successful > * then this will free the page. > Index: linux-2.6.25-rc5-mm1-k/mm/page_cgroup.c > =================================================================== > --- /dev/null > +++ linux-2.6.25-rc5-mm1-k/mm/page_cgroup.c > @@ -0,0 +1,260 @@ > +/* > + * per-page accounting subsystem infrastructure. - linux/mm/page_cgroup.c > + * > + * (C) 2008 FUJITSU, KAMEZAWA Hiroyuki > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * page_cgroup is yet another mem_map under memory resoruce controller. > + * It containes information which cannot be stored in usual mem_map. > + * This allows us to keep 'struct page' small when a user doesn't activate > + * memory resource controller. > + * > + * We can translate : struct page <-> pfn -> page_cgroup -> struct page. > + * > + */ > + > +#include > +#include > +#include > +#include > +#include > + > +static int page_cgroup_order __read_mostly; > +static int page_cgroup_head_size __read_mostly; > + > +#define PCGRP_SHIFT (page_cgroup_order) > +#define PCGRP_SIZE (1 << PCGRP_SHIFT) > +#define PCGRP_MASK (PCGRP_SIZE - 1) > + > +struct page_cgroup_head { > + struct page_cgroup pc[0]; > +}; > + > +struct page_cgroup_root { > + spinlock_t tree_lock; > + struct radix_tree_root root_node; > +}; > + > +/* > + * Calculate page_cgroup order to be not larger than order-2 page allocation. 
> + */ > +static void calc_page_cgroup_order(void) > +{ > + int order = pageblock_order; > + unsigned long size = sizeof(struct page_cgroup) << order; > + > + while (size > PAGE_SIZE * 2) { > + order -= 1; > + size = sizeof(struct page_cgroup) << order; > + } > + > + page_cgroup_order = order; > + page_cgroup_head_size = sizeof(struct page_cgroup_head) + > + (sizeof(struct page_cgroup) << order); > +} > + > +static struct page_cgroup_root __initdata *tmp_root_dir[MAX_NUMNODES]; > +static struct page_cgroup_root *root_node[MAX_NUMNODES] __read_mostly; > + > +static void > +init_page_cgroup_head(struct page_cgroup_head *head, unsigned long pfn) > +{ > + struct page *page; > + struct page_cgroup *pc; > + int i; > + > + for (i = 0, page = pfn_to_page(pfn), pc = &head->pc[0]; > + i < PCGRP_SIZE; i++, page++, pc++) { indentation: i < PCGRP_SIZE; i++, page++, pc++) { > + pc->refcnt = 0; > + pc->page = page; > + spin_lock_init(&pc->lock); > + } > +} > + > + > +struct kmem_cache *page_cgroup_cachep; > + > +static struct page_cgroup_head * > +alloc_page_cgroup_head(unsigned long pfn, int nid, gfp_t mask) > +{ > + struct page_cgroup_head *head; > + > + if (!node_state(nid, N_NORMAL_MEMORY)) > + nid = -1; > + head = kmem_cache_alloc_node(page_cgroup_cachep, mask, nid); > + if (head) > + init_page_cgroup_head(head, pfn); > + > + return head; > +} > + > +void free_page_cgroup(struct page_cgroup_head *head) > +{ > + kmem_cache_free(page_cgroup_cachep, head); > +} > + > +static struct page_cgroup_root *pcgroup_get_root(struct page *page) > +{ > + int nid; > + > + VM_BUG_ON(!page); > + > + nid = page_to_nid(page); > + > + return root_node[nid]; > +} > + > +/** > + * get_page_cgroup - look up a page_cgroup for a page > + * @page: the page whose page_cgroup is looked up. > + * > + * This just does lookup. > + */ > +struct page_cgroup *get_page_cgroup(struct page *page) > +{ > + struct page_cgroup_head *head; > + struct page_cgroup_root *root; > + struct page_cgroup *ret = NULL; > + unsigned long pfn, idx; > + > + /* > + * NULL can be returned before initialization > + */ > + root = pcgroup_get_root(page); > + if (unlikely(!root)) > + return ret; > + > + pfn = page_to_pfn(page); > + idx = pfn >> PCGRP_SHIFT; > + /* > + * We don't need lock here because no one deletes this head. > + * (Freeing routtine will be added later.) > + */ > + rcu_read_lock(); > + head = radix_tree_lookup(&root->root_node, idx); > + rcu_read_unlock(); > + > + if (likely(head)) > + ret = &head->pc[pfn & PCGRP_MASK]; > + > + return ret; > +} > + > +/** > + * get_alloc_page_cgroup - look up or allocate a page_cgroup for a page > + * @page: the page whose page_cgroup is looked up. > + * @gfpmask: the gfpmask which will be used for page allocatiopn. > + * > + * look up and allocate if not found. > + */ > + > +struct page_cgroup * > +get_alloc_page_cgroup(struct page *page, gfp_t gfpmask) > +{ > + struct page_cgroup_root *root; > + struct page_cgroup_head *head; > + struct page_cgroup *pc; > + unsigned long pfn, idx; > + int nid; > + unsigned long base_pfn, flags; > + int error = 0; > + > + might_sleep_if(gfpmask & __GFP_WAIT); > + > +retry: > + pc = get_page_cgroup(page); > + if (pc) > + return pc; > + /* > + * NULL can be returned before initialization. 
> + */ > + root = pcgroup_get_root(page); > + if (unlikely(!root)) > + return NULL; > + > + pfn = page_to_pfn(page); > + idx = pfn >> PCGRP_SHIFT; > + nid = page_to_nid(page); > + base_pfn = idx << PCGRP_SHIFT; > + > + gfpmask = gfpmask & ~(__GFP_HIGHMEM | __GFP_MOVABLE); > + > + head = alloc_page_cgroup_head(base_pfn, nid, gfpmask); > + if (!head) > + return ERR_PTR(-ENOMEM); > + > + pc = &head->pc[pfn & PCGRP_MASK]; > + > + error = radix_tree_preload(gfpmask); > + if (error) > + goto out; > + spin_lock_irqsave(&root->tree_lock, flags); > + error = radix_tree_insert(&root->root_node, idx, head); > + spin_unlock_irqrestore(&root->tree_lock, flags); > + radix_tree_preload_end(); > +out: > + if (error) { > + free_page_cgroup(head); > + if (error == -EEXIST) > + goto retry; > + pc = ERR_PTR(error); > + } > + return pc; > +} > + > +__init int page_cgroup_init(void) static int __init > +{ > + int tmp, nid; > + struct page_cgroup_root *root; > + > + calc_page_cgroup_order(); > + > + page_cgroup_cachep = kmem_cache_create("page_cgroup", > + page_cgroup_head_size, 0, > + SLAB_PANIC | SLAB_DESTROY_BY_RCU, NULL); > + > + if (!page_cgroup_cachep) { > + printk(KERN_ERR "page accouning setup failure\n"); > + printk(KERN_ERR "can't initialize slab memory\n"); why not: printk(KERN_ERR "page accouning setup failure, can't initialize slab memory\n"); otherwise the user may think 2 bad things happened > + return -ENOMEM; > + } > + > + for_each_node(nid) { > + tmp = nid; > + if (!node_state(nid, N_NORMAL_MEMORY)) > + tmp = -1; > + root = kmalloc_node(sizeof(struct page_cgroup_root), > + GFP_KERNEL, tmp); > + if (!root) > + goto unroll; > + INIT_RADIX_TREE(&root->root_node, GFP_ATOMIC); > + spin_lock_init(&root->tree_lock); > + tmp_root_dir[nid] = root; > + } > + /* > + * By filling node_root[], this tree turns to be visible. > + * Because we have to finish initialization of the tree before > + * we make it visible, memory barrier is necessary. > + */ > + smp_wmb(); > + for_each_node(nid) > + root_node[nid] = tmp_root_dir[nid]; > + > + printk(KERN_INFO "Page Accouintg is activated\n"); Accounting > + return 0; > +unroll: > + for_each_node(nid) > + kfree(tmp_root_dir[nid]); > + > + return -ENOMEM; > +} > +late_initcall(page_cgroup_init); > Index: linux-2.6.25-rc5-mm1-k/mm/Makefile > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/mm/Makefile > +++ linux-2.6.25-rc5-mm1-k/mm/Makefile > @@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o > obj-$(CONFIG_MIGRATION) += migrate.o > obj-$(CONFIG_SMP) += allocpercpu.o > obj-$(CONFIG_QUICKLIST) += quicklist.o > -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o > +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o > > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org
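By the way, while reading the lookup path I wrote a minimal userspace sketch of the pfn -> page_cgroup index arithmetic (PCGRP_SHIFT/PCGRP_MASK plus the look-up-or-allocate-a-whole-head step). It is an illustration only, not the patch's code: a fixed-size flat array stands in for the per-node radix trees, the order is hard-coded instead of being derived from pageblock_order, there is no locking, and the struct keeps just the fields the index math needs; MAX_HEADS and the simplified page_cgroup below are sketch-only names.

/* pcgrp_sketch.c - illustration only; build with: cc -Wall -o pcgrp_sketch pcgrp_sketch.c */
#include <stdio.h>
#include <stdlib.h>

/*
 * Assumed order for the sketch.  The patch computes this at boot
 * (calc_page_cgroup_order) so that one head stays a small allocation.
 */
#define PCGRP_SHIFT     5
#define PCGRP_SIZE      (1UL << PCGRP_SHIFT)
#define PCGRP_MASK      (PCGRP_SIZE - 1)

/* Stripped-down page_cgroup: just enough to show the indexing. */
struct page_cgroup {
        unsigned long pfn;              /* stands in for pc->page */
        int refcnt;
};

/* One head covers PCGRP_SIZE contiguous pfns, like page_cgroup_head. */
struct page_cgroup_head {
        struct page_cgroup pc[PCGRP_SIZE];
};

/* A flat array plays the role of the per-node radix trees (root_node[]). */
#define MAX_HEADS       1024            /* sketch-only limit on covered pfns */
static struct page_cgroup_head *heads[MAX_HEADS];

/* Pure lookup: NULL if no head has been allocated for this pfn range. */
static struct page_cgroup *get_page_cgroup(unsigned long pfn)
{
        unsigned long idx = pfn >> PCGRP_SHIFT;

        if (idx >= MAX_HEADS || !heads[idx])
                return NULL;
        return &heads[idx]->pc[pfn & PCGRP_MASK];
}

/* Look up, allocating a whole head on demand, mirroring the patch's
 * look-up-or-allocate path. */
static struct page_cgroup *get_alloc_page_cgroup(unsigned long pfn)
{
        unsigned long idx = pfn >> PCGRP_SHIFT;
        unsigned long base = idx << PCGRP_SHIFT;
        unsigned long i;

        if (idx >= MAX_HEADS)
                return NULL;
        if (!heads[idx]) {
                struct page_cgroup_head *head = calloc(1, sizeof(*head));

                if (!head)
                        return NULL;
                for (i = 0; i < PCGRP_SIZE; i++)
                        head->pc[i].pfn = base + i;     /* init once per head */
                heads[idx] = head;
        }
        return &heads[idx]->pc[pfn & PCGRP_MASK];
}

int main(void)
{
        struct page_cgroup *pc;

        pc = get_page_cgroup(1234);
        printf("before alloc: %p\n", (void *)pc);       /* NULL: nothing charged yet */

        pc = get_alloc_page_cgroup(1234);
        if (!pc)
                return 1;
        printf("after alloc:  pfn=%lu refcnt=%d\n", pc->pfn, pc->refcnt);

        pc = get_page_cgroup(1235);                     /* same head, no new allocation */
        printf("neighbour:    pfn=%lu\n", pc->pfn);
        return 0;
}

What this is meant to show: one allocation covers PCGRP_SIZE neighbouring pfns, so the per-page overhead is only paid for ranges that actually get charged, and the lookup is just a shift plus a mask. In the patch the per-node radix tree replaces the flat table above, so no static sizing is needed and the lookup stays lock-free on the read side via RCU.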