From mboxrd@z Thu Jan 1 00:00:00 1970 Message-ID: <47EC41E5.7090901@cn.fujitsu.com> Date: Fri, 28 Mar 2008 08:55:01 +0800 From: Li Zefan MIME-Version: 1.0 Subject: Re: [-mm] [PATCH 1/4] memcg : radix-tree page_cgroup References: <20080327174435.e69f5b45.kamezawa.hiroyu@jp.fujitsu.com> <20080327174757.6ea57ea4.kamezawa.hiroyu@jp.fujitsu.com> In-Reply-To: <20080327174757.6ea57ea4.kamezawa.hiroyu@jp.fujitsu.com> Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org Return-Path: To: KAMEZAWA Hiroyuki Cc: "linux-mm@kvack.org" , "balbir@linux.vnet.ibm.com" , a.p.zijlstra@chello.nl List-ID: KAMEZAWA Hiroyuki wrote: > This patch implements radixt-tree based page cgroup. > > This patch does > * add radix-tree based page_cgroup look up subsystem.page_cgroup_init > * remove bit_spin_lock used by page_cgroup. > > changes: > > Before patch > * struct page had pointer to page_cgroup. Then, relationship between objects > was pfn <-> struct page <-> struct page_cgroup > * (spin) lock for page_cgroup was in struct page. > * page_cgroup->refcnt is incremented before charge is done. > * page migration does complicated page_cgroup migration under locks. > > After patch > * struct page has no pointer to page_cgroup. Relationship between objects > is struct page <-> pfn <-> struct page_cgroup -> struct page, > * page_cgroup has its own spin lock. > * page_cgroup->refcnt is incremented after charge is done. > * page migration accounts a new page before migration. By this, we can > avoid complicated locks. > > tested on ia64/NUMA, x86_64/SMP. > > Changelog v1 -> v2: > * create a folded patch. maybe good for bysect. > * removed special handling codes for new pages under migration > Added PG_LRU check to force_empty. > * reflected comments. > * Added comments in the head of page_cgroup.c > * order of page_cgroup is automatically calculated. > * fixed handling of root_node[] entries in page_cgroup_init(). > * rewrite init_page_cgroup_head() to do minimum work. > * fixed N_NORMAL_MEMORY handling. > > Signed-off-by: KAMEZAWA Hiroyuki > At a first glance, some typos and small things, see below ;) > > include/linux/memcontrol.h | 17 -- > include/linux/mm_types.h | 3 > include/linux/page_cgroup.h | 56 +++++++ > mm/Makefile | 2 > mm/memcontrol.c | 332 ++++++++++++++++---------------------------- > mm/migrate.c | 22 +- > mm/page_alloc.c | 8 - > mm/page_cgroup.c | 260 ++++++++++++++++++++++++++++++++++ > 8 files changed, 463 insertions(+), 237 deletions(-) > > Index: linux-2.6.25-rc5-mm1-k/include/linux/page_cgroup.h > =================================================================== > --- /dev/null > +++ linux-2.6.25-rc5-mm1-k/include/linux/page_cgroup.h > @@ -0,0 +1,56 @@ > +#ifndef __LINUX_PAGE_CGROUP_H > +#define __LINUX_PAGE_CGROUP_H > + > +#ifdef CONFIG_CGROUP_MEM_RES_CTLR > +/* > + * page_cgroup is yet another mem_map structure for accounting usage. Not fix yes: extra space ^^ > + * but, unlike mem_map, allocated on demand for accounted pages. > + * see also memcontrol.h > + * In nature, this consumes much amount of memory. > + */ > + > +struct mem_cgroup; > + > +struct page_cgroup { > + spinlock_t lock; /* lock for all members */ > + int refcnt; /* reference count */ > + struct mem_cgroup *mem_cgroup; /* current cgroup subsys */ > + struct list_head lru; /* for per cgroup LRU */ > + int flags; /* See below */ > + struct page *page; /* the page this accounts for*/ > +}; > + > +/* flags */ > +#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache. 
*/ > +#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* is on active list */ > + > +/* > + * look up page_cgroup. returns NULL if not exists. > + */ > +extern struct page_cgroup *get_page_cgroup(struct page *page); > + > + > +/* > + * look up page_cgroup, allocate new one if it doesn't exist. > + * Return value is > + * 1. page_cgroup, at success. > + * 2. -EXXXXX, at failure. > + * 3. NULL, at boot. > + */ > +extern struct page_cgroup * > +get_alloc_page_cgroup(struct page *page, gfp_t gfpmask); > + > +#else > + > +static inline struct page_cgroup *get_page_cgroup(struct page *page) > +{ > + return NULL; > +} > + > +static inline struct page_cgroup * > +get_alloc_page_cgroup(struct page *page, gfp_t gfpmask) > +{ > + return NULL; > +} > +#endif > +#endif > Index: linux-2.6.25-rc5-mm1-k/mm/memcontrol.c > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/mm/memcontrol.c > +++ linux-2.6.25-rc5-mm1-k/mm/memcontrol.c > @@ -30,6 +30,7 @@ > #include > #include > #include > +#include > > #include > > @@ -92,7 +93,7 @@ struct mem_cgroup_per_zone { > /* > * spin_lock to protect the per cgroup LRU > */ > - spinlock_t lru_lock; > + spinlock_t lru_lock; /* irq should be off. */ > struct list_head active_list; > struct list_head inactive_list; > unsigned long count[NR_MEM_CGROUP_ZSTAT]; > @@ -139,33 +140,6 @@ struct mem_cgroup { > }; > static struct mem_cgroup init_mem_cgroup; > > -/* > - * We use the lower bit of the page->page_cgroup pointer as a bit spin > - * lock. We need to ensure that page->page_cgroup is at least two > - * byte aligned (based on comments from Nick Piggin). But since > - * bit_spin_lock doesn't actually set that lock bit in a non-debug > - * uniprocessor kernel, we should avoid setting it here too. > - */ > -#define PAGE_CGROUP_LOCK_BIT 0x0 > -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) > -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) > -#else > -#define PAGE_CGROUP_LOCK 0x0 > -#endif > - > -/* > - * A page_cgroup page is associated with every page descriptor. 
The > - * page_cgroup helps us identify information about the cgroup > - */ > -struct page_cgroup { > - struct list_head lru; /* per cgroup LRU list */ > - struct page *page; > - struct mem_cgroup *mem_cgroup; > - int ref_cnt; /* cached, mapped, migrating */ > - int flags; > -}; > -#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ > -#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ > > static int page_cgroup_nid(struct page_cgroup *pc) > { > @@ -256,37 +230,6 @@ void mm_free_cgroup(struct mm_struct *mm > css_put(&mm->mem_cgroup->css); > } > > -static inline int page_cgroup_locked(struct page *page) > -{ > - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); > -} > - > -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) > -{ > - VM_BUG_ON(!page_cgroup_locked(page)); > - page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); > -} > - > -struct page_cgroup *page_get_page_cgroup(struct page *page) > -{ > - return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); > -} > - > -static void lock_page_cgroup(struct page *page) > -{ > - bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); > -} > - > -static int try_lock_page_cgroup(struct page *page) > -{ > - return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); > -} > - > -static void unlock_page_cgroup(struct page *page) > -{ > - bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); > -} > - > static void __mem_cgroup_remove_list(struct page_cgroup *pc) > { > int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; > @@ -356,6 +299,10 @@ void mem_cgroup_move_lists(struct page * > struct mem_cgroup_per_zone *mz; > unsigned long flags; > > + /* This GFP will be ignored..*/ > + pc = get_page_cgroup(page); > + if (!pc) > + return; > /* > * We cannot lock_page_cgroup while holding zone's lru_lock, > * because other holders of lock_page_cgroup can be interrupted > @@ -363,17 +310,15 @@ void mem_cgroup_move_lists(struct page * > * safely get to page_cgroup without it, so just try_lock it: > * mem_cgroup_isolate_pages allows for page left on wrong list. > */ > - if (!try_lock_page_cgroup(page)) > + if (!spin_trylock_irqsave(&pc->lock, flags)) > return; > - > - pc = page_get_page_cgroup(page); > - if (pc) { > + if (pc->refcnt) { > mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > + spin_lock(&mz->lru_lock); > __mem_cgroup_move_lists(pc, active); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > + spin_unlock(&mz->lru_lock); > } > - unlock_page_cgroup(page); > + spin_unlock_irqrestore(&pc->lock, flags); > } > > /* > @@ -525,7 +470,8 @@ unsigned long mem_cgroup_isolate_pages(u > * < 0 if the cgroup is over its limit > */ > static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, > - gfp_t gfp_mask, enum charge_type ctype) > + gfp_t gfp_mask, enum charge_type ctype, > + struct mem_cgroup *memcg) > { > struct mem_cgroup *mem; > struct page_cgroup *pc; > @@ -536,33 +482,23 @@ static int mem_cgroup_charge_common(stru > if (mem_cgroup_subsys.disabled) > return 0; > > + pc = get_alloc_page_cgroup(page, gfp_mask); > + /* Before kamalloc initialization, get_page_cgroup can retrun NULL */ kmalloc return > + if (unlikely(!pc || IS_ERR(pc))) > + return PTR_ERR(pc); > + > + spin_lock_irqsave(&pc->lock, flags); > /* > - * Should page_cgroup's go to their own slab? 
> - * One could optimize the performance of the charging routine > - * by saving a bit in the page_flags and using it as a lock > - * to see if the cgroup page already has a page_cgroup associated > - * with it > - */ > -retry: > - lock_page_cgroup(page); > - pc = page_get_page_cgroup(page); > - /* > - * The page_cgroup exists and > - * the page has already been accounted. > + * Has the page already been accounted ? > */ > - if (pc) { > - VM_BUG_ON(pc->page != page); > - VM_BUG_ON(pc->ref_cnt <= 0); > - > - pc->ref_cnt++; > - unlock_page_cgroup(page); > - goto done; > + if (pc->refcnt > 0) { > + pc->refcnt++; > + spin_unlock_irqrestore(&pc->lock, flags); > + goto success; > } > - unlock_page_cgroup(page); > + spin_unlock_irqrestore(&pc->lock, flags); > > - pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); > - if (pc == NULL) > - goto err; > + /* Note: *new* pc's refcnt is still 0 here. */ > > /* > * We always charge the cgroup the mm_struct belongs to. > @@ -570,20 +506,24 @@ retry: > * thread group leader migrates. It's possible that mm is not > * set, if so charge the init_mm (happens for pagecache usage). > */ > - if (!mm) > - mm = &init_mm; > - > - rcu_read_lock(); > - mem = rcu_dereference(mm->mem_cgroup); > - /* > - * For every charge from the cgroup, increment reference count > - */ > - css_get(&mem->css); > - rcu_read_unlock(); > + if (memcg) { > + mem = memcg; > + css_get(&mem->css); > + } else { > + if (!mm) > + mm = &init_mm; > + rcu_read_lock(); > + mem = rcu_dereference(mm->mem_cgroup); > + /* > + * For every charge from the cgroup, increment reference count > + */ > + css_get(&mem->css); > + rcu_read_unlock(); > + } > > while (res_counter_charge(&mem->res, PAGE_SIZE)) { > if (!(gfp_mask & __GFP_WAIT)) > - goto out; > + goto nomem; > > if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) > continue; > @@ -600,52 +540,51 @@ retry: > > if (!nr_retries--) { > mem_cgroup_out_of_memory(mem, gfp_mask); > - goto out; > + goto nomem; > } > congestion_wait(WRITE, HZ/10); > } > - > - pc->ref_cnt = 1; > - pc->mem_cgroup = mem; > - pc->page = page; > - pc->flags = PAGE_CGROUP_FLAG_ACTIVE; > - if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) > - pc->flags |= PAGE_CGROUP_FLAG_CACHE; > - > - lock_page_cgroup(page); > - if (page_get_page_cgroup(page)) { > - unlock_page_cgroup(page); > - /* > - * Another charge has been added to this page already. > - * We take lock_page_cgroup(page) again and read > - * page->cgroup, increment refcnt.... just retry is OK. > - */ > + /* > + * We have to acquire 2 spinlocks. > + */ > + spin_lock_irqsave(&pc->lock, flags); > + /* Is anyone charged ? */ > + if (unlikely(pc->refcnt)) { > + /* Someone charged this page while we released the lock */ > + pc->refcnt++; > + spin_unlock_irqrestore(&pc->lock, flags); > res_counter_uncharge(&mem->res, PAGE_SIZE); > css_put(&mem->css); > - kfree(pc); > - goto retry; > + goto success; > } > - page_assign_page_cgroup(page, pc); > + /* Anyone doesn't touch this. 
*/ > + VM_BUG_ON(pc->mem_cgroup); > + > + if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) > + pc->flags = PAGE_CGROUP_FLAG_ACTIVE | PAGE_CGROUP_FLAG_CACHE; > + else > + pc->flags = PAGE_CGROUP_FLAG_ACTIVE; > + pc->refcnt = 1; > + pc->mem_cgroup = mem; > > mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > + > + spin_lock(&mz->lru_lock); > __mem_cgroup_add_list(pc); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > + spin_unlock(&mz->lru_lock); > + spin_unlock_irqrestore(&pc->lock, flags); > > - unlock_page_cgroup(page); > -done: > +success: > return 0; > -out: > +nomem: > css_put(&mem->css); > - kfree(pc); > -err: > return -ENOMEM; > } > > int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) > { > return mem_cgroup_charge_common(page, mm, gfp_mask, > - MEM_CGROUP_CHARGE_TYPE_MAPPED); > + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); > } > > int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, > @@ -654,7 +593,7 @@ int mem_cgroup_cache_charge(struct page > if (!mm) > mm = &init_mm; > return mem_cgroup_charge_common(page, mm, gfp_mask, > - MEM_CGROUP_CHARGE_TYPE_CACHE); > + MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); > } > > /* > @@ -664,105 +603,87 @@ int mem_cgroup_cache_charge(struct page > void mem_cgroup_uncharge_page(struct page *page) > { > struct page_cgroup *pc; > - struct mem_cgroup *mem; > - struct mem_cgroup_per_zone *mz; > - unsigned long flags; > > if (mem_cgroup_subsys.disabled) > return; > - > /* > * Check if our page_cgroup is valid > */ > - lock_page_cgroup(page); > - pc = page_get_page_cgroup(page); > - if (!pc) > - goto unlock; > - > - VM_BUG_ON(pc->page != page); > - VM_BUG_ON(pc->ref_cnt <= 0); > + pc = get_page_cgroup(page); > + if (likely(pc)) { > + unsigned long flags; > + struct mem_cgroup *mem; > + struct mem_cgroup_per_zone *mz; > > - if (--(pc->ref_cnt) == 0) { > + spin_lock_irqsave(&pc->lock, flags); > + if (!pc->refcnt || --pc->refcnt > 0) { > + spin_unlock_irqrestore(&pc->lock, flags); > + return; > + } > + mem = pc->mem_cgroup; > mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > + spin_lock(&mz->lru_lock); > __mem_cgroup_remove_list(pc); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > - > - page_assign_page_cgroup(page, NULL); > - unlock_page_cgroup(page); > + spin_unlock(&mz->lru_lock); > + pc->flags = 0; > + pc->mem_cgroup = 0; > + spin_unlock_irqrestore(&pc->lock, flags); > > - mem = pc->mem_cgroup; > res_counter_uncharge(&mem->res, PAGE_SIZE); > css_put(&mem->css); > - > - kfree(pc); > - return; > } > - > -unlock: > - unlock_page_cgroup(page); > } > > /* > - * Returns non-zero if a page (under migration) has valid page_cgroup member. > - * Refcnt of page_cgroup is incremented. > + * Pre-charge against newpage while moving a page. > + * This function is called before taking page locks. > */ > -int mem_cgroup_prepare_migration(struct page *page) > +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) > { > struct page_cgroup *pc; > + struct mem_cgroup *mem = NULL; > + int ret = 0; > + enum charge_type type = 0; don't initialize an enum variable with integer value. 
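Since 'type' is only read in the 'if (mem)' branch below, and 'mem' is only set on the path that also assigns 'type', the initializer looks like it can simply be dropped. If gcc then warns about a possibly-uninitialised use, initialising with one of the enum's own constants keeps it quiet without mixing an integer into an enum, e.g. (illustration only, the value is overwritten before it is ever used):

	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_MAPPED;	/* overwritten before use */
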
> + unsigned long flags; > > if (mem_cgroup_subsys.disabled) > - return 0; > + return ret; > > - lock_page_cgroup(page); > - pc = page_get_page_cgroup(page); > - if (pc) > - pc->ref_cnt++; > - unlock_page_cgroup(page); > - return pc != NULL; > -} > + /* check newpage isn't under memory resource control */ > + pc = get_page_cgroup(newpage); > + VM_BUG_ON(pc && pc->refcnt); > > -void mem_cgroup_end_migration(struct page *page) > -{ > - mem_cgroup_uncharge_page(page); > -} > + pc = get_page_cgroup(page); > > + if (pc) { > + spin_lock_irqsave(&pc->lock, flags); > + if (pc->refcnt) { > + mem = pc->mem_cgroup; > + css_get(&mem->css); > + if (pc->flags & PAGE_CGROUP_FLAG_CACHE) > + type = MEM_CGROUP_CHARGE_TYPE_CACHE; > + else > + type = MEM_CGROUP_CHARGE_TYPE_MAPPED; > + } > + spin_unlock_irqrestore(&pc->lock, flags); > + if (mem) { > + ret = mem_cgroup_charge_common(newpage, NULL, > + GFP_KERNEL, type, mem); > + css_put(&mem->css); > + } > + } > + return ret; > +} > /* > - * We know both *page* and *newpage* are now not-on-LRU and PG_locked. > - * And no race with uncharge() routines because page_cgroup for *page* > - * has extra one reference by mem_cgroup_prepare_migration. > + * At the end of migration, we'll push newpage to LRU and > + * drop one refcnt which added at prepare_migration. > */ > -void mem_cgroup_page_migration(struct page *page, struct page *newpage) > +void mem_cgroup_end_migration(struct page *newpage) > { > - struct page_cgroup *pc; > - struct mem_cgroup_per_zone *mz; > - unsigned long flags; > - > - lock_page_cgroup(page); > - pc = page_get_page_cgroup(page); > - if (!pc) { > - unlock_page_cgroup(page); > + if (mem_cgroup_subsys.disabled) > return; > - } > > - mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > - __mem_cgroup_remove_list(pc); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > - > - page_assign_page_cgroup(page, NULL); > - unlock_page_cgroup(page); > - > - pc->page = newpage; > - lock_page_cgroup(newpage); > - page_assign_page_cgroup(newpage, pc); > - > - mz = page_cgroup_zoneinfo(pc); > - spin_lock_irqsave(&mz->lru_lock, flags); > - __mem_cgroup_add_list(pc); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > - > - unlock_page_cgroup(newpage); > + mem_cgroup_uncharge_page(newpage); > } > > /* > @@ -790,10 +711,13 @@ static void mem_cgroup_force_empty_list( > while (!list_empty(list)) { > pc = list_entry(list->prev, struct page_cgroup, lru); > page = pc->page; > - get_page(page); > - spin_unlock_irqrestore(&mz->lru_lock, flags); > - mem_cgroup_uncharge_page(page); > - put_page(page); > + if (PageLRU(page)) { > + get_page(page); > + spin_unlock_irqrestore(&mz->lru_lock, flags); > + mem_cgroup_uncharge_page(page); > + put_page(page); > + } else > + count = 0; > if (--count <= 0) { > count = FORCE_UNCHARGE_BATCH; > cond_resched(); > Index: linux-2.6.25-rc5-mm1-k/include/linux/memcontrol.h > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/include/linux/memcontrol.h > +++ linux-2.6.25-rc5-mm1-k/include/linux/memcontrol.h > @@ -19,6 +19,7 @@ > > #ifndef _LINUX_MEMCONTROL_H > #define _LINUX_MEMCONTROL_H > +#include > > struct mem_cgroup; > struct page_cgroup; > @@ -30,9 +31,6 @@ struct mm_struct; > extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p); > extern void mm_free_cgroup(struct mm_struct *mm); > > -#define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0) > - > -extern struct page_cgroup *page_get_page_cgroup(struct page *page); > extern int 
mem_cgroup_charge(struct page *page, struct mm_struct *mm, > gfp_t gfp_mask); > extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, > @@ -51,9 +49,8 @@ int task_in_mem_cgroup(struct task_struc > #define mm_match_cgroup(mm, cgroup) \ > ((cgroup) == rcu_dereference((mm)->mem_cgroup)) > > -extern int mem_cgroup_prepare_migration(struct page *page); > -extern void mem_cgroup_end_migration(struct page *page); > -extern void mem_cgroup_page_migration(struct page *page, struct page *newpage); > +extern int mem_cgroup_prepare_migration(struct page *, struct page *); > +extern void mem_cgroup_end_migration(struct page *); > > /* > * For memory reclaim. > @@ -82,14 +79,6 @@ static inline void mm_free_cgroup(struct > { > } > > -static inline void page_reset_bad_cgroup(struct page *page) > -{ > -} > - > -static inline struct page_cgroup *page_get_page_cgroup(struct page *page) > -{ > - return NULL; > -} > > static inline int mem_cgroup_charge(struct page *page, > struct mm_struct *mm, gfp_t gfp_mask) > @@ -122,7 +111,8 @@ static inline int task_in_mem_cgroup(str > return 1; > } > > -static inline int mem_cgroup_prepare_migration(struct page *page) > +static inline int > +mem_cgroup_prepare_migration(struct page *page , struct page *newpage) > { > return 0; > } > Index: linux-2.6.25-rc5-mm1-k/mm/page_alloc.c > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/mm/page_alloc.c > +++ linux-2.6.25-rc5-mm1-k/mm/page_alloc.c > @@ -222,17 +222,11 @@ static inline int bad_range(struct zone > > static void bad_page(struct page *page) > { > - void *pc = page_get_page_cgroup(page); > - > printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG > "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", > current->comm, page, (int)(2*sizeof(unsigned long)), > (unsigned long)page->flags, page->mapping, > page_mapcount(page), page_count(page)); > - if (pc) { > - printk(KERN_EMERG "cgroup:%p\n", pc); > - page_reset_bad_cgroup(page); > - } > printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" > KERN_EMERG "Backtrace:\n"); > dump_stack(); > @@ -478,7 +472,6 @@ static inline int free_pages_check(struc > { > if (unlikely(page_mapcount(page) | > (page->mapping != NULL) | > - (page_get_page_cgroup(page) != NULL) | > (page_count(page) != 0) | > (page->flags & ( > 1 << PG_lru | > @@ -628,7 +621,6 @@ static int prep_new_page(struct page *pa > { > if (unlikely(page_mapcount(page) | > (page->mapping != NULL) | > - (page_get_page_cgroup(page) != NULL) | > (page_count(page) != 0) | > (page->flags & ( > 1 << PG_lru | > Index: linux-2.6.25-rc5-mm1-k/include/linux/mm_types.h > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/include/linux/mm_types.h > +++ linux-2.6.25-rc5-mm1-k/include/linux/mm_types.h > @@ -88,9 +88,6 @@ struct page { > void *virtual; /* Kernel virtual address (NULL if > not kmapped, ie. highmem) */ > #endif /* WANT_PAGE_VIRTUAL */ > -#ifdef CONFIG_CGROUP_MEM_RES_CTLR > - unsigned long page_cgroup; > -#endif > #ifdef CONFIG_PAGE_OWNER > int order; > unsigned int gfp_mask; > Index: linux-2.6.25-rc5-mm1-k/mm/migrate.c > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/mm/migrate.c > +++ linux-2.6.25-rc5-mm1-k/mm/migrate.c > @@ -358,6 +358,12 @@ static int migrate_page_move_mapping(str > > write_unlock_irq(&mapping->tree_lock); > > + /* by mem_cgroup_prepare_migration, newpage is already > + assigned to valid cgroup. 
and current->mm and GFP_ATOMIC > + will not be used...*/ > + mem_cgroup_uncharge_page(page); > + mem_cgroup_cache_charge(newpage, current->mm, GFP_ATOMIC); > + > return 0; > } > > @@ -603,7 +609,6 @@ static int move_to_new_page(struct page > rc = fallback_migrate_page(mapping, newpage, page); > > if (!rc) { > - mem_cgroup_page_migration(page, newpage); > remove_migration_ptes(page, newpage); > } else > newpage->mapping = NULL; > @@ -633,6 +638,12 @@ static int unmap_and_move(new_page_t get > /* page was freed from under us. So we are done. */ > goto move_newpage; > > + charge = mem_cgroup_prepare_migration(page, newpage); > + if (charge == -ENOMEM) { > + rc = -ENOMEM; > + goto move_newpage; > + } > + > rc = -EAGAIN; > if (TestSetPageLocked(page)) { > if (!force) > @@ -684,19 +695,14 @@ static int unmap_and_move(new_page_t get > goto rcu_unlock; > } > > - charge = mem_cgroup_prepare_migration(page); > /* Establish migration ptes or remove ptes */ > try_to_unmap(page, 1); > > if (!page_mapped(page)) > rc = move_to_new_page(newpage, page); > > - if (rc) { > + if (rc) > remove_migration_ptes(page, page); > - if (charge) > - mem_cgroup_end_migration(page); > - } else if (charge) > - mem_cgroup_end_migration(newpage); > rcu_unlock: > if (rcu_locked) > rcu_read_unlock(); > @@ -717,6 +723,8 @@ unlock: > } > > move_newpage: > + if (!charge) > + mem_cgroup_end_migration(newpage); > /* > * Move the new page to the LRU. If migration was not successful > * then this will free the page. > Index: linux-2.6.25-rc5-mm1-k/mm/page_cgroup.c > =================================================================== > --- /dev/null > +++ linux-2.6.25-rc5-mm1-k/mm/page_cgroup.c > @@ -0,0 +1,260 @@ > +/* > + * per-page accounting subsystem infrastructure. - linux/mm/page_cgroup.c > + * > + * (C) 2008 FUJITSU, KAMEZAWA Hiroyuki > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * page_cgroup is yet another mem_map under memory resoruce controller. > + * It containes information which cannot be stored in usual mem_map. > + * This allows us to keep 'struct page' small when a user doesn't activate > + * memory resource controller. > + * > + * We can translate : struct page <-> pfn -> page_cgroup -> struct page. > + * > + */ > + > +#include > +#include > +#include > +#include > +#include > + > +static int page_cgroup_order __read_mostly; > +static int page_cgroup_head_size __read_mostly; > + > +#define PCGRP_SHIFT (page_cgroup_order) > +#define PCGRP_SIZE (1 << PCGRP_SHIFT) > +#define PCGRP_MASK (PCGRP_SIZE - 1) > + > +struct page_cgroup_head { > + struct page_cgroup pc[0]; > +}; > + > +struct page_cgroup_root { > + spinlock_t tree_lock; > + struct radix_tree_root root_node; > +}; > + > +/* > + * Calculate page_cgroup order to be not larger than order-2 page allocation. 
> + */ > +static void calc_page_cgroup_order(void) > +{ > + int order = pageblock_order; > + unsigned long size = sizeof(struct page_cgroup) << order; > + > + while (size > PAGE_SIZE * 2) { > + order -= 1; > + size = sizeof(struct page_cgroup) << order; > + } > + > + page_cgroup_order = order; > + page_cgroup_head_size = sizeof(struct page_cgroup_head) + > + (sizeof(struct page_cgroup) << order); > +} > + > +static struct page_cgroup_root __initdata *tmp_root_dir[MAX_NUMNODES]; > +static struct page_cgroup_root *root_node[MAX_NUMNODES] __read_mostly; > + > +static void > +init_page_cgroup_head(struct page_cgroup_head *head, unsigned long pfn) > +{ > + struct page *page; > + struct page_cgroup *pc; > + int i; > + > + for (i = 0, page = pfn_to_page(pfn), pc = &head->pc[0]; > + i < PCGRP_SIZE; i++, page++, pc++) { indentation: i < PCGRP_SIZE; i++, page++, pc++) { > + pc->refcnt = 0; > + pc->page = page; > + spin_lock_init(&pc->lock); > + } > +} > + > + > +struct kmem_cache *page_cgroup_cachep; > + > +static struct page_cgroup_head * > +alloc_page_cgroup_head(unsigned long pfn, int nid, gfp_t mask) > +{ > + struct page_cgroup_head *head; > + > + if (!node_state(nid, N_NORMAL_MEMORY)) > + nid = -1; > + head = kmem_cache_alloc_node(page_cgroup_cachep, mask, nid); > + if (head) > + init_page_cgroup_head(head, pfn); > + > + return head; > +} > + > +void free_page_cgroup(struct page_cgroup_head *head) > +{ > + kmem_cache_free(page_cgroup_cachep, head); > +} > + > +static struct page_cgroup_root *pcgroup_get_root(struct page *page) > +{ > + int nid; > + > + VM_BUG_ON(!page); > + > + nid = page_to_nid(page); > + > + return root_node[nid]; > +} > + > +/** > + * get_page_cgroup - look up a page_cgroup for a page > + * @page: the page whose page_cgroup is looked up. > + * > + * This just does lookup. > + */ > +struct page_cgroup *get_page_cgroup(struct page *page) > +{ > + struct page_cgroup_head *head; > + struct page_cgroup_root *root; > + struct page_cgroup *ret = NULL; > + unsigned long pfn, idx; > + > + /* > + * NULL can be returned before initialization > + */ > + root = pcgroup_get_root(page); > + if (unlikely(!root)) > + return ret; > + > + pfn = page_to_pfn(page); > + idx = pfn >> PCGRP_SHIFT; > + /* > + * We don't need lock here because no one deletes this head. > + * (Freeing routtine will be added later.) > + */ > + rcu_read_lock(); > + head = radix_tree_lookup(&root->root_node, idx); > + rcu_read_unlock(); > + > + if (likely(head)) > + ret = &head->pc[pfn & PCGRP_MASK]; > + > + return ret; > +} > + > +/** > + * get_alloc_page_cgroup - look up or allocate a page_cgroup for a page > + * @page: the page whose page_cgroup is looked up. > + * @gfpmask: the gfpmask which will be used for page allocatiopn. > + * > + * look up and allocate if not found. > + */ > + > +struct page_cgroup * > +get_alloc_page_cgroup(struct page *page, gfp_t gfpmask) > +{ > + struct page_cgroup_root *root; > + struct page_cgroup_head *head; > + struct page_cgroup *pc; > + unsigned long pfn, idx; > + int nid; > + unsigned long base_pfn, flags; > + int error = 0; > + > + might_sleep_if(gfpmask & __GFP_WAIT); > + > +retry: > + pc = get_page_cgroup(page); > + if (pc) > + return pc; > + /* > + * NULL can be returned before initialization. 
> + */ > + root = pcgroup_get_root(page); > + if (unlikely(!root)) > + return NULL; > + > + pfn = page_to_pfn(page); > + idx = pfn >> PCGRP_SHIFT; > + nid = page_to_nid(page); > + base_pfn = idx << PCGRP_SHIFT; > + > + gfpmask = gfpmask & ~(__GFP_HIGHMEM | __GFP_MOVABLE); > + > + head = alloc_page_cgroup_head(base_pfn, nid, gfpmask); > + if (!head) > + return ERR_PTR(-ENOMEM); > + > + pc = &head->pc[pfn & PCGRP_MASK]; > + > + error = radix_tree_preload(gfpmask); > + if (error) > + goto out; > + spin_lock_irqsave(&root->tree_lock, flags); > + error = radix_tree_insert(&root->root_node, idx, head); > + spin_unlock_irqrestore(&root->tree_lock, flags); > + radix_tree_preload_end(); > +out: > + if (error) { > + free_page_cgroup(head); > + if (error == -EEXIST) > + goto retry; > + pc = ERR_PTR(error); > + } > + return pc; > +} > + > +__init int page_cgroup_init(void) static int __init > +{ > + int tmp, nid; > + struct page_cgroup_root *root; > + > + calc_page_cgroup_order(); > + > + page_cgroup_cachep = kmem_cache_create("page_cgroup", > + page_cgroup_head_size, 0, > + SLAB_PANIC | SLAB_DESTROY_BY_RCU, NULL); > + > + if (!page_cgroup_cachep) { > + printk(KERN_ERR "page accouning setup failure\n"); > + printk(KERN_ERR "can't initialize slab memory\n"); why not: printk(KERN_ERR "page accouning setup failure, can't initialize slab memory\n"); otherwise the user may think 2 bad things happened > + return -ENOMEM; > + } > + > + for_each_node(nid) { > + tmp = nid; > + if (!node_state(nid, N_NORMAL_MEMORY)) > + tmp = -1; > + root = kmalloc_node(sizeof(struct page_cgroup_root), > + GFP_KERNEL, tmp); > + if (!root) > + goto unroll; > + INIT_RADIX_TREE(&root->root_node, GFP_ATOMIC); > + spin_lock_init(&root->tree_lock); > + tmp_root_dir[nid] = root; > + } > + /* > + * By filling node_root[], this tree turns to be visible. > + * Because we have to finish initialization of the tree before > + * we make it visible, memory barrier is necessary. > + */ > + smp_wmb(); > + for_each_node(nid) > + root_node[nid] = tmp_root_dir[nid]; > + > + printk(KERN_INFO "Page Accouintg is activated\n"); Accounting > + return 0; > +unroll: > + for_each_node(nid) > + kfree(tmp_root_dir[nid]); > + > + return -ENOMEM; > +} > +late_initcall(page_cgroup_init); > Index: linux-2.6.25-rc5-mm1-k/mm/Makefile > =================================================================== > --- linux-2.6.25-rc5-mm1-k.orig/mm/Makefile > +++ linux-2.6.25-rc5-mm1-k/mm/Makefile > @@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o > obj-$(CONFIG_MIGRATION) += migrate.o > obj-$(CONFIG_SMP) += allocpercpu.o > obj-$(CONFIG_QUICKLIST) += quicklist.o > -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o > +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o > > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org
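By the way, while reading the lookup path I wrote a minimal userspace sketch of the pfn -> page_cgroup index arithmetic (PCGRP_SHIFT/PCGRP_MASK plus the look-up-or-allocate-a-whole-head step). It is an illustration only, not the patch's code: a fixed-size flat array stands in for the per-node radix trees, the order is hard-coded instead of being derived from pageblock_order, there is no locking, and the struct keeps just the fields the index math needs; MAX_HEADS and the simplified page_cgroup below are sketch-only names.

/* pcgrp_sketch.c - illustration only; build with: cc -Wall -o pcgrp_sketch pcgrp_sketch.c */
#include <stdio.h>
#include <stdlib.h>

/*
 * Assumed order for the sketch.  The patch computes this at boot
 * (calc_page_cgroup_order) so that one head stays a small allocation.
 */
#define PCGRP_SHIFT     5
#define PCGRP_SIZE      (1UL << PCGRP_SHIFT)
#define PCGRP_MASK      (PCGRP_SIZE - 1)

/* Stripped-down page_cgroup: just enough to show the indexing. */
struct page_cgroup {
        unsigned long pfn;              /* stands in for pc->page */
        int refcnt;
};

/* One head covers PCGRP_SIZE contiguous pfns, like page_cgroup_head. */
struct page_cgroup_head {
        struct page_cgroup pc[PCGRP_SIZE];
};

/* A flat array plays the role of the per-node radix trees (root_node[]). */
#define MAX_HEADS       1024            /* sketch-only limit on covered pfns */
static struct page_cgroup_head *heads[MAX_HEADS];

/* Pure lookup: NULL if no head has been allocated for this pfn range. */
static struct page_cgroup *get_page_cgroup(unsigned long pfn)
{
        unsigned long idx = pfn >> PCGRP_SHIFT;

        if (idx >= MAX_HEADS || !heads[idx])
                return NULL;
        return &heads[idx]->pc[pfn & PCGRP_MASK];
}

/* Look up, allocating a whole head on demand, mirroring the patch's
 * look-up-or-allocate path. */
static struct page_cgroup *get_alloc_page_cgroup(unsigned long pfn)
{
        unsigned long idx = pfn >> PCGRP_SHIFT;
        unsigned long base = idx << PCGRP_SHIFT;
        unsigned long i;

        if (idx >= MAX_HEADS)
                return NULL;
        if (!heads[idx]) {
                struct page_cgroup_head *head = calloc(1, sizeof(*head));

                if (!head)
                        return NULL;
                for (i = 0; i < PCGRP_SIZE; i++)
                        head->pc[i].pfn = base + i;     /* init once per head */
                heads[idx] = head;
        }
        return &heads[idx]->pc[pfn & PCGRP_MASK];
}

int main(void)
{
        struct page_cgroup *pc;

        pc = get_page_cgroup(1234);
        printf("before alloc: %p\n", (void *)pc);       /* NULL: nothing charged yet */

        pc = get_alloc_page_cgroup(1234);
        if (!pc)
                return 1;
        printf("after alloc:  pfn=%lu refcnt=%d\n", pc->pfn, pc->refcnt);

        pc = get_page_cgroup(1235);                     /* same head, no new allocation */
        printf("neighbour:    pfn=%lu\n", pc->pfn);
        return 0;
}

What this is meant to show: one allocation covers PCGRP_SIZE neighbouring pfns, so the per-page overhead is only paid for ranges that actually get charged, and the lookup is just a shift plus a mask. In the patch the per-node radix tree replaces the flat table above, so no static sizing is needed and the lookup stays lock-free on the read side via RCU.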