From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Fri, 26 Sep 2008 11:28:41 +0900
From: KAMEZAWA Hiroyuki
Subject: [PATCH(fixed) 11/12] free page cgroup from LRU in add
Message-Id: <20080926112841.04be3579.kamezawa.hiroyu@jp.fujitsu.com>
In-Reply-To: <20080926112544.26312246.kamezawa.hiroyu@jp.fujitsu.com>
References: <20080925151124.25898d22.kamezawa.hiroyu@jp.fujitsu.com>
	<20080925153206.281243dc.kamezawa.hiroyu@jp.fujitsu.com>
	<20080926112156.991c5dc4.kamezawa.hiroyu@jp.fujitsu.com>
	<20080926112544.26312246.kamezawa.hiroyu@jp.fujitsu.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
Sender: owner-linux-mm@kvack.org
Return-Path:
To: KAMEZAWA Hiroyuki
Cc: "linux-mm@kvack.org" , LKML
List-ID:

Fixed HUNK with 9/12(fixed)
==
Delay add_to_lru() and do it in a batched manner, like pagevec.
For that, two flags are used: PCG_USED and PCG_LRU.
If PCG_LRU is set, the page is on the LRU, and it is safe to access the
LRU via page_cgroup (under some lock).

To avoid races, this patch uses TestSetPageCgroupUsed() and checks the
PCG_USED and PCG_LRU bits in the add/free vectors. By this,
lock_page_cgroup() in mem_cgroup_charge() is removed.

(I don't want to call lock_page_cgroup() under mz->lru_lock in the
add/free vector core logic, so the TestSetPageCgroupUsed() logic is added.
TestSet is an easy way to avoid unnecessary nesting of locks.)
A small illustrative sketch of this batching scheme follows the changelog.

Changelog: v3 -> v5
 - removed css_get/put per page_cgroup struct.
   Now, the *new* force_empty checks whether any page_cgroup remains on
   the memcg, so we don't need to be afraid of leaks.
Changelog: v2 -> v3
 - added TRANSIT flag and removed the lock from the core logic.
Changelog: v1 -> v2
 - renamed the function from use_page_cgroup() to set_page_cgroup_lru().

Signed-off-by: KAMEZAWA Hiroyuki
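Note for reviewers: below is a minimal, single-threaded userspace sketch of
the batched-add / TestSet idea described above. It is illustrative only and
not part of the patch; the demo_* names and the toy flag helper are made up,
while the real code uses per-cpu memcg_add_vec/memcg_free_vec, atomic bitops
and mz->lru_lock.

/*
 * Illustrative sketch only (not kernel code): single-threaded model of the
 * batched "charge now, add to LRU later" scheme.  demo_* names are made up.
 */
#include <stdio.h>
#include <stdbool.h>

enum { PCG_USED, PCG_LRU };

struct demo_pc {
	unsigned long flags;
	int on_lru_list;	/* stands in for the real LRU list_head */
};

/* toy, non-atomic stand-in for test_and_set_bit() */
static bool demo_test_and_set(struct demo_pc *pc, int bit)
{
	bool old = pc->flags & (1UL << bit);

	pc->flags |= 1UL << bit;
	return old;
}

#define DEMO_VEC_SIZE 4

struct demo_vec {
	int nr;
	struct demo_pc *vec[DEMO_VEC_SIZE];
};

/* batched "add to LRU": only USED && !LRU entries are really linked */
static void demo_flush_adds(struct demo_vec *v)
{
	for (int i = 0; i < v->nr; i++) {
		struct demo_pc *pc = v->vec[i];

		if ((pc->flags & (1UL << PCG_USED)) &&
		    !(pc->flags & (1UL << PCG_LRU))) {
			pc->flags |= 1UL << PCG_LRU;
			pc->on_lru_list = 1;
		}
	}
	v->nr = 0;
}

/* charge path: TestSet closes the "already used" race without a page lock */
static int demo_charge(struct demo_vec *v, struct demo_pc *pc)
{
	if (demo_test_and_set(pc, PCG_USED))
		return 0;		/* already charged by someone else */

	v->vec[v->nr++] = pc;
	if (v->nr >= DEMO_VEC_SIZE)
		demo_flush_adds(v);
	return 1;
}

int main(void)
{
	struct demo_vec v = { 0 };
	struct demo_pc pcs[5] = { 0 };

	for (int i = 0; i < 5; i++)
		demo_charge(&v, &pcs[i]);
	demo_flush_adds(&v);	/* final drain, like drain_page_cgroup_local() */

	for (int i = 0; i < 5; i++)
		printf("pc[%d] on LRU: %d\n", i, pcs[i].on_lru_list);
	return 0;
}

The only point of the sketch is the ordering: PCG_USED is claimed first with
a test-and-set, the page_cgroup is queued, and the LRU link is made later in
a batch that re-checks the USED/LRU bits (mirroring __set_page_cgroup_lru()
and __release_page_cgroup() below).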

 include/linux/page_cgroup.h |   10 +++
 mm/memcontrol.c             |  121 +++++++++++++++++++++++++++++++-------------
 2 files changed, 96 insertions(+), 35 deletions(-)

Index: mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
===================================================================
--- mmotm-2.6.27-rc7+.orig/include/linux/page_cgroup.h
+++ mmotm-2.6.27-rc7+/include/linux/page_cgroup.h
@@ -24,6 +24,7 @@ enum {
 	PCG_LOCK,  /* page cgroup is locked */
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
+	PCG_LRU, /* this is on LRU */
 	/* flags for LRU placement */
 	PCG_ACTIVE, /* page is active in this cgroup */
 	PCG_FILE, /* page is file system backed */
@@ -42,11 +43,20 @@ static inline void SetPageCgroup##uname(
 static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
 	{ clear_bit(PCG_##lname, &pc->flags); }

+#define TESTSETPCGFLAG(uname, lname)\
+static inline int TestSetPageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_and_set_bit(PCG_##lname, &pc->flags); }
+
 /* Cache flag is set only once (at allocation) */
 TESTPCGFLAG(Cache, CACHE)

 TESTPCGFLAG(Used, USED)
 CLEARPCGFLAG(Used, USED)
+TESTSETPCGFLAG(Used, USED)
+
+TESTPCGFLAG(LRU, LRU)
+SETPCGFLAG(LRU, LRU)
+CLEARPCGFLAG(LRU, LRU)

 /* LRU management flags (from global-lru definition) */
 TESTPCGFLAG(File, FILE)

Index: mmotm-2.6.27-rc7+/mm/memcontrol.c
===================================================================
--- mmotm-2.6.27-rc7+.orig/mm/memcontrol.c
+++ mmotm-2.6.27-rc7+/mm/memcontrol.c
@@ -149,9 +149,9 @@ enum charge_type {

 static const unsigned long
 pcg_default_flags[NR_CHARGE_TYPE] = {
-	(1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED) | (1 << PCG_LOCK),
-	(1 << PCG_ACTIVE) | (1 << PCG_LOCK) | (1 << PCG_USED),
-	(1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED)| (1 << PCG_LOCK),
+	(1 << PCG_CACHE) | (1 << PCG_FILE) | (1 << PCG_USED),
+	(1 << PCG_ACTIVE) | (1 << PCG_USED),
+	(1 << PCG_ACTIVE) | (1 << PCG_CACHE) | (1 << PCG_USED),
 };

 /*
@@ -193,7 +193,6 @@ page_cgroup_zoneinfo(struct page_cgroup
 	struct mem_cgroup *mem = pc->mem_cgroup;
 	int nid = page_cgroup_nid(pc);
 	int zid = page_cgroup_zid(pc);
-
 	return mem_cgroup_zoneinfo(mem, nid, zid);
 }

@@ -341,7 +340,7 @@ void mem_cgroup_move_lists(struct page *
 	if (!trylock_page_cgroup(pc))
 		return;

-	if (PageCgroupUsed(pc)) {
+	if (PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
 		mem = pc->mem_cgroup;
 		mz = page_cgroup_zoneinfo(pc);
 		spin_lock_irqsave(&mz->lru_lock, flags);
@@ -500,6 +499,9 @@ int mem_cgroup_move_account(struct page
 	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
 	to_mz = mem_cgroup_zoneinfo(to, nid, zid);

+	if (!PageCgroupLRU(pc))
+		return ret;
+
 	if (res_counter_charge(&to->res, PAGE_SIZE)) {
 		/* Now, we assume no_limit...no failure here. */
 		return ret;
@@ -516,10 +518,8 @@ int mem_cgroup_move_account(struct page

 	if (spin_trylock(&to_mz->lru_lock)) {
 		__mem_cgroup_remove_list(from_mz, pc);
-		css_put(&from->css);
 		res_counter_uncharge(&from->res, PAGE_SIZE);
 		pc->mem_cgroup = to;
-		css_get(&to->css);
 		__mem_cgroup_add_list(to_mz, pc);
 		ret = 0;
 		spin_unlock(&to_mz->lru_lock);
@@ -540,6 +540,7 @@ struct memcg_percpu_vec {
 	struct page_cgroup *vec[MEMCG_PCPVEC_SIZE];
 };
 static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_free_vec);
+static DEFINE_PER_CPU(struct memcg_percpu_vec, memcg_add_vec);

 static void
 __release_page_cgroup(struct memcg_percpu_vec *mpv)
@@ -555,7 +556,6 @@ __release_page_cgroup(struct memcg_percp
 	prev_mz = NULL;
 	for (i = nr - 1; i >= 0; i--) {
 		pc = mpv->vec[i];
-		VM_BUG_ON(PageCgroupUsed(pc));
 		mz = page_cgroup_zoneinfo(pc);
 		if (prev_mz != mz) {
 			if (prev_mz)
@@ -563,9 +563,10 @@ __release_page_cgroup(struct memcg_percp
 			prev_mz = mz;
 			spin_lock(&mz->lru_lock);
 		}
-		__mem_cgroup_remove_list(mz, pc);
-		css_put(&pc->mem_cgroup->css);
-		pc->mem_cgroup = NULL;
+		if (!PageCgroupUsed(pc) && PageCgroupLRU(pc)) {
+			__mem_cgroup_remove_list(mz, pc);
+			ClearPageCgroupLRU(pc);
+		}
 	}
 	if (prev_mz)
 		spin_unlock(&prev_mz->lru_lock);
@@ -574,10 +575,43 @@ __release_page_cgroup(struct memcg_percp
 }

 static void
+__set_page_cgroup_lru(struct memcg_percpu_vec *mpv)
+{
+	unsigned long flags;
+	struct mem_cgroup_per_zone *mz, *prev_mz;
+	struct page_cgroup *pc;
+	int i, nr;
+
+	local_irq_save(flags);
+	nr = mpv->nr;
+	mpv->nr = 0;
+	prev_mz = NULL;
+
+	for (i = nr - 1; i >= 0; i--) {
+		pc = mpv->vec[i];
+		mz = page_cgroup_zoneinfo(pc);
+		if (prev_mz != mz) {
+			if (prev_mz)
+				spin_unlock(&prev_mz->lru_lock);
+			prev_mz = mz;
+			spin_lock(&mz->lru_lock);
+		}
+		if (PageCgroupUsed(pc) && !PageCgroupLRU(pc)) {
+			SetPageCgroupLRU(pc);
+			__mem_cgroup_add_list(mz, pc);
+		}
+	}
+
+	if (prev_mz)
+		spin_unlock(&prev_mz->lru_lock);
+	local_irq_restore(flags);
+
+}
+
+static void
 release_page_cgroup(struct page_cgroup *pc)
 {
 	struct memcg_percpu_vec *mpv;
-
 	mpv = &get_cpu_var(memcg_free_vec);
 	mpv->vec[mpv->nr++] = pc;
 	if (mpv->nr >= mpv->limit)
@@ -585,11 +619,25 @@ release_page_cgroup(struct page_cgroup *
 	put_cpu_var(memcg_free_vec);
 }

+static void
+set_page_cgroup_lru(struct page_cgroup *pc)
+{
+	struct memcg_percpu_vec *mpv;
+
+	mpv = &get_cpu_var(memcg_add_vec);
+	mpv->vec[mpv->nr++] = pc;
+	if (mpv->nr >= mpv->limit)
+		__set_page_cgroup_lru(mpv);
+	put_cpu_var(memcg_add_vec);
+}
+
 static void page_cgroup_start_cache_cpu(int cpu)
 {
 	struct memcg_percpu_vec *mpv;
 	mpv = &per_cpu(memcg_free_vec, cpu);
 	mpv->limit = MEMCG_PCPVEC_SIZE;
+	mpv = &per_cpu(memcg_add_vec, cpu);
+	mpv->limit = MEMCG_PCPVEC_SIZE;
 }

 #ifdef CONFIG_HOTPLUG_CPU
@@ -598,6 +646,8 @@ static void page_cgroup_stop_cache_cpu(i
 	struct memcg_percpu_vec *mpv;
 	mpv = &per_cpu(memcg_free_vec, cpu);
 	mpv->limit = 0;
+	mpv = &per_cpu(memcg_add_vec, cpu);
+	mpv->limit = 0;
 }
 #endif

@@ -611,6 +661,9 @@ static DEFINE_MUTEX(memcg_force_drain_mu
 static void drain_page_cgroup_local(struct work_struct *work)
 {
 	struct memcg_percpu_vec *mpv;
+	mpv = &get_cpu_var(memcg_add_vec);
+	__set_page_cgroup_lru(mpv);
+	put_cpu_var(mpv);
 	mpv = &get_cpu_var(memcg_free_vec);
 	__release_page_cgroup(mpv);
 	put_cpu_var(mpv);
@@ -677,14 +730,9 @@ static int mem_cgroup_charge_common(stru
 			rcu_read_unlock();
 			return 0;
 		}
-		/*
-		 * For every charge from the cgroup, increment reference count
-		 */
-		css_get(&mem->css);
 		rcu_read_unlock();
 	} else {
 		mem = memcg;
-		css_get(&memcg->css);
 	}

 	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
@@ -711,33 +759,36 @@ static int mem_cgroup_charge_common(stru
 	}

 	preempt_disable();
-	lock_page_cgroup(pc);
-	if (unlikely(PageCgroupUsed(pc))) {
-		unlock_page_cgroup(pc);
+	if (TestSetPageCgroupUsed(pc)) {
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		css_put(&mem->css);
 		preempt_enable();
 		goto done;
 	}
-	pc->mem_cgroup = mem;
 	/*
-	 * If a page is accounted as a page cache, insert to inactive list.
-	 * If anon, insert to active list.
-	 */
-	pc->flags = pcg_default_flags[ctype];
-
-	mz = page_cgroup_zoneinfo(pc);
+	 * page cgroup is *unused* now....but....
+	 * We can assume old mem_cgroup's metadata is still available
+	 * because pc is not on stale LRU after force_empty() is called.
+	 */
+	if (likely(!PageCgroupLRU(pc)))
+		pc->flags = pcg_default_flags[ctype];
+	else {
+		mz = page_cgroup_zoneinfo(pc);
+		spin_lock_irqsave(&mz->lru_lock, flags);
+		if (PageCgroupLRU(pc)) {
+			__mem_cgroup_remove_list(mz, pc);
+			ClearPageCgroupLRU(pc);
+		}
+		pc->flags = pcg_default_flags[ctype];
+		spin_unlock_irqrestore(&mz->lru_lock, flags);
+	}

-	spin_lock_irqsave(&mz->lru_lock, flags);
-	__mem_cgroup_add_list(mz, pc);
-	spin_unlock_irqrestore(&mz->lru_lock, flags);
-	unlock_page_cgroup(pc);
+	pc->mem_cgroup = mem;
+	set_page_cgroup_lru(pc);
 	preempt_enable();
 done:
 	return 0;
 out:
-	css_put(&mem->css);
 	return -ENOMEM;
 }

@@ -823,12 +874,12 @@ __mem_cgroup_uncharge_common(struct page
 	preempt_disable();
 	lock_page_cgroup(pc);
 	ClearPageCgroupUsed(pc);
+	mem = pc->mem_cgroup;
 	unlock_page_cgroup(pc);
 	preempt_enable();
+	res_counter_uncharge(&mem->res, PAGE_SIZE);

-	mem = pc->mem_cgroup;
 	release_page_cgroup(pc);
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	return;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: email@kvack.org