From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Thu, 27 Mar 2008 17:50:20 +0900
From: KAMEZAWA Hiroyuki
Subject: [-mm] [PATCH 3/4] memcg: shrink page cgroup
Message-Id: <20080327175020.fe916e43.kamezawa.hiroyu@jp.fujitsu.com>
In-Reply-To: <20080327174435.e69f5b45.kamezawa.hiroyu@jp.fujitsu.com>
References: <20080327174435.e69f5b45.kamezawa.hiroyu@jp.fujitsu.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
Sender: owner-linux-mm@kvack.org
Return-Path: 
To: KAMEZAWA Hiroyuki
Cc: "linux-mm@kvack.org" , "balbir@linux.vnet.ibm.com" ,
	lizf@cn.fujitsu.com, a.p.zijlstra@chello.nl
List-ID: 

This patch frees page_cgroup chunks when a large contiguous block of
pages is freed.

How this works:
 * When the order of a freed block reaches pcgroup_shrink_order, the
   page_cgroup chunks covering that block are freed.  The actual
   freeing is deferred via RCU.
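
To illustrate (a sketch against the definitions in the patch below, not
code to be applied; covered_chunks() is just my shorthand here): buddy
blocks of order >= PCGRP_SHIFT are naturally aligned to their own size,
so a freed block always covers whole, aligned page_cgroup chunks and
the shrinker can step through it chunk by chunk:

	/*
	 * Sketch only: how many whole page_cgroup chunks a freed
	 * block of the given order covers.  Blocks smaller than one
	 * chunk (order < PCGRP_SHIFT) cover none and are ignored by
	 * __shrink_page_cgroup().
	 */
	static inline int covered_chunks(int order)
	{
		if (order < PCGRP_SHIFT)
			return 0;
		return 1 << (order - PCGRP_SHIFT);
	}

For example, if calc_page_cgroup_order() picks page_cgroup_order == 3,
then PCGRP_SIZE == 8 and pcgroup_shrink_order == 4; the smallest block
acted on (order 4: 16 pages, 16-page aligned) covers exactly two
chunks, at indices (pfn >> 3) and (pfn >> 3) + 1.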

Changelog v1 -> v2:
 * shrink_order is now calculated automatically.
 * added comments.
 * added a cpu mask to page_cgroup_head. (this will help at clearing.)
 * moved the routine that flushes the percpu idx cache to after the
   deletion from the radix tree.
 * removed a memory barrier. (I noticed that clearing can be done
   after lock/unlock.)
 * added a sanity check not to access a page_cgroup for a page whose
   refcnt is 0.

Signed-off-by: KAMEZAWA Hiroyuki 

 include/linux/page_cgroup.h |   13 ++++++++
 mm/page_alloc.c             |    2 +
 mm/page_cgroup.c            |   70 ++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 83 insertions(+), 2 deletions(-)

Index: linux-2.6.25-rc5-mm1-k/mm/page_cgroup.c
===================================================================
--- linux-2.6.25-rc5-mm1-k.orig/mm/page_cgroup.c
+++ linux-2.6.25-rc5-mm1-k/mm/page_cgroup.c
@@ -31,12 +31,14 @@
 static int page_cgroup_order __read_mostly;
 static int page_cgroup_head_size __read_mostly;
+int pcgroup_shrink_order __read_mostly;
 
 #define PCGRP_SHIFT	(page_cgroup_order)
 #define PCGRP_SIZE	(1 << PCGRP_SHIFT)
 #define PCGRP_MASK	(PCGRP_SIZE - 1)
 
 struct page_cgroup_head {
+	cpumask_t	mask;
 	struct page_cgroup pc[0];
 };
 
@@ -61,6 +63,10 @@ static void calc_page_cgroup_order(void)
 	page_cgroup_order = order;
 	page_cgroup_head_size = sizeof(struct page_cgroup_head)
 				+ (sizeof(struct page_cgroup) << order);
+	if (order + 1 < MAX_ORDER)
+		pcgroup_shrink_order = order + 1;
+	else
+		pcgroup_shrink_order = MAX_ORDER - 1;
 }
 
 static struct page_cgroup_root __initdata *tmp_root_dir[MAX_NUMNODES];
@@ -72,7 +78,7 @@ init_page_cgroup_head(struct page_cgroup
 	struct page *page;
 	struct page_cgroup *pc;
 	int i;
-
+	cpus_clear(head->mask);
 	for (i = 0, page = pfn_to_page(pfn), pc = &head->pc[0];
 	     i < PCGRP_SIZE; i++, page++, pc++) {
 		pc->refcnt = 0;
@@ -120,6 +126,7 @@ static void cache_result(unsigned long i
 	pcp = &get_cpu_var(pcpu_pcgroup_cache);
 	pcp->ents[hnum].idx = idx;
 	pcp->ents[hnum].base = &head->pc[0] - (idx << PCGRP_SHIFT);
+	cpu_set(smp_processor_id(), head->mask);
 	put_cpu_var(pcpu_pcgroup_cache);
 }
 
@@ -150,6 +157,8 @@ static struct page_cgroup_root *pcgroup_
 
 	VM_BUG_ON(!page);
 
+	VM_BUG_ON(!page_count(page));
+
 	nid = page_to_nid(page);
 
 	return root_node[nid];
@@ -257,6 +266,62 @@ out:
 	}
 	return pc;
 }
+/*
+ * This is called from deep inside free_pages().
+ * Because we are called by the page allocator, we can assume that
+ * 1. zone->lock is held.
+ * 2. no one touches pages in [page...page + (1 << order))
+ * 3. because of 2, the page_cgroups for [page...page + (1 << order))
+ *    are not touched and will not be touched while we hold zone->lock.
+ */
+void __shrink_page_cgroup(struct page *page, int order)
+{
+	struct page_cgroup_root *root;
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long end_pfn;
+	int cpu;
+
+	root = pcgroup_get_root(page);
+	if (in_interrupt() || !root || (order < PCGRP_SHIFT))
+		return;
+
+	pfn = page_to_pfn(page);
+	end_pfn = pfn + (1 << order);
+
+	while (pfn != end_pfn) {
+		if (spin_trylock(&root->tree_lock)) {
+			struct page_cgroup_cache *pcp;
+			struct page_cgroup_head *head = NULL;
+			int idx = pfn >> PCGRP_SHIFT;
+			/*
+			 * Because [pfn, end_pfn) are free pages, we can
+			 * assume no lookup in this range will occur, so
+			 * there is no race here.  For the radix tree we
+			 * have to take the lock.  Radix tree nodes are
+			 * freed by RCU, so deletion will not call
+			 * free_pages() directly under us.
+			 */
+			head = radix_tree_delete(&root->root_node, idx);
+			spin_unlock(&root->tree_lock);
+
+			/*
+			 * It's guaranteed that no one can access this pfn:
+			 * there is no access to these pages now, and there
+			 * will be none to the pages or their page_cgroups
+			 * while we hold zone->lock.
+			 */
+			if (head) {
+				int hnum = hashfunc(idx);
+				for_each_cpu_mask(cpu, head->mask) {
+					pcp = &per_cpu(pcpu_pcgroup_cache, cpu);
+					if (pcp->ents[hnum].idx == idx)
+						pcp->ents[hnum].base = NULL;
+				}
+				/* The SLAB for head is SLAB_DESTROY_BY_RCU. */
+				free_page_cgroup(head);
+			}
+		}
+		pfn += PCGRP_SIZE;
+	}
+}
 
 __init int page_cgroup_init(void)
 {
@@ -296,7 +361,8 @@ __init int page_cgroup_init(void)
 	for_each_node(nid)
 		root_node[nid] = tmp_root_dir[nid];
 
-	printk(KERN_INFO "Page Accouintg is activated\n");
+	printk(KERN_INFO "Page Accounting is activated, its order is %d\n",
+		page_cgroup_order);
 	return 0;
 unroll:
 	for_each_node(nid)
Index: linux-2.6.25-rc5-mm1-k/mm/page_alloc.c
===================================================================
--- linux-2.6.25-rc5-mm1-k.orig/mm/page_alloc.c
+++ linux-2.6.25-rc5-mm1-k/mm/page_alloc.c
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include <linux/page_cgroup.h>
 #include 
 #include 
 
@@ -463,6 +464,7 @@ static inline void __free_one_page(struc
 		order++;
 	}
 	set_page_order(page, order);
+	shrink_page_cgroup(page, order);
 	list_add(&page->lru,
 		&zone->free_area[order].free_list[migratetype]);
 	zone->free_area[order].nr_free++;
Index: linux-2.6.25-rc5-mm1-k/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.25-rc5-mm1-k.orig/include/linux/page_cgroup.h
+++ linux-2.6.25-rc5-mm1-k/include/linux/page_cgroup.h
@@ -40,6 +40,15 @@ extern struct page_cgroup *get_page_cgro
 extern struct page_cgroup *
 get_alloc_page_cgroup(struct page *page, gfp_t gfpmask);
 
+extern int pcgroup_shrink_order;
+extern void __shrink_page_cgroup(struct page *page, int order);
+
+static inline void shrink_page_cgroup(struct page *page, int order)
+{
+	if (unlikely(order >= pcgroup_shrink_order))
+		__shrink_page_cgroup(page, order);
+}
+
 #else
 
 static inline struct page_cgroup *get_page_cgroup(struct page *page)
@@ -52,5 +61,9 @@ get_alloc_page_cgroup(struct page *page,
 {
 	return NULL;
 }
+static inline void
+shrink_page_cgroup(struct page *page, int order)
+{
+}
 #endif
 #endif
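
A note on the lookup side (illustration only: lookup_sketch() below is
a simplified model I use to explain the SLAB_DESTROY_BY_RCU comment
above, not the real lookup path of this series): a lockless reader can
race with the radix_tree_delete() in __shrink_page_cgroup(), and RCU is
what keeps the dereference safe:

	/*
	 * Simplified model of a lockless lookup racing with the
	 * shrinker.  radix_tree_lookup() under rcu_read_lock() is
	 * safe against a concurrent radix_tree_delete(), and
	 * SLAB_DESTROY_BY_RCU guarantees 'head' remains a
	 * page_cgroup_head (though possibly a recycled one) until
	 * rcu_read_unlock().  The refcnt check is what catches a
	 * chunk that was freed or reused in the meantime.
	 */
	static struct page_cgroup *
	lookup_sketch(struct page_cgroup_root *root, unsigned long pfn)
	{
		struct page_cgroup_head *head;
		struct page_cgroup *pc = NULL;

		rcu_read_lock();
		head = radix_tree_lookup(&root->root_node,
					 pfn >> PCGRP_SHIFT);
		if (head) {
			pc = &head->pc[pfn & PCGRP_MASK];
			if (!pc->refcnt)	/* freed or recycled */
				pc = NULL;
		}
		rcu_read_unlock();
		return pc;
	}

This is also why the shrinker can use spin_trylock() and simply skip a
chunk on contention: a missed chunk costs memory until a later free of
the same range, but never correctness.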