From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Wed, 2 Jul 2008 21:17:38 +0900
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Subject: [RFC][-mm] [7/7] background job for memcg
Message-Id: <20080702211738.f2ae389b.kamezawa.hiroyu@jp.fujitsu.com>
In-Reply-To: <20080702210322.518f6c43.kamezawa.hiroyu@jp.fujitsu.com>
References: <20080702210322.518f6c43.kamezawa.hiroyu@jp.fujitsu.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
Sender: owner-linux-mm@kvack.org
Return-Path: <owner-linux-mm@kvack.org>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
	"xemul@openvz.org" <xemul@openvz.org>,
	"nishimura@mxp.nes.nec.co.jp" <nishimura@mxp.nes.nec.co.jp>,
	"yamamoto@valinux.co.jp" <yamamoto@valinux.co.jp>,
	"hugh@veritas.com" <hugh@veritas.com>,
	"kosaki.motohiro@jp.fujitsu.com" <kosaki.motohiro@jp.fujitsu.com>
List-ID: <linux-mm.kvack.org>

Background reclaim for memcg.

This patch adds a daemon to do background page reclaim based on
high/low watermarks for a memcg.

Almost all of the code has been rewritten from the previous version,
so the version numbering starts from v1 again.

Major changes from the old version:
 i)   Reclaim is now started based on the distance to the limit, not
      on the amount of charged pages. Because the watermarks are
      expressed as distances below the limit, we don't need a strict
      "low < high < limit" check; low and high are guaranteed to be
      below the limit.
 ii)  A single daemon is used instead of one daemon per memcg. This is
      probably simpler than the previous version. (But maybe it's ok
      to start a per-node thread. It's a TODO.)
 iii) Because of (ii), memcg->flags is added.
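
As an example of the new interface (memory.start_reclaim_distance and
memory.stop_reclaim_distance follow from the cftype table below; the
mount point and the numbers are just an illustration): both files take
a distance below the limit, so the start (high) watermark is the
smaller value, and because of the validity checks in
mem_cgroup_write() the stop distance has to be set first.

  # mount -t cgroup none /cgroups -o memory
  # mkdir /cgroups/grp0
  # echo 400M > /cgroups/grp0/memory.limit_in_bytes
  # echo 30M  > /cgroups/grp0/memory.stop_reclaim_distance
  # echo 10M  > /cgroups/grp0/memory.start_reclaim_distance

With this setting, background reclaim starts once usage goes above
390M (less than 10M left to the limit) and continues until usage
falls to 370M or below (30M away from the limit).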

Note: I tried to use a workqueue, but it seems unsuitable for a work
item that loops, so I added a kthread instead.

BTW, if you can think of a better name, please tell me.
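
To illustrate the note above: a workqueue callback is expected to run
to completion quickly, while this job wants to sleep on a waitqueue
and loop over queued groups for as long as necessary, which fits a
dedicated kthread better. A minimal sketch of the pattern, where
some_waitq, work_queued() and do_one_item() are placeholder names and
not part of this patch:

  #include <linux/kthread.h>
  #include <linux/freezer.h>
  #include <linux/wait.h>

  static DECLARE_WAIT_QUEUE_HEAD(some_waitq);
  static bool work_queued(void);   /* placeholder: is anything queued? */
  static void do_one_item(void);   /* placeholder: process one item */

  static int loop_daemon(void *unused)
  {
          set_freezable();
          while (!kthread_should_stop()) {
                  /* sleep until work is queued or we are told to stop */
                  wait_event_interruptible(some_waitq,
                                  work_queued() || kthread_should_stop());
                  try_to_freeze();
                  while (work_queued())
                          do_one_item();
          }
          return 0;
  }

The thread is started once with kthread_run(loop_daemon, NULL, "..."),
which is what mem_cgroup_start_daemon() does for memcg_mswapd below.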

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
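
A note for reviewers: res_counter_charge_distance() and
res_counter_distance_to_limit() come from the earlier patches of this
series and are not shown in this mail. Judging from the call sites
below, their behaviour is roughly the following (my sketch only, not
the actual code of those patches):

  #include <linux/res_counter.h>
  #include <linux/spinlock.h>

  /* How far the current usage is below the limit (0 if at/over it). */
  static unsigned long long
  res_counter_distance_to_limit(struct res_counter *cnt)
  {
          unsigned long long distance;
          unsigned long flags;

          spin_lock_irqsave(&cnt->lock, flags);
          if (cnt->usage >= cnt->limit)
                  distance = 0;
          else
                  distance = cnt->limit - cnt->usage;
          spin_unlock_irqrestore(&cnt->lock, flags);
          return distance;
  }

  /* Like res_counter_charge(), but also reports the remaining distance. */
  static int res_counter_charge_distance(struct res_counter *cnt,
                  unsigned long val, unsigned long long *distance)
  {
          int ret;
          unsigned long flags;

          spin_lock_irqsave(&cnt->lock, flags);
          ret = res_counter_charge_locked(cnt, val);
          if (cnt->usage >= cnt->limit)
                  *distance = 0;
          else
                  *distance = cnt->limit - cnt->usage;
          spin_unlock_irqrestore(&cnt->lock, flags);
          return ret;
  }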
*/ break; @@ -1063,6 +1217,18 @@ static struct cftype mem_cgroup_files[] .name = "stat", .read_map = mem_control_stat_show, }, + { + .name = "start_reclaim_distance", + .private = MEMCG_HWMARK, + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, + { + .name = "stop_reclaim_distance", + .private = MEMCG_LWMARK, + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, }; static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) @@ -1124,6 +1290,19 @@ static void mem_cgroup_free(struct mem_c vfree(mem); } +static int mem_cgroup_start_daemon(void) +{ + struct task_struct *result; + int ret = 0; + result = kthread_run(memcg_mswapd, NULL, "memcontrol"); + if (IS_ERR(result)) { + printk("failed to start memory controller daemon\n"); + mswapd_daemon = NULL; + } else + mswapd_daemon = result; + return ret; +} +late_initcall(mem_cgroup_start_daemon); static struct cgroup_subsys_state * mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) @@ -1146,12 +1325,15 @@ mem_cgroup_create(struct cgroup_subsys * if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; + /* mem->flags is cleared by memset() */ + return &mem->css; free_out: for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); if (cont->parent != NULL) mem_cgroup_free(mem); + return ERR_PTR(-ENOMEM); } @@ -1159,7 +1341,16 @@ static void mem_cgroup_pre_destroy(struc struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + + set_bit(MEMCG_FLAG_OBSOLETE, &mem->flags); + if (mswapd_daemon) + wake_up(mswapd_daemon); + /* wait for being removed from background reclaim queue */ + wait_event_interruptible(memcg_destroy_waitq, + !(test_bit(MEMCG_FLAG_IN_RECLAIM, &mem->flags))); + mem_cgroup_force_empty(mem); + } static void mem_cgroup_destroy(struct cgroup_subsys *ss, -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org