On Mon, Apr 25, 2011 at 2:43 AM, KAMEZAWA Hiroyuki < kamezawa.hiroyu@jp.fujitsu.com> wrote: > At tuning memcg background reclaim, cpu usage per memcg's work is an > interesting information because some amount of shared resource is used. > (i.e. background reclaim uses workqueue.) And other information as > pgscan and pgreclaim is important. > > This patch shows them via memory.stat with cpu usage for direct reclaim > and softlimit reclaim and page scan statistics. > > > # cat /cgroup/memory/A/memory.stat > .... > direct_elapsed_ns 0 > soft_elapsed_ns 0 > wmark_elapsed_ns 103566424 > direct_scanned 0 > soft_scanned 0 > wmark_scanned 29303 > direct_freed 0 > soft_freed 0 > wmark_freed 29290 > > > Signed-off-by: KAMEZAWA Hiroyuki > --- > Documentation/cgroups/memory.txt | 18 +++++++++ > include/linux/memcontrol.h | 6 +++ > include/linux/swap.h | 7 +++ > mm/memcontrol.c | 77 > +++++++++++++++++++++++++++++++++++++-- > mm/vmscan.c | 15 +++++++ > 5 files changed, 120 insertions(+), 3 deletions(-) > > Index: memcg/mm/memcontrol.c > =================================================================== > --- memcg.orig/mm/memcontrol.c > +++ memcg/mm/memcontrol.c > @@ -274,6 +274,17 @@ struct mem_cgroup { > bool bgreclaim_resched; > struct delayed_work bgreclaim_work; > /* > + * reclaim statistics (not per zone, node) > + */ > + spinlock_t elapsed_lock; > + u64 bgreclaim_elapsed; > + u64 direct_elapsed; > + u64 soft_elapsed; > + > + u64 reclaim_scan[NR_RECLAIM_CONTEXTS]; > + u64 reclaim_freed[NR_RECLAIM_CONTEXTS]; > + > + /* > * Should we move charges of a task when a task is moved into this > * mem_cgroup ? And what type of charges should we move ? 
> */ > @@ -1346,6 +1357,18 @@ void mem_cgroup_clear_unreclaimable(stru > return; > } > > +void mem_cgroup_reclaim_statistics(struct mem_cgroup *mem, > + int context, unsigned long scanned, > + unsigned long freed) > +{ > + if (!mem) > + return; > + spin_lock(&mem->elapsed_lock); > + mem->reclaim_scan[context] += scanned; > + mem->reclaim_freed[context] += freed; > + spin_unlock(&mem->elapsed_lock); > +} > + > unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, > struct list_head *dst, > unsigned long *scanned, int order, > @@ -1692,6 +1715,7 @@ static int mem_cgroup_hierarchical_recla > bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; > unsigned long excess; > unsigned long nr_scanned; > + s64 start, end; > > excess = res_counter_soft_limit_excess(&root_mem->res) >> > PAGE_SHIFT; > > @@ -1735,16 +1759,27 @@ static int mem_cgroup_hierarchical_recla > } > /* we use swappiness of local cgroup */ > if (check_soft) { > + start = sched_clock(); > ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, > noswap, mem_cgroup_swappiness(victim), zone, > &nr_scanned); > *total_scanned += nr_scanned; > + end = sched_clock(); > + spin_lock(&victim->elapsed_lock); > + victim->soft_elapsed += end - start; > + spin_unlock(&victim->elapsed_lock); > mem_cgroup_soft_steal(victim, ret); > mem_cgroup_soft_scan(victim, nr_scanned); > - } else > + } else { > + start = sched_clock(); > ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, > noswap, > > mem_cgroup_swappiness(victim)); > + end = sched_clock(); > + spin_lock(&victim->elapsed_lock); > + victim->direct_elapsed += end - start; > + spin_unlock(&victim->elapsed_lock); > + } > css_put(&victim->css); > /* > * At shrinking usage, we can't check we should stop here or > @@ -3702,15 +3737,22 @@ static void memcg_bgreclaim(struct work_ > struct delayed_work *dw = to_delayed_work(work); > struct mem_cgroup *mem = > container_of(dw, struct mem_cgroup, bgreclaim_work); > - int delay = 0; > + int delay; > unsigned long 
long required, usage, hiwat; > > + delay = 0; > hiwat = res_counter_read_u64(&mem->res, RES_HIGH_WMARK_LIMIT); > usage = res_counter_read_u64(&mem->res, RES_USAGE); > required = usage - hiwat; > if (required >= 0) { > + u64 start, end; > required = ((usage - hiwat) >> PAGE_SHIFT) + 1; > + start = sched_clock(); > delay = shrink_mem_cgroup(mem, (long)required); > + end = sched_clock(); > + spin_lock(&mem->elapsed_lock); > + mem->bgreclaim_elapsed += end - start; > + spin_unlock(&mem->elapsed_lock); > } > if (!mem->bgreclaim_resched || > mem_cgroup_watermark_ok(mem, CHARGE_WMARK_HIGH)) { > @@ -4152,6 +4194,15 @@ enum { > MCS_INACTIVE_FILE, > MCS_ACTIVE_FILE, > MCS_UNEVICTABLE, > + MCS_DIRECT_ELAPSED, > + MCS_SOFT_ELAPSED, > + MCS_WMARK_ELAPSED, > + MCS_DIRECT_SCANNED, > + MCS_SOFT_SCANNED, > + MCS_WMARK_SCANNED, > + MCS_DIRECT_FREED, > + MCS_SOFT_FREED, > + MCS_WMARK_FREED, > NR_MCS_STAT, > }; > > @@ -4177,7 +4228,16 @@ struct { > {"active_anon", "total_active_anon"}, > {"inactive_file", "total_inactive_file"}, > {"active_file", "total_active_file"}, > - {"unevictable", "total_unevictable"} > + {"unevictable", "total_unevictable"}, > + {"direct_elapsed_ns", "total_direct_elapsed_ns"}, > + {"soft_elapsed_ns", "total_soft_elapsed_ns"}, > + {"wmark_elapsed_ns", "total_wmark_elapsed_ns"}, > + {"direct_scanned", "total_direct_scanned"}, > + {"soft_scanned", "total_soft_scanned"}, > + {"wmark_scanned", "total_wmark_scanned"}, > + {"direct_freed", "total_direct_freed"}, > + {"soft_freed", "total_soft_freed"}, > + {"wmark_freed", "total_wmark_freed"} > }; > > > @@ -4185,6 +4245,7 @@ static void > mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat > *s) > { > s64 val; > + int i; > > /* per cpu stat */ > val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); > @@ -4221,6 +4282,15 @@ mem_cgroup_get_local_stat(struct mem_cgr > s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; > val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); > s->stat[MCS_UNEVICTABLE] 
+= val * PAGE_SIZE; > + > + /* reclaim stats */ > + s->stat[MCS_DIRECT_ELAPSED] += mem->direct_elapsed; > + s->stat[MCS_SOFT_ELAPSED] += mem->soft_elapsed; > + s->stat[MCS_WMARK_ELAPSED] += mem->bgreclaim_elapsed; > + for (i = 0; i < NR_RECLAIM_CONTEXTS; i++) { > + s->stat[i + MCS_DIRECT_SCANNED] += mem->reclaim_scan[i]; > + s->stat[i + MCS_DIRECT_FREED] += mem->reclaim_freed[i]; > + } > } > > static void > @@ -4889,6 +4959,7 @@ static struct mem_cgroup *mem_cgroup_all > goto out_free; > spin_lock_init(&mem->pcp_counter_lock); > INIT_DELAYED_WORK(&mem->bgreclaim_work, memcg_bgreclaim); > + spin_lock_init(&mem->elapsed_lock); > mem->bgreclaim_resched = true; > return mem; > > Index: memcg/include/linux/memcontrol.h > =================================================================== > --- memcg.orig/include/linux/memcontrol.h > +++ memcg/include/linux/memcontrol.h > @@ -90,6 +90,8 @@ extern int mem_cgroup_select_victim_node > const nodemask_t *nodes); > > int shrink_mem_cgroup(struct mem_cgroup *mem, long required); > +void mem_cgroup_reclaim_statistics(struct mem_cgroup *mem, int context, > + unsigned long scanned, unsigned long freed); > > static inline > int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup > *cgroup) > @@ -423,6 +425,10 @@ static inline > void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item > idx) > { > } > +void mem_cgroup_reclaim_statistics(struct mem_cgroup *mem, int context, > + unsigned long scanned, unsigned long freed) > +{ > +} > #endif /* CONFIG_CGROUP_MEM_CONT */ > > #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) > Index: memcg/include/linux/swap.h > =================================================================== > --- memcg.orig/include/linux/swap.h > +++ memcg/include/linux/swap.h > @@ -250,6 +250,13 @@ static inline void lru_cache_add_file(st > #define ISOLATE_ACTIVE 1 /* Isolate active pages. */ > #define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. 
> */ > > +/* context for memory reclaim (comes from memory cgroup). */ > +enum { > + RECLAIM_DIRECT, /* under direct reclaim */ > + RECLAIM_KSWAPD, /* under global kswapd's soft limit */ > + RECLAIM_WMARK, /* under background reclaim by watermark */ > + NR_RECLAIM_CONTEXTS > +}; > /* linux/mm/vmscan.c */ > extern unsigned long try_to_free_pages(struct zonelist *zonelist, int > order, > gfp_t gfp_mask, nodemask_t *mask); > Index: memcg/mm/vmscan.c > =================================================================== > --- memcg.orig/mm/vmscan.c > +++ memcg/mm/vmscan.c > @@ -72,6 +72,9 @@ typedef unsigned __bitwise__ reclaim_mod > #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) > #define RECLAIM_MODE_COMPACTION ((__force > reclaim_mode_t)0x10u) > > +/* 3 reclaim contexts for memcg statistics. */ > +enum {DIRECT_RECLAIM, KSWAPD_RECLAIM, WMARK_RECLAIM}; > + > struct scan_control { > /* Incremented by the number of inactive pages that were scanned */ > unsigned long nr_scanned; > @@ -107,6 +110,7 @@ struct scan_control { > > /* Which cgroup do we reclaim from */ > struct mem_cgroup *mem_cgroup; > + int reclaim_context; > > /* > * Nodemask of nodes allowed by the caller. 
If NULL, all nodes > @@ -2116,6 +2120,10 @@ out: > delayacct_freepages_end(); > put_mems_allowed(); > > + if (!scanning_global_lru(sc)) > + mem_cgroup_reclaim_statistics(sc->mem_cgroup, > + sc->reclaim_context, total_scanned, > sc->nr_reclaimed); > + > if (sc->nr_reclaimed) > return sc->nr_reclaimed; > > @@ -2178,6 +2186,7 @@ unsigned long mem_cgroup_shrink_node_zon > .swappiness = swappiness, > .order = 0, > .mem_cgroup = mem, > + .reclaim_context = RECLAIM_KSWAPD, > }; > > sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | > @@ -2198,6 +2207,8 @@ unsigned long mem_cgroup_shrink_node_zon > > trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); > > + mem_cgroup_reclaim_statistics(sc.mem_cgroup, > + sc.reclaim_context, sc.nr_scanned, > sc.nr_reclaimed); > *nr_scanned = sc.nr_scanned; > return sc.nr_reclaimed; > } > @@ -2217,6 +2228,7 @@ unsigned long try_to_free_mem_cgroup_pag > .swappiness = swappiness, > .order = 0, > .mem_cgroup = mem_cont, > + .reclaim_context = RECLAIM_DIRECT, > .nodemask = NULL, /* we don't care the placement */ > }; > > @@ -2384,6 +2396,7 @@ int shrink_mem_cgroup(struct mem_cgroup > .may_swap = 1, > .order = 0, > .mem_cgroup = mem, > + .reclaim_context = RECLAIM_WMARK, > }; > /* writepage will be set later per zone */ > sc.may_writepage = 0; > @@ -2434,6 +2447,8 @@ int shrink_mem_cgroup(struct mem_cgroup > if (sc.nr_reclaimed > sc.nr_to_reclaim/2) > delay = 0; > out: > + mem_cgroup_reclaim_statistics(sc.mem_cgroup, sc.reclaim_context, > + total_scanned, sc.nr_reclaimed); > current->flags &= ~PF_SWAPWRITE; > return delay; > } > Index: memcg/Documentation/cgroups/memory.txt > =================================================================== > --- memcg.orig/Documentation/cgroups/memory.txt > +++ memcg/Documentation/cgroups/memory.txt > @@ -398,6 +398,15 @@ active_anon - # of bytes of anonymous an > inactive_file - # of bytes of file-backed memory on inactive LRU list. > active_file - # of bytes of file-backed memory on active LRU list. 
> unevictable - # of bytes of memory that cannot be reclaimed (mlocked > etc). > +direct_elapsed_ns - # of elapsed cpu time at hard limit reclaim (ns) > +soft_elapsed_ns - # of elapsed cpu time at soft limit reclaim (ns) > +wmark_elapsed_ns - # of elapsed cpu time at hi/low watermark reclaim (ns) > +direct_scanned - # of page scans at hard limit reclaim > +soft_scanned - # of page scans at soft limit reclaim > +wmark_scanned - # of page scans at hi/low watermark reclaim > +direct_freed - # of page freeing at hard limit reclaim > +soft_freed - # of page freeing at soft limit reclaim > +wmark_freed - # of page freeing at hi/low watermark reclaim > > # status considering hierarchy (see memory.use_hierarchy settings) > > @@ -421,6 +430,15 @@ total_active_anon - sum of all children' > total_inactive_file - sum of all children's "inactive_file" > total_active_file - sum of all children's "active_file" > total_unevictable - sum of all children's "unevictable" > +total_direct_elapsed_ns - sum of all children's "direct_elapsed_ns" > +total_soft_elapsed_ns - sum of all children's "soft_elapsed_ns" > +total_wmark_elapsed_ns - sum of all children's "wmark_elapsed_ns" > +total_direct_scanned - sum of all children's "direct_scanned" > +total_soft_scanned - sum of all children's "soft_scanned" > +total_wmark_scanned - sum of all children's "wmark_scanned" > +total_direct_freed - sum of all children's "direct_freed" > +total_soft_freed - sum of all children's "soft_freed" > +total_wmark_freed - sum of all children's "wmark_freed" > > # The following additional stats are dependent on CONFIG_DEBUG_VM. > > Those stats look good to me. Thanks --Ying