From: Glauber Costa <glommer@parallels.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: cgroups@vger.kernel.org, devel@openvz.org, linux-mm@kvack.org,
	"Kirill A. Shutemov" <kirill@shutemov.name>,
	Greg Thelen <gthelen@google.com>,
	Johannes Weiner <jweiner@redhat.com>,
	Michal Hocko <mhocko@suse.cz>, Paul Turner <pjt@google.com>,
	Frederic Weisbecker <fweisbec@gmail.com>,
	Pekka Enberg <penberg@kernel.org>,
	Christoph Lameter <cl@linux.com>
Subject: Re: [PATCH 5/7] shrink support for memcg kmem controller
Date: Wed, 22 Feb 2012 18:53:25 +0400
Message-ID: <4F450165.6090204@parallels.com>
In-Reply-To: <20120222104256.136b8393.kamezawa.hiroyu@jp.fujitsu.com>

On 02/22/2012 05:42 AM, KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Feb 2012 15:34:37 +0400
> Glauber Costa <glommer@parallels.com> wrote:
>
>> This patch adds the shrinker interface to the proposed memcg kmem
>> controller. With this, soft limits start being meaningful. I haven't
>> played too much with soft limits themselves, since they are still a
>> work in progress for the general case as well. But this patch at
>> least makes vmscan.c no longer skip shrink_slab for the memcg case.
>>
>> It also allows us to set the hard limit to a value lower than the
>> current usage, as is already possible for the current memcg: a
>> reclaim is carried out, and if we succeed in freeing enough kernel
>> memory, we can lower the limit.
>>
>> Signed-off-by: Glauber Costa <glommer@parallels.com>
>> CC: Kirill A. Shutemov <kirill@shutemov.name>
>> CC: Greg Thelen <gthelen@google.com>
>> CC: Johannes Weiner <jweiner@redhat.com>
>> CC: Michal Hocko <mhocko@suse.cz>
>> CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>> CC: Paul Turner <pjt@google.com>
>> CC: Frederic Weisbecker <fweisbec@gmail.com>
>> CC: Pekka Enberg <penberg@kernel.org>
>> CC: Christoph Lameter <cl@linux.com>
>> ---
>>   include/linux/memcontrol.h |    5 +++
>>   include/linux/shrinker.h   |    4 ++
>>   mm/memcontrol.c            |   87 ++++++++++++++++++++++++++++++++++++++++++--
>>   mm/vmscan.c                |   60 +++++++++++++++++++++++++++++--
>>   4 files changed, 150 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 6138d10..246b2d4 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -33,12 +33,16 @@ struct mm_struct;
>>   struct memcg_kmem_cache {
>>   	struct kmem_cache *cache;
>>   	struct work_struct destroy;
>> +	struct list_head lru;
>> +	u32 nr_objects;
>>   	struct mem_cgroup *memcg; /* Should be able to do without this */
>>   };
>>
>>   struct memcg_cache_struct {
>>   	int index;
>>   	struct kmem_cache *cache;
>> +	int (*shrink_fn)(struct shrinker *shrink, struct shrink_control *sc);
>> +	struct shrinker shrink;
>>   };
>>
>>   enum memcg_cache_indexes {
>> @@ -53,6 +57,7 @@ struct mem_cgroup *memcg_from_shrinker(struct shrinker *s);
>>   struct memcg_kmem_cache *memcg_cache_get(struct mem_cgroup *memcg, int index);
>>   void register_memcg_cache(struct memcg_cache_struct *cache);
>>   void memcg_slab_destroy(struct kmem_cache *cache, struct mem_cgroup *memcg);
>> +bool memcg_slab_reclaim(struct mem_cgroup *memcg);
>>
>>   struct kmem_cache *
>>   kmem_cache_dup(struct mem_cgroup *memcg, struct kmem_cache *base);
>> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
>> index 07ceb97..11efdba 100644
>> --- a/include/linux/shrinker.h
>> +++ b/include/linux/shrinker.h
>> @@ -1,6 +1,7 @@
>>   #ifndef _LINUX_SHRINKER_H
>>   #define _LINUX_SHRINKER_H
>>
>> +struct mem_cgroup;
>>   /*
>>    * This struct is used to pass information from page reclaim to the shrinkers.
>>    * We consolidate the values for easier extension later.
>> @@ -10,6 +11,7 @@ struct shrink_control {
>>
>>   	/* How many slab objects shrinker() should scan and try to reclaim */
>>   	unsigned long nr_to_scan;
>> +	struct mem_cgroup *memcg;
>>   };
>>
>>   /*
>> @@ -40,4 +42,6 @@ struct shrinker {
>>   #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
>>   extern void register_shrinker(struct shrinker *);
>>   extern void unregister_shrinker(struct shrinker *);
>> +
>> +extern void register_shrinker_memcg(struct shrinker *);
>>   #endif
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 1b1db88..9c89a3c 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3460,6 +3460,54 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>   	return ret;
>>   }
>>
>> +static int mem_cgroup_resize_kmem_limit(struct mem_cgroup *memcg,
>> +					unsigned long long val)
>> +{
>> +
>> +	int retry_count;
>> +	int ret = 0;
>> +	int children = mem_cgroup_count_children(memcg);
>> +	u64 curusage, oldusage;
>> +
>> +	struct shrink_control shrink = {
>> +		.gfp_mask = GFP_KERNEL,
>> +		.memcg = memcg,
>> +	};
>> +
>> +	/*
>> +	 * To keep hierarchical_reclaim simple, how long we should retry
>> +	 * depends on the caller. We set our retry count to be a function
>> +	 * of the number of children we should visit in this loop.
>> +	 */
>> +	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
>> +
>> +	oldusage = res_counter_read_u64(&memcg->kmem, RES_USAGE);
>> +
>> +	while (retry_count) {
>> +		if (signal_pending(current)) {
>> +			ret = -EINTR;
>> +			break;
>> +		}
>> +		mutex_lock(&set_limit_mutex);
>> +		ret = res_counter_set_limit(&memcg->kmem, val);
>> +		mutex_unlock(&set_limit_mutex);
>> +		if (!ret)
>> +			break;
>> +
>> +		shrink_slab(&shrink, 0, 0);
>> +
>> +		curusage = res_counter_read_u64(&memcg->kmem, RES_USAGE);
>> +
>> +		/* Usage is reduced ? */
>> +		if (curusage >= oldusage)
>> +			retry_count--;
>> +		else
>> +			oldusage = curusage;
>> +	}
>> +	return ret;
>> +
>> +}
>> +
>>   static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>>   					unsigned long long val)
>>   {
>> @@ -3895,13 +3943,17 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
>>   			break;
>>   		if (type == _MEM)
>>   			ret = mem_cgroup_resize_limit(memcg, val);
>> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
>>   		else if (type == _KMEM) {
>>   			if (!memcg->kmem_independent_accounting) {
>>   				ret = -EINVAL;
>>   				break;
>>   			}
>> -			ret = res_counter_set_limit(&memcg->kmem, val);
>> -		} else
>> +
>> +			ret = mem_cgroup_resize_kmem_limit(memcg, val);
>> +		}
>> +#endif
>> +		else
>>   			ret = mem_cgroup_resize_memsw_limit(memcg, val);
>>   		break;
>>   	case RES_SOFT_LIMIT:
>> @@ -5007,9 +5059,19 @@ struct memcg_kmem_cache *memcg_cache_get(struct mem_cgroup *memcg, int index)
>>
>>   void register_memcg_cache(struct memcg_cache_struct *cache)
>>   {
>> +	struct shrinker *shrink;
>> +
>>   	BUG_ON(kmem_avail_caches[cache->index]);
>>
>>   	kmem_avail_caches[cache->index] = cache;
>> +	if (!kmem_avail_caches[cache->index]->shrink_fn)
>> +		return;
>> +
>> +	shrink = &kmem_avail_caches[cache->index]->shrink;
>> +	shrink->seeks = DEFAULT_SEEKS;
>> +	shrink->shrink = kmem_avail_caches[cache->index]->shrink_fn;
>> +	shrink->batch = 1024;
>> +	register_shrinker_memcg(shrink);
>>   }
>>
>>   #define memcg_kmem(memcg) \
>> @@ -5055,8 +5117,21 @@ int memcg_kmem_newpage(struct mem_cgroup *memcg, struct page *page, unsigned lon
>>   {
>>   	unsigned long size = pages << PAGE_SHIFT;
>>   	struct res_counter *fail;
>> +	int ret;
>> +	bool do_softlimit;
>> +
>> +	ret = res_counter_charge(memcg_kmem(memcg), size, &fail);
>> +	if (unlikely(mem_cgroup_event_ratelimit(memcg,
>> +						MEM_CGROUP_TARGET_THRESH))) {
>> +
>> +		do_softlimit = mem_cgroup_event_ratelimit(memcg,
>> +						MEM_CGROUP_TARGET_SOFTLIMIT);
>> +		mem_cgroup_threshold(memcg);
>> +		if (unlikely(do_softlimit))
>> +			mem_cgroup_update_tree(memcg, page);
>> +	}
>
> Do we need to have this hook here?
> (BTW, please don't duplicate...)
>
>
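Fair point on the duplication: this is the same threshold/softlimit
dance that memcg_check_events() already does. If the hook stays, a
rough sketch of a non-duplicated version (untested, and assuming
memcg_check_events() can be called from this path):

int memcg_kmem_newpage(struct mem_cgroup *memcg, struct page *page,
		       unsigned long pages)
{
	unsigned long size = pages << PAGE_SHIFT;
	struct res_counter *fail;
	int ret;

	ret = res_counter_charge(memcg_kmem(memcg), size, &fail);
	/* let the existing helper do the ratelimited event checks */
	memcg_check_events(memcg, page);
	return ret;
}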
>>
>> -	return res_counter_charge(memcg_kmem(memcg), size, &fail);
>> +	return ret;
>>   }
>>
>>   void memcg_kmem_freepage(struct mem_cgroup *memcg, struct page *page, unsigned long pages)
>> @@ -5083,6 +5158,7 @@ void memcg_create_kmem_caches(struct mem_cgroup *memcg)
>>   		else
>>   			memcg->kmem_cache[i].cache = kmem_cache_dup(memcg, cache);
>>   		INIT_WORK(&memcg->kmem_cache[i].destroy, memcg_cache_destroy);
>> +		INIT_LIST_HEAD(&memcg->kmem_cache[i].lru);
>>   		memcg->kmem_cache[i].memcg = memcg;
>>   	}
>>   }
>> @@ -5157,6 +5233,11 @@ free_out:
>>   	return ERR_PTR(error);
>>   }
>>
>> +bool memcg_slab_reclaim(struct mem_cgroup *memcg)
>> +{
>> +	return !memcg->kmem_independent_accounting;
>> +}
>> +
>>   void memcg_slab_destroy(struct kmem_cache *cache, struct mem_cgroup *memcg)
>>   {
>>   	int i;
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index c52b235..b9bceb6 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -159,6 +159,23 @@ long vm_total_pages;	/* The total number of pages which the VM controls */
>>   static LIST_HEAD(shrinker_list);
>>   static DECLARE_RWSEM(shrinker_rwsem);
>>
>> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
>> +/*
>> + * If we could guarantee that the root mem cgroup will always exist, we
>> + * could just use the normal shrinker_list and assume that the root
>> + * memcg is passed as a parameter. But we're not quite there yet.
>> + * Because of that, the shrinkers for the memcg case can be different
>> + * from the normal shrinkers for the same object. This is not ideal,
>> + * but it is a step in that direction.
>> + *
>> + * Also, not all caches will have a memcg version (this, too, is likely
>> + * to change), so scanning the whole list would be wasteful.
>> + *
>> + * I am, however, using the same lock for both lists. Updates to them
>> + * should be infrequent, so I don't expect this to generate contention.
>> + */
>> +static LIST_HEAD(shrinker_memcg_list);
>> +#endif
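For reference, since the hunk with its body got trimmed from the quote:
register_shrinker_memcg() is expected to mirror register_shrinker(),
just targeting the list above. A minimal sketch:

void register_shrinker_memcg(struct shrinker *shrinker)
{
	/* shrinker_rwsem protects both lists; updates are infrequent */
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_memcg_list);
	up_write(&shrinker_rwsem);
}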
>> +
>>   #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>>   static bool global_reclaim(struct scan_control *sc)
>>   {
>> @@ -169,6 +186,11 @@ static bool scanning_global_lru(struct mem_cgroup_zone *mz)
>>   {
>>   	return !mz->mem_cgroup;
>>   }
>> +
>> +static bool global_slab_reclaim(struct scan_control *sc)
>> +{
>> +	return !memcg_slab_reclaim(sc->target_mem_cgroup);
>> +}
>
> Do we need this new function? Isn't global_reclaim() enough?

When we're tracking kmem separately, yes, because the soft limit is
tracked on a different counter.

However, I'll try to merge it somehow with the normal code in the next 
run, and I'll keep this in mind. Let's see how it ends up...
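
Roughly what I have in mind for the merge (a sketch only; the
NULL-target guard is my addition, since memcg_slab_reclaim() can't be
asked about a NULL memcg):

static bool global_slab_reclaim(struct scan_control *sc)
{
	/*
	 * No target memcg means plain global reclaim. Otherwise, slab
	 * is reclaimed per-memcg only when kmem is accounted
	 * independently of user memory.
	 */
	if (global_reclaim(sc))
		return true;
	return !memcg_slab_reclaim(sc->target_mem_cgroup);
}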
