Re: [PATCH v4 4/4] memcg: implement memory thresholds

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: containers@lists.linux-foundation.org, linux-mm@kvack.org,
	Paul Menage <menage@google.com>, Li Zefan <lizf@cn.fujitsu.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Pavel Emelyanov <xemul@openvz.org>,
	Dan Malek <dan@embeddedalley.com>,
	Vladislav Buzov <vbuzov@embeddedalley.com>,
	Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>,
	Alexander Shishkin <virtuoso@slind.org>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v4 4/4] memcg: implement memory thresholds
Date: Mon, 28 Dec 2009 13:14:40 +0900	[thread overview]
Message-ID: <20091228131440.3a49a943.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <cc557aab0912271923v4a4ed8cco168193c63efd44f@mail.gmail.com>

On Mon, 28 Dec 2009 05:23:51 +0200
"Kirill A. Shutemov" <kirill@shutemov.name> wrote:

> On Mon, Dec 28, 2009 at 4:43 AM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Sun, 27 Dec 2009 04:09:02 +0200
> > "Kirill A. Shutemov" <kirill@shutemov.name> wrote:
> >> A /*
> >> A  * Statistics for memory cgroup.
> >> @@ -72,6 +79,8 @@ enum mem_cgroup_stat_index {
> >> A  A  A  MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
> >> A  A  A  MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
> >> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  used by soft limit implementation */
> >> + A  A  MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
> >> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  used by threshold implementation */
> >>
> >> A  A  A  MEM_CGROUP_STAT_NSTATS,
> >> A };
> >> @@ -182,6 +191,20 @@ struct mem_cgroup_tree {
> >>
> >> A static struct mem_cgroup_tree soft_limit_tree __read_mostly;
> >>
> >> +struct mem_cgroup_threshold {
> >> + A  A  struct eventfd_ctx *eventfd;
> >> + A  A  u64 threshold;
> >> +};
> >> +
> >> +struct mem_cgroup_threshold_ary {
> >> + A  A  unsigned int size;
> >> + A  A  atomic_t cur;
> >> + A  A  struct mem_cgroup_threshold entries[0];
> >> +};
> >> +
> > Why "array" is a choice here ? IOW, why not list ?
> 
> We need to be able to walk the thresholds in both directions to be fast.
> AFAIK, it's impossible with an RCU-protected list.
> 
I couldn't follow your code. Could you add a comment such as

  atomic_t cur; /* An array index points to XXXXX */

or use a better name?

> > How many waiters are expected as usual workload ?
> 
> The array of thresholds is read once per 100 page in/out events on every CPU.
> Write access happens only when registering a new threshold.
> 



> >> +static bool mem_cgroup_threshold_check(struct mem_cgroup* mem);
> >> +static void mem_cgroup_threshold(struct mem_cgroup* mem);
> >> +
> >> A /*
> >> A  * The memory controller data structure. The memory controller controls both
> >> A  * page cache and RSS per cgroup. We would eventually like to provide
> >> @@ -233,6 +256,15 @@ struct mem_cgroup {
> >> A  A  A  /* set when res.limit == memsw.limit */
> >> A  A  A  bool A  A  A  A  A  A memsw_is_minimum;
> >>
> >> + A  A  /* protect arrays of thresholds */
> >> + A  A  struct mutex thresholds_lock;
> >> +
> >> + A  A  /* thresholds for memory usage. RCU-protected */
> >> + A  A  struct mem_cgroup_threshold_ary *thresholds;
> >> +
> >> + A  A  /* thresholds for mem+swap usage. RCU-protected */
> >> + A  A  struct mem_cgroup_threshold_ary *memsw_thresholds;
> >> +
> >> A  A  A  /*
> >> A  A  A  A * statistics. This must be placed at the end of memcg.
> >> A  A  A  A */
> >> @@ -525,6 +557,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
> >> A  A  A  A  A  A  A  __mem_cgroup_stat_add_safe(cpustat,
> >> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
> >> A  A  A  __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
> >> + A  A  __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
> >> +
> >> A  A  A  put_cpu();
> >> A }
> >>
> >> @@ -1510,6 +1544,8 @@ charged:
> >> A  A  A  if (mem_cgroup_soft_limit_check(mem))
> >> A  A  A  A  A  A  A  mem_cgroup_update_tree(mem, page);
> >> A done:
> >> + A  A  if (mem_cgroup_threshold_check(mem))
> >> + A  A  A  A  A  A  mem_cgroup_threshold(mem);
> >> A  A  A  return 0;
> >> A nomem:
> >> A  A  A  css_put(&mem->css);
> >> @@ -2075,6 +2111,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
> >>
> >> A  A  A  if (mem_cgroup_soft_limit_check(mem))
> >> A  A  A  A  A  A  A  mem_cgroup_update_tree(mem, page);
> >> + A  A  if (mem_cgroup_threshold_check(mem))
> >> + A  A  A  A  A  A  mem_cgroup_threshold(mem);
> >> A  A  A  /* at swapout, this memcg will be accessed to record to swap */
> >> A  A  A  if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> >> A  A  A  A  A  A  A  css_put(&mem->css);
> >> @@ -3071,12 +3109,246 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
> >> A  A  A  return 0;
> >> A }
> >>
> >> +static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
> >> +{
> >> + A  A  bool ret = false;
> >> + A  A  int cpu;
> >> + A  A  s64 val;
> >> + A  A  struct mem_cgroup_stat_cpu *cpustat;
> >> +
> >> + A  A  cpu = get_cpu();
> >> + A  A  cpustat = &mem->stat.cpustat[cpu];
> >> + A  A  val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
> >> + A  A  if (unlikely(val < 0)) {
> >> + A  A  A  A  A  A  __mem_cgroup_stat_set(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
> >> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  THRESHOLDS_EVENTS_THRESH);
> >> + A  A  A  A  A  A  ret = true;
> >> + A  A  }
> >> + A  A  put_cpu();
> >> + A  A  return ret;
> >> +}
> >> +
> >> +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
> >> +{
> >> + A  A  struct mem_cgroup_threshold_ary *thresholds;
> >> + A  A  u64 usage = mem_cgroup_usage(memcg, swap);
> >> + A  A  int i, cur;
> >> +
> >> + A  A  rcu_read_lock();
> >> + A  A  if (!swap) {
> >> + A  A  A  A  A  A  thresholds = rcu_dereference(memcg->thresholds);
> >> + A  A  } else {
> >> + A  A  A  A  A  A  thresholds = rcu_dereference(memcg->memsw_thresholds);
> >> + A  A  }
> >> +
> >> + A  A  if (!thresholds)
> >> + A  A  A  A  A  A  goto unlock;
> >> +
> >> + A  A  cur = atomic_read(&thresholds->cur);
> >> +
> >> + A  A  /* Check if a threshold crossed in any direction */
> >> +
> >> + A  A  for(i = cur; i >= 0 &&
> >> + A  A  A  A  A  A  unlikely(thresholds->entries[i].threshold > usage); i--) {
> >> + A  A  A  A  A  A  atomic_dec(&thresholds->cur);
> >> + A  A  A  A  A  A  eventfd_signal(thresholds->entries[i].eventfd, 1);
> >> + A  A  }
> >> +
> >> + A  A  for(i = cur + 1; i < thresholds->size &&
> >> + A  A  A  A  A  A  unlikely(thresholds->entries[i].threshold <= usage); i++) {
> >> + A  A  A  A  A  A  atomic_inc(&thresholds->cur);
> >> + A  A  A  A  A  A  eventfd_signal(thresholds->entries[i].eventfd, 1);
> >> + A  A  }

Could you add an explanation here?

> >> +unlock:
> >> + A  A  rcu_read_unlock();
> >> +}
> >> +
> >> +static void mem_cgroup_threshold(struct mem_cgroup *memcg)
> >> +{
> >> + A  A  __mem_cgroup_threshold(memcg, false);
> >> + A  A  if (do_swap_account)
> >> + A  A  A  A  A  A  __mem_cgroup_threshold(memcg, true);
> >> +}
> >> +
> >> +static int compare_thresholds(const void *a, const void *b)
> >> +{
> >> + A  A  const struct mem_cgroup_threshold *_a = a;
> >> + A  A  const struct mem_cgroup_threshold *_b = b;
> >> +
> >> + A  A  return _a->threshold - _b->threshold;
> >> +}
> >> +
> >> +static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
> >> + A  A  A  A  A  A  struct eventfd_ctx *eventfd, const char *args)
> >> +{
> >> + A  A  struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> >> + A  A  struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
> >> + A  A  int type = MEMFILE_TYPE(cft->private);
> >> + A  A  u64 threshold, usage;
> >> + A  A  int size;
> >> + A  A  int i, ret;
> >> +
> >> + A  A  ret = res_counter_memparse_write_strategy(args, &threshold);
> >> + A  A  if (ret)
> >> + A  A  A  A  A  A  return ret;
> >> +
> >> + A  A  mutex_lock(&memcg->thresholds_lock);
> >> + A  A  if (type == _MEM)
> >> + A  A  A  A  A  A  thresholds = memcg->thresholds;
> >> + A  A  else if (type == _MEMSWAP)
> >> + A  A  A  A  A  A  thresholds = memcg->memsw_thresholds;
> >> + A  A  else
> >> + A  A  A  A  A  A  BUG();
> >> +
> >> + A  A  usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
> >> +
> >> + A  A  /* Check if a threshold crossed before adding a new one */
> >> + A  A  if (thresholds)
> >> + A  A  A  A  A  A  __mem_cgroup_threshold(memcg, type == _MEMSWAP);
> >> +
> >> + A  A  if (thresholds)
> >> + A  A  A  A  A  A  size = thresholds->size + 1;
> >> + A  A  else
> >> + A  A  A  A  A  A  size = 1;
> >> +
> >> + A  A  /* Allocate memory for new array of thresholds */
> >> + A  A  thresholds_new = kmalloc(sizeof(*thresholds_new) +
> >> + A  A  A  A  A  A  A  A  A  A  size * sizeof(struct mem_cgroup_threshold),
> >> + A  A  A  A  A  A  A  A  A  A  GFP_KERNEL);
> >> + A  A  if (!thresholds_new) {
> >> + A  A  A  A  A  A  ret = -ENOMEM;
> >> + A  A  A  A  A  A  goto unlock;
> >> + A  A  }
> >> + A  A  thresholds_new->size = size;
> >> +
> >> + A  A  /* Copy thresholds (if any) to new array */
> >> + A  A  if (thresholds)
> >> + A  A  A  A  A  A  memcpy(thresholds_new->entries, thresholds->entries,
> >> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  thresholds->size *
> >> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  sizeof(struct mem_cgroup_threshold));
> >> + A  A  /* Add new threshold */
> >> + A  A  thresholds_new->entries[size - 1].eventfd = eventfd;
> >> + A  A  thresholds_new->entries[size - 1].threshold = threshold;
> >> +
> >> + A  A  /* Sort thresholds. Registering of new threshold isn't time-critical */
> >> + A  A  sort(thresholds_new->entries, size,
> >> + A  A  A  A  A  A  A  A  A  A  sizeof(struct mem_cgroup_threshold),
> >> + A  A  A  A  A  A  A  A  A  A  compare_thresholds, NULL);
> >> +
> >> + A  A  /* Find current threshold */
> >> + A  A  atomic_set(&thresholds_new->cur, -1);
> >> + A  A  for(i = 0; i < size; i++) {
> >> + A  A  A  A  A  A  if (thresholds_new->entries[i].threshold < usage)
> >> + A  A  A  A  A  A  A  A  A  A  atomic_inc(&thresholds_new->cur);
> >> + A  A  }
> >> +
> >> + A  A  /*
> >> + A  A  A * We need to increment refcnt to be sure that all thresholds
> >> + A  A  A * will be unregistered before calling __mem_cgroup_free()
> >> + A  A  A */
> >> + A  A  mem_cgroup_get(memcg);
> >> +
> >> + A  A  if (type == _MEM)
> >> + A  A  A  A  A  A  rcu_assign_pointer(memcg->thresholds, thresholds_new);
> >> + A  A  else
> >> + A  A  A  A  A  A  rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
> >> +
> >> + A  A  synchronize_rcu();
> >
> > Could you add explanation when you use synchronize_rcu() ?
> 
> It is used before freeing the old array of thresholds to be sure that nobody uses it.
> 
> >> + A  A  kfree(thresholds);
> >
> > Can't this be freed by RCU instead of synchronize_rcu() ?
> 
> Yes, it can. But I don't think that (un)registering of thresholds is
> time critical.
> I think my variant is cleaner.
> 
I don't ;) But OK, this is a nitpick. Ignore me, but please add an
explanatory comment in the code.



> >> +unlock:
> >> + A  A  mutex_unlock(&memcg->thresholds_lock);
> >> +
> >> + A  A  return ret;
> >> +}
> >> +
> >> +static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
> >> + A  A  A  A  A  A  struct eventfd_ctx *eventfd)
> >> +{
> >> + A  A  struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> >> + A  A  struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
> >> + A  A  int type = MEMFILE_TYPE(cft->private);
> >> + A  A  u64 usage;
> >> + A  A  int size = 0;
> >> + A  A  int i, j, ret;
> >> +
> >> + A  A  mutex_lock(&memcg->thresholds_lock);
> >> + A  A  if (type == _MEM)
> >> + A  A  A  A  A  A  thresholds = memcg->thresholds;
> >> + A  A  else if (type == _MEMSWAP)
> >> + A  A  A  A  A  A  thresholds = memcg->memsw_thresholds;
> >> + A  A  else
> >> + A  A  A  A  A  A  BUG();
> >> +
> >> + A  A  /*
> >> + A  A  A * Something went wrong if we trying to unregister a threshold
> >> + A  A  A * if we don't have thresholds
> >> + A  A  A */
> >> + A  A  BUG_ON(!thresholds);
> >> +
> >> + A  A  usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
> >> +
> >> + A  A  /* Check if a threshold crossed before removing */
> >> + A  A  __mem_cgroup_threshold(memcg, type == _MEMSWAP);
> >> +
> >> + A  A  /* Calculate new number of threshold */
> >> + A  A  for(i = 0; i < thresholds->size; i++) {
> >> + A  A  A  A  A  A  if (thresholds->entries[i].eventfd != eventfd)
> >> + A  A  A  A  A  A  A  A  A  A  size++;
> >> + A  A  }
> >> +
> >> + A  A  /* Set thresholds array to NULL if we don't have thresholds */
> >> + A  A  if (!size) {
> >> + A  A  A  A  A  A  thresholds_new = NULL;
> >> + A  A  A  A  A  A  goto assign;
> >> + A  A  }
> >> +
> >> + A  A  /* Allocate memory for new array of thresholds */
> >> + A  A  thresholds_new = kmalloc(sizeof(*thresholds_new) +
> >> + A  A  A  A  A  A  A  A  A  A  size * sizeof(struct mem_cgroup_threshold),
> >> + A  A  A  A  A  A  A  A  A  A  GFP_KERNEL);
> >> + A  A  if (!thresholds_new) {
> >> + A  A  A  A  A  A  ret = -ENOMEM;
> >> + A  A  A  A  A  A  goto unlock;
> >> + A  A  }
> >> + A  A  thresholds_new->size = size;
> >> +
> >> + A  A  /* Copy thresholds and find current threshold */
> >> + A  A  atomic_set(&thresholds_new->cur, -1);
> >> + A  A  for(i = 0, j = 0; i < thresholds->size; i++) {
> >> + A  A  A  A  A  A  if (thresholds->entries[i].eventfd == eventfd)
> >> + A  A  A  A  A  A  A  A  A  A  continue;
> >> +
> >> + A  A  A  A  A  A  thresholds_new->entries[j] = thresholds->entries[i];
> >> + A  A  A  A  A  A  if (thresholds_new->entries[j].threshold < usage)
> >> + A  A  A  A  A  A  A  A  A  A  atomic_inc(&thresholds_new->cur);
> > It's better to do atomic set after loop.
> 
> We need one more counter to do this. Do you like it?
> 
Please add a comment explaining what "cur" is for, or use a better name.
Honestly, I don't fully understand how "cur" moves. I'm not sure
whether updating it at insert/delete is really necessary or not.


> >> + A  A  A  A  A  A  j++;
> >> + A  A  }
> >
> > Hmm..is this "copy array" usual coding style for handling eventfd ?
> 
> Since we store only pointer to struct eventfd_ctx, I don't see a problem.
> 
The following is just a suggestion after a brief look...

IMO, "cur" is not necessary in the 1st version.
Using a simple list and always doing a full scan would be good as a first step.
(And do the necessary optimization later.)
Then the size of the patch will be dramatically smaller.

I think the "cur" magic complicates the details too much.


Thanks,
-Kame



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2009-12-28  4:18 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-12-27  2:08 [PATCH v4 0/4] cgroup notifications API and " Kirill A. Shutemov
2009-12-27  2:08 ` [PATCH v4 1/4] cgroup: implement eventfd-based generic API for notifications Kirill A. Shutemov
2009-12-27  2:09   ` [PATCH v4 2/4] memcg: extract mem_group_usage() from mem_cgroup_read() Kirill A. Shutemov
2009-12-27  2:09     ` [PATCH v4 3/4] memcg: rework usage of stats by soft limit Kirill A. Shutemov
2009-12-27  2:09       ` [PATCH v4 4/4] memcg: implement memory thresholds Kirill A. Shutemov
2009-12-28  2:43         ` KAMEZAWA Hiroyuki
2009-12-28  3:23           ` Kirill A. Shutemov
2009-12-28  4:14             ` KAMEZAWA Hiroyuki [this message]
2009-12-28  4:42         ` Daisuke Nishimura
2009-12-30 13:03           ` Kirill A. Shutemov
2009-12-28  2:28       ` [PATCH v4 3/4] memcg: rework usage of stats by soft limit KAMEZAWA Hiroyuki
2009-12-28  2:37       ` Daisuke Nishimura
2009-12-28  2:30     ` [PATCH v4 2/4] memcg: extract mem_group_usage() from mem_cgroup_read() KAMEZAWA Hiroyuki
2009-12-28  2:31   ` [PATCH v4 1/4] cgroup: implement eventfd-based generic API for notifications KAMEZAWA Hiroyuki
2009-12-27 12:47 ` [PATCH v4 0/4] cgroup notifications API and memory thresholds Balbir Singh
2009-12-27 18:37   ` Kirill A. Shutemov
2010-01-04  0:15     ` Balbir Singh
2009-12-28  2:27 ` KAMEZAWA Hiroyuki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091228131440.3a49a943.kamezawa.hiroyu@jp.fujitsu.com \
    --to=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=containers@lists.linux-foundation.org \
    --cc=dan@embeddedalley.com \
    --cc=kirill@shutemov.name \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizf@cn.fujitsu.com \
    --cc=menage@google.com \
    --cc=nishimura@mxp.nes.nec.co.jp \
    --cc=vbuzov@embeddedalley.com \
    --cc=virtuoso@slind.org \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox