linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Johannes Weiner <hannes@cmpxchg.org>
To: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: linux-mm@kvack.org, cgroups@vger.kernel.org,
	akpm@linux-foundation.org, Tejun Heo <tj@kernel.org>,
	Jiufei Xue <jiufei.xue@linux.alibaba.com>,
	Caspar Zhang <caspar@linux.alibaba.com>
Subject: Re: [RFC PATCH 2/3] psi: cgroup v1 support
Date: Tue, 4 Jun 2019 07:55:19 -0400	[thread overview]
Message-ID: <20190604115519.GA18545@cmpxchg.org> (raw)
In-Reply-To: <20190604015745.78972-3-joseph.qi@linux.alibaba.com>

On Tue, Jun 04, 2019 at 09:57:44AM +0800, Joseph Qi wrote:
> Implements pressure stall tracking for cgroup v1.
> 
> Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
> ---
>  kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 56 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 7acc632c3b82..909083c828d5 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -719,13 +719,30 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
>  	return state_mask;
>  }
>  
> -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> +static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
> +{
> +	switch (res) {
> +	case NR_PSI_RESOURCES:
> +		return task_dfl_cgroup(task);
> +	case PSI_IO:
> +		return task_cgroup(task, io_cgrp_subsys.id);
> +	case PSI_MEM:
> +		return task_cgroup(task, memory_cgrp_subsys.id);
> +	case PSI_CPU:
> +		return task_cgroup(task, cpu_cgrp_subsys.id);
> +	default:  /* won't reach here */
> +		return NULL;
> +	}
> +}
> +
> +static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
> +					enum psi_res res)
>  {
>  #ifdef CONFIG_CGROUPS
>  	struct cgroup *cgroup = NULL;
>  
>  	if (!*iter)
> -		cgroup = task->cgroups->dfl_cgrp;
> +		cgroup = psi_task_cgroup(task, res);
>  	else if (*iter == &psi_system)
>  		return NULL;
>  	else
> @@ -776,15 +793,45 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>  		     wq_worker_last_func(task) == psi_avgs_work))
>  		wake_clock = false;
>  
> -	while ((group = iterate_groups(task, &iter))) {
> -		u32 state_mask = psi_group_change(group, cpu, clear, set);
> +	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
> +	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
> +	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
> +		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
> +			u32 state_mask = psi_group_change(group, cpu, clear, set);
>  
> -		if (state_mask & group->poll_states)
> -			psi_schedule_poll_work(group, 1);
> +			if (state_mask & group->poll_states)
> +				psi_schedule_poll_work(group, 1);
>  
> -		if (wake_clock && !delayed_work_pending(&group->avgs_work))
> -			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> +			if (wake_clock && !delayed_work_pending(&group->avgs_work))
> +				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> +		}
> +	} else {
> +		enum psi_task_count i;
> +		enum psi_res res;
> +		int psi_flags = clear | set;
> +
> +		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
> +			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
> +				res = PSI_IO;
> +			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
> +				res = PSI_MEM;
> +			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
> +				res = PSI_CPU;
> +			else
> +				continue;
> +
> +			while ((group = iterate_groups(task, &iter, res))) {
> +				u32 state_mask = psi_group_change(group, cpu, clear, set);

This doesn't work. Each resource state is composed of all possible
task states:

static bool test_state(unsigned int *tasks, enum psi_states state)
{
	switch (state) {
	case PSI_IO_SOME:
		return tasks[NR_IOWAIT];
	case PSI_IO_FULL:
		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
	case PSI_MEM_SOME:
		return tasks[NR_MEMSTALL];
	case PSI_MEM_FULL:
		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
	case PSI_CPU_SOME:
		return tasks[NR_RUNNING] > 1;
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
	default:
		return false;
	}
}

So the IO controller needs to know of NR_RUNNING to tell some vs full,
the memory controller needs to know of NR_IOWAIT to tell nonidle etc.

You need to run the full psi task tracking and aggregation machinery
separately for each of the different cgroups a task can be in in v1.

Needless to say, that is expensive. For cpu, memory and io, it's
triple the scheduling overhead with three ancestor walks and three
times the cache footprint; three times more aggregation workers every
two seconds... We could never turn this on per default.

Have you considered just co-mounting cgroup2, if for nothing else, to
get the pressure numbers?


  reply	other threads:[~2019-06-04 11:55 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-04  1:57 [RFC PATCH 0/3] psi: support cgroup v1 Joseph Qi
2019-06-04  1:57 ` [RFC PATCH 1/3] psi: make cgroup psi helpers public Joseph Qi
2019-06-04  1:57 ` [RFC PATCH 2/3] psi: cgroup v1 support Joseph Qi
2019-06-04 11:55   ` Johannes Weiner [this message]
2019-06-05  1:15     ` Joseph Qi
2019-06-04  1:57 ` [RFC PATCH 3/3] psi: add cgroup v1 interfaces Joseph Qi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190604115519.GA18545@cmpxchg.org \
    --to=hannes@cmpxchg.org \
    --cc=akpm@linux-foundation.org \
    --cc=caspar@linux.alibaba.com \
    --cc=cgroups@vger.kernel.org \
    --cc=jiufei.xue@linux.alibaba.com \
    --cc=joseph.qi@linux.alibaba.com \
    --cc=linux-mm@kvack.org \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox