linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Balbir Singh <balbir@linux.vnet.ibm.com>
To: Ying Han <yinghan@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mel Gorman <mel@csn.ul.ie>, Johannes Weiner <hannes@cmpxchg.org>,
	Christoph Lameter <cl@linux.com>,
	Wu Fengguang <fengguang.wu@intel.com>,
	Andi Kleen <ak@linux.intel.com>, Hugh Dickins <hughd@google.com>,
	Rik van Riel <riel@redhat.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Tejun Heo <tj@kernel.org>,
	linux-mm@kvack.org
Subject: Re: [PATCH 1/4] Add kswapd descriptor.
Date: Tue, 7 Dec 2010 12:22:42 +0530	[thread overview]
Message-ID: <20101207065242.GI3158@balbir.in.ibm.com> (raw)
In-Reply-To: <1291099785-5433-2-git-send-email-yinghan@google.com>

* Ying Han <yinghan@google.com> [2010-11-29 22:49:42]:

> There is a kswapd kernel thread for each memory node. We add a different kswapd
> for each cgroup.

Could you please elaborate on this, what is adding? creating a thread?

The kswapd is sleeping in the wait queue headed at kswapd_wait
> field of a kswapd descriptor. The kswapd descriptor stores information of node
> or cgroup and it allows the global and per cgroup background reclaim to share
> common reclaim algorithms.
> 
> This patch addes the kswapd descriptor and changes per zone kswapd_wait to the
> common data structure.
> 
> Signed-off-by: Ying Han <yinghan@google.com>
> ---

The performance data you posted earlier is helpful, do you have any
additional insights on the the CPU overheads if any?

My general overall comment is that this patch needs to be refactored
to bring out the change the patch makes.

>  include/linux/mmzone.h |    3 +-
>  include/linux/swap.h   |   10 +++++
>  mm/memcontrol.c        |    2 +
>  mm/mmzone.c            |    2 +-
>  mm/page_alloc.c        |    9 +++-
>  mm/vmscan.c            |   98 +++++++++++++++++++++++++++++++++--------------
>  6 files changed, 90 insertions(+), 34 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 39c24eb..c77dfa2 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -642,8 +642,7 @@ typedef struct pglist_data {
>  	unsigned long node_spanned_pages; /* total size of physical page
>  					     range, including holes */
>  	int node_id;
> -	wait_queue_head_t kswapd_wait;
> -	struct task_struct *kswapd;
> +	wait_queue_head_t *kswapd_wait;
>  	int kswapd_max_order;
>  } pg_data_t;
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index eba53e7..2e6cb58 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -26,6 +26,16 @@ static inline int current_is_kswapd(void)
>  	return current->flags & PF_KSWAPD;
>  }
> 
> +struct kswapd {
> +	struct task_struct *kswapd_task;
> +	wait_queue_head_t kswapd_wait;
> +	struct mem_cgroup *kswapd_mem;

Is this field being used anywhere in this patch?

> +	pg_data_t *kswapd_pgdat;
> +};
> +
> +#define MAX_KSWAPDS MAX_NUMNODES
> +extern struct kswapd kswapds[MAX_KSWAPDS];
> +int kswapd(void *p);
>  /*
>   * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
>   * be swapped to.  The swap type and the offset into that swap type are
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a4034b6..dca3590 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -263,6 +263,8 @@ struct mem_cgroup {
>  	 */
>  	struct mem_cgroup_stat_cpu nocpu_base;
>  	spinlock_t pcp_counter_lock;
> +
> +	wait_queue_head_t *kswapd_wait;
>  };
> 
>  /* Stuffs for move charges at task migration. */
> diff --git a/mm/mmzone.c b/mm/mmzone.c
> index e35bfb8..c7cbed5 100644
> --- a/mm/mmzone.c
> +++ b/mm/mmzone.c
> @@ -102,7 +102,7 @@ unsigned long zone_nr_free_pages(struct zone *zone)
>  	 * free pages are low, get a better estimate for free pages
>  	 */
>  	if (nr_free_pages < zone->percpu_drift_mark &&
> -			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
> +			!waitqueue_active(zone->zone_pgdat->kswapd_wait))
>  		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
> 
>  	return nr_free_pages;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b48dea2..a15bc1c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4070,13 +4070,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>  	int nid = pgdat->node_id;
>  	unsigned long zone_start_pfn = pgdat->node_start_pfn;
>  	int ret;
> +	struct kswapd *kswapd_p;

_p is sort of ugly, do we really need it?

> 
>  	pgdat_resize_init(pgdat);
>  	pgdat->nr_zones = 0;
> -	init_waitqueue_head(&pgdat->kswapd_wait);
>  	pgdat->kswapd_max_order = 0;
>  	pgdat_page_cgroup_init(pgdat);
> -	

Thanks for the whitspace cleanup, but I don't know if that should be
done here.

> +
> +	kswapd_p = &kswapds[nid];
> +	init_waitqueue_head(&kswapd_p->kswapd_wait);
> +	pgdat->kswapd_wait = &kswapd_p->kswapd_wait;
> +	kswapd_p->kswapd_pgdat = pgdat;
> +
>  	for (j = 0; j < MAX_NR_ZONES; j++) {
>  		struct zone *zone = pgdat->node_zones + j;
>  		unsigned long size, realsize, memmap_pages;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b8a6fdc..e08005e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2115,12 +2115,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
> 
>  	return nr_reclaimed;
>  }
> +
>  #endif
> 
> +DEFINE_SPINLOCK(kswapds_spinlock);
> +struct kswapd kswapds[MAX_KSWAPDS];
> +
>  /* is kswapd sleeping prematurely? */
> -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
> +static int sleeping_prematurely(struct kswapd *kswapd, int order,
> +				long remaining)
>  {
>  	int i;
> +	pg_data_t *pgdat = kswapd->kswapd_pgdat;
> 
>  	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
>  	if (remaining)
> @@ -2377,21 +2383,28 @@ out:
>   * If there are applications that are active memory-allocators
>   * (most normal use), this basically shouldn't matter.
>   */
> -static int kswapd(void *p)
> +int kswapd(void *p)
>  {
>  	unsigned long order;
> -	pg_data_t *pgdat = (pg_data_t*)p;
> +	struct kswapd *kswapd_p = (struct kswapd *)p;
> +	pg_data_t *pgdat = kswapd_p->kswapd_pgdat;
> +	struct mem_cgroup *mem = kswapd_p->kswapd_mem;

Do we use mem anywhere?

> +	wait_queue_head_t *wait_h = &kswapd_p->kswapd_wait;

_p, _h almost look like hungarian notation in reverse :)

>  	struct task_struct *tsk = current;
>  	DEFINE_WAIT(wait);
>  	struct reclaim_state reclaim_state = {
>  		.reclaimed_slab = 0,
>  	};
> -	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> +	const struct cpumask *cpumask;
> 
>  	lockdep_set_current_reclaim_state(GFP_KERNEL);
> 
> -	if (!cpumask_empty(cpumask))
> -		set_cpus_allowed_ptr(tsk, cpumask);
> +	if (pgdat) {
> +		BUG_ON(pgdat->kswapd_wait != wait_h);
> +		cpumask = cpumask_of_node(pgdat->node_id);
> +		if (!cpumask_empty(cpumask))
> +			set_cpus_allowed_ptr(tsk, cpumask);
> +	}
>  	current->reclaim_state = &reclaim_state;
> 
>  	/*
> @@ -2414,9 +2427,13 @@ static int kswapd(void *p)
>  		unsigned long new_order;
>  		int ret;
> 
> -		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
> -		new_order = pgdat->kswapd_max_order;
> -		pgdat->kswapd_max_order = 0;
> +		prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
> +		if (pgdat) {
> +			new_order = pgdat->kswapd_max_order;
> +			pgdat->kswapd_max_order = 0;
> +		} else
> +			new_order = 0;
> +
>  		if (order < new_order) {
>  			/*
>  			 * Don't sleep if someone wants a larger 'order'
> @@ -2428,10 +2445,12 @@ static int kswapd(void *p)
>  				long remaining = 0;
> 
>  				/* Try to sleep for a short interval */
> -				if (!sleeping_prematurely(pgdat, order, remaining)) {
> +				if (!sleeping_prematurely(kswapd_p, order,
> +							remaining)) {
>  					remaining = schedule_timeout(HZ/10);
> -					finish_wait(&pgdat->kswapd_wait, &wait);
> -					prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
> +					finish_wait(wait_h, &wait);
> +					prepare_to_wait(wait_h, &wait,
> +							TASK_INTERRUPTIBLE);
>  				}
> 
>  				/*
> @@ -2439,20 +2458,25 @@ static int kswapd(void *p)
>  				 * premature sleep. If not, then go fully
>  				 * to sleep until explicitly woken up
>  				 */
> -				if (!sleeping_prematurely(pgdat, order, remaining)) {
> -					trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
> +				if (!sleeping_prematurely(kswapd_p, order,
> +								remaining)) {
> +					if (pgdat)
> +						trace_mm_vmscan_kswapd_sleep(
> +								pgdat->node_id);
>  					schedule();
>  				} else {
>  					if (remaining)
> -						count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
> +						count_vm_event(
> +						KSWAPD_LOW_WMARK_HIT_QUICKLY);
>  					else
> -						count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
> +						count_vm_event(
> +						KSWAPD_HIGH_WMARK_HIT_QUICKLY);

Sorry, but the coding style hits me here do we really need to change
this?

>  				}
>  			}
> -
> -			order = pgdat->kswapd_max_order;
> +			if (pgdat)
> +				order = pgdat->kswapd_max_order;
>  		}
> -		finish_wait(&pgdat->kswapd_wait, &wait);
> +		finish_wait(wait_h, &wait);
> 
>  		ret = try_to_freeze();
>  		if (kthread_should_stop())
> @@ -2476,6 +2500,7 @@ static int kswapd(void *p)
>  void wakeup_kswapd(struct zone *zone, int order)
>  {
>  	pg_data_t *pgdat;
> +	wait_queue_head_t *wait;
> 
>  	if (!populated_zone(zone))
>  		return;
> @@ -2488,9 +2513,10 @@ void wakeup_kswapd(struct zone *zone, int order)
>  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
>  	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
>  		return;
> -	if (!waitqueue_active(&pgdat->kswapd_wait))
> +	wait = pgdat->kswapd_wait;
> +	if (!waitqueue_active(wait))
>  		return;
> -	wake_up_interruptible(&pgdat->kswapd_wait);
> +	wake_up_interruptible(wait);
>  }
> 
>  /*
> @@ -2587,7 +2613,10 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
> 
>  			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
>  				/* One of our CPUs online: restore mask */
> -				set_cpus_allowed_ptr(pgdat->kswapd, mask);
> +				if (kswapds[nid].kswapd_task)
> +					set_cpus_allowed_ptr(
> +						kswapds[nid].kswapd_task,
> +						mask);
>  		}
>  	}
>  	return NOTIFY_OK;
> @@ -2599,19 +2628,20 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
>   */
>  int kswapd_run(int nid)
>  {
> -	pg_data_t *pgdat = NODE_DATA(nid);
> +	struct task_struct *thr;

thr is an ugly name for task_struct instance

>  	int ret = 0;
> 
> -	if (pgdat->kswapd)
> +	if (kswapds[nid].kswapd_task)
>  		return 0;
> 
> -	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
> -	if (IS_ERR(pgdat->kswapd)) {
> +	thr = kthread_run(kswapd, &kswapds[nid], "kswapd%d", nid);
> +	if (IS_ERR(thr)) {
>  		/* failure at boot is fatal */
>  		BUG_ON(system_state == SYSTEM_BOOTING);
>  		printk("Failed to start kswapd on node %d\n",nid);
>  		ret = -1;

What happens to the threads started?

>  	}
> +	kswapds[nid].kswapd_task = thr;
>  	return ret;
>  }
> 
> @@ -2620,10 +2650,20 @@ int kswapd_run(int nid)
>   */
>  void kswapd_stop(int nid)
>  {
> -	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
> +	struct task_struct *thr;
> +	struct kswapd *kswapd_p;
> +	wait_queue_head_t *wait;
> +
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +
> +	spin_lock(&kswapds_spinlock);
> +	wait = pgdat->kswapd_wait;
> +	kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
> +	thr = kswapd_p->kswapd_task;

Sorry, but thr is just an ugly name to use.

> +	spin_unlock(&kswapds_spinlock);
> 
> -	if (kswapd)
> -		kthread_stop(kswapd);
> +	if (thr)
> +		kthread_stop(thr);
>  }
> 
>  static int __init kswapd_init(void)
> -- 
> 1.7.3.1
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
	Three Cheers,
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2010-12-07  6:52 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-11-30  6:49 [RFC][PATCH 0/4] memcg: per cgroup background reclaim Ying Han
2010-11-30  6:49 ` [PATCH 1/4] Add kswapd descriptor Ying Han
2010-11-30  7:08   ` KAMEZAWA Hiroyuki
2010-11-30  8:15     ` Minchan Kim
2010-11-30  8:27       ` KAMEZAWA Hiroyuki
2010-11-30  8:54         ` KAMEZAWA Hiroyuki
2010-11-30 20:40           ` Ying Han
2010-11-30 23:46             ` KAMEZAWA Hiroyuki
2010-12-07  6:15           ` Balbir Singh
2010-12-07  6:24             ` KAMEZAWA Hiroyuki
2010-12-07  6:59               ` Balbir Singh
2010-12-07  8:00                 ` KAMEZAWA Hiroyuki
2010-11-30 20:26       ` Ying Han
2010-11-30 20:17     ` Ying Han
2010-12-01  0:12       ` KAMEZAWA Hiroyuki
2010-12-07  6:52   ` Balbir Singh [this message]
2010-12-07 19:21     ` Ying Han
2010-12-07 12:33   ` Mel Gorman
2010-12-07 17:28     ` Ying Han
2010-12-08  0:39       ` KAMEZAWA Hiroyuki
2010-12-08  1:24         ` Ying Han
2010-12-08  1:28           ` KAMEZAWA Hiroyuki
2010-12-08  2:10             ` Ying Han
2010-12-08  2:13               ` KAMEZAWA Hiroyuki
2010-12-08 12:19           ` Mel Gorman
2010-12-08  7:21       ` KOSAKI Motohiro
2010-12-07 18:50     ` Ying Han
2010-12-08  7:22     ` KOSAKI Motohiro
2010-12-08  7:37       ` KAMEZAWA Hiroyuki
2010-11-30  6:49 ` [PATCH 2/4] Add per cgroup reclaim watermarks Ying Han
2010-11-30  7:21   ` KAMEZAWA Hiroyuki
2010-11-30 20:44     ` Ying Han
2010-12-01  0:27       ` KAMEZAWA Hiroyuki
2010-12-07 14:56   ` Mel Gorman
2010-11-30  6:49 ` [PATCH 3/4] Per cgroup background reclaim Ying Han
2010-11-30  7:51   ` KAMEZAWA Hiroyuki
2010-11-30  8:07     ` KAMEZAWA Hiroyuki
2010-11-30 22:01       ` Ying Han
2010-11-30 22:00     ` Ying Han
2010-12-07  2:25     ` Ying Han
2010-12-07  5:21       ` KAMEZAWA Hiroyuki
2010-12-01  2:18   ` KOSAKI Motohiro
2010-12-01  2:16     ` KAMEZAWA Hiroyuki
2010-11-30  6:49 ` [PATCH 4/4] Add more per memcg stats Ying Han
2010-11-30  7:53   ` KAMEZAWA Hiroyuki
2010-11-30 18:22     ` Ying Han
2010-11-30  6:54 ` [RFC][PATCH 0/4] memcg: per cgroup background reclaim KOSAKI Motohiro
2010-11-30  7:03   ` Ying Han
2010-12-02 14:41     ` Balbir Singh
2010-12-07  2:29       ` Ying Han
2010-11-30  7:00 ` KAMEZAWA Hiroyuki
2010-11-30  9:05   ` Ying Han

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20101207065242.GI3158@balbir.in.ibm.com \
    --to=balbir@linux.vnet.ibm.com \
    --cc=ak@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=cl@linux.com \
    --cc=fengguang.wu@intel.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=linux-mm@kvack.org \
    --cc=mel@csn.ul.ie \
    --cc=nishimura@mxp.nes.nec.co.jp \
    --cc=riel@redhat.com \
    --cc=tj@kernel.org \
    --cc=yinghan@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox