Re: [patch 09/18] oom: select task from tasklist for mempolicy ooms

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
To: David Rientjes <rientjes@google.com>
Cc: kosaki.motohiro@jp.fujitsu.com,
	Andrew Morton <akpm@linux-foundation.org>,
	Rik van Riel <riel@redhat.com>, Nick Piggin <npiggin@suse.de>,
	Oleg Nesterov <oleg@redhat.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>,
	linux-mm@kvack.org
Subject: Re: [patch 09/18] oom: select task from tasklist for mempolicy ooms
Date: Tue,  8 Jun 2010 20:41:59 +0900 (JST)	[thread overview]
Message-ID: <20100608203529.7669.A69D9226@jp.fujitsu.com> (raw)
In-Reply-To: <alpine.DEB.2.00.1006061525000.32225@chino.kir.corp.google.com>

> The oom killer presently kills current whenever there is no more memory
> free or reclaimable on its mempolicy's nodes.  There is no guarantee that
> current is a memory-hogging task or that killing it will free any
> substantial amount of memory, however.
> 
> In such situations, it is better to scan the tasklist for nodes that are
> allowed to allocate on current's set of nodes and kill the task with the
> highest badness() score.  This ensures that the most memory-hogging task,
> or the one configured by the user with /proc/pid/oom_adj, is always
> selected in such scenarios.
> 
> Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Signed-off-by: David Rientjes <rientjes@google.com>
> ---
>  include/linux/mempolicy.h |   13 +++++++-
>  mm/mempolicy.c            |   44 ++++++++++++++++++++++++
>  mm/oom_kill.c             |   80 +++++++++++++++++++++++++++-----------------
>  3 files changed, 105 insertions(+), 32 deletions(-)
> 
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -210,6 +210,8 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
>  				unsigned long addr, gfp_t gfp_flags,
>  				struct mempolicy **mpol, nodemask_t **nodemask);
>  extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
> +extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
> +				const nodemask_t *mask);
>  extern unsigned slab_node(struct mempolicy *policy);
>  
>  extern enum zone_type policy_zone;
> @@ -338,7 +340,16 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
>  	return node_zonelist(0, gfp_flags);
>  }
>  
> -static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; }
> +static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
> +{
> +	return false;
> +}
> +
> +static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
> +			const nodemask_t *mask)
> +{
> +	return false;
> +}
>  
>  static inline int do_migrate_pages(struct mm_struct *mm,
>  			const nodemask_t *from_nodes,
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1712,6 +1712,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
>  }
>  #endif
>  
> +/*
> + * mempolicy_nodemask_intersects
> + *
> + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
> + * policy.  Otherwise, check for intersection between mask and the policy
> + * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
> + * policy, always return true since it may allocate elsewhere on fallback.
> + *
> + * Takes task_lock(tsk) to prevent freeing of its mempolicy.
> + */
> +bool mempolicy_nodemask_intersects(struct task_struct *tsk,
> +					const nodemask_t *mask)
> +{
> +	struct mempolicy *mempolicy;
> +	bool ret = true;
> +
> +	if (!mask)
> +		return ret;
> +	task_lock(tsk);
> +	mempolicy = tsk->mempolicy;
> +	if (!mempolicy)
> +		goto out;
> +
> +	switch (mempolicy->mode) {
> +	case MPOL_PREFERRED:
> +		/*
> +		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
> +		 * allocate from, they may fallback to other nodes when oom.
> +		 * Thus, it's possible for tsk to have allocated memory from
> +		 * nodes in mask.
> +		 */
> +		break;
> +	case MPOL_BIND:
> +	case MPOL_INTERLEAVE:
> +		ret = nodes_intersects(mempolicy->v.nodes, *mask);
> +		break;
> +	default:
> +		BUG();
> +	}
> +out:
> +	task_unlock(tsk);
> +	return ret;
> +}
> +
>  /* Allocate a page in interleaved policy.
>     Own path because it needs to do special accounting. */
>  static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -27,6 +27,7 @@
>  #include <linux/module.h>
>  #include <linux/notifier.h>
>  #include <linux/memcontrol.h>
> +#include <linux/mempolicy.h>
>  #include <linux/security.h>
>  
>  int sysctl_panic_on_oom;
> @@ -36,20 +37,36 @@ static DEFINE_SPINLOCK(zone_scan_lock);
>  /* #define DEBUG */
>  
>  /*
> - * Is all threads of the target process nodes overlap ours?
> + * Do all threads of the target process overlap our allowed nodes?
> + * @tsk: task struct of which task to consider
> + * @mask: nodemask passed to page allocator for mempolicy ooms
>   */
> -static int has_intersects_mems_allowed(struct task_struct *tsk)
> +static bool has_intersects_mems_allowed(struct task_struct *tsk,
> +					const nodemask_t *mask)
>  {
> -	struct task_struct *t;
> +	struct task_struct *start = tsk;
>  
> -	t = tsk;
>  	do {
> -		if (cpuset_mems_allowed_intersects(current, t))
> -			return 1;
> -		t = next_thread(t);
> -	} while (t != tsk);
> -
> -	return 0;
> +		if (mask) {
> +			/*
> +			 * If this is a mempolicy constrained oom, tsk's
> +			 * cpuset is irrelevant.  Only return true if its
> +			 * mempolicy intersects current, otherwise it may be
> +			 * needlessly killed.
> +			 */
> +			if (mempolicy_nodemask_intersects(tsk, mask))
> +				return true;
> +		} else {
> +			/*
> +			 * This is not a mempolicy constrained oom, so only
> +			 * check the mems of tsk's cpuset.
> +			 */
> +			if (cpuset_mems_allowed_intersects(current, tsk))
> +				return true;
> +		}
> +		tsk = next_thread(tsk);
> +	} while (tsk != start);
> +	return false;
>  }
>  
>  static struct task_struct *find_lock_task_mm(struct task_struct *p)
> @@ -253,7 +270,8 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
>   * (not docbooked, we don't want this one cluttering up the manual)
>   */
>  static struct task_struct *select_bad_process(unsigned long *ppoints,
> -						struct mem_cgroup *mem)
> +		struct mem_cgroup *mem, enum oom_constraint constraint,
> +		const nodemask_t *mask)
>  {
>  	struct task_struct *p;
>  	struct task_struct *chosen = NULL;
> @@ -269,7 +287,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
>  			continue;
>  		if (mem && !task_in_mem_cgroup(p, mem))
>  			continue;
> -		if (!has_intersects_mems_allowed(p))
> +		if (!has_intersects_mems_allowed(p,
> +				constraint == CONSTRAINT_MEMORY_POLICY ? mask :
> +									 NULL))
>  			continue;
>  
>  		/*
> @@ -495,7 +515,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
>  		panic("out of memory(memcg). panic_on_oom is selected.\n");
>  	read_lock(&tasklist_lock);
>  retry:
> -	p = select_bad_process(&points, mem);
> +	p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL);
>  	if (!p || PTR_ERR(p) == -1UL)
>  		goto out;
>  
> @@ -574,7 +594,8 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
>  /*
>   * Must be called with tasklist_lock held for read.
>   */
> -static void __out_of_memory(gfp_t gfp_mask, int order)
> +static void __out_of_memory(gfp_t gfp_mask, int order,
> +			enum oom_constraint constraint, const nodemask_t *mask)
>  {
>  	struct task_struct *p;
>  	unsigned long points;
> @@ -588,7 +609,7 @@ retry:
>  	 * Rambo mode: Shoot down a process and hope it solves whatever
>  	 * issues we may have.
>  	 */
> -	p = select_bad_process(&points, NULL);
> +	p = select_bad_process(&points, NULL, constraint, mask);
>  
>  	if (PTR_ERR(p) == -1UL)
>  		return;
> @@ -622,7 +643,8 @@ void pagefault_out_of_memory(void)
>  		panic("out of memory from page fault. panic_on_oom is selected.\n");
>  
>  	read_lock(&tasklist_lock);
> -	__out_of_memory(0, 0); /* unknown gfp_mask and order */
> +	/* unknown gfp_mask and order */
> +	__out_of_memory(0, 0, CONSTRAINT_NONE, NULL);
>  	read_unlock(&tasklist_lock);
>  
>  	/*
> @@ -638,6 +660,7 @@ void pagefault_out_of_memory(void)
>   * @zonelist: zonelist pointer
>   * @gfp_mask: memory allocation flags
>   * @order: amount of memory being requested as a power of 2
> + * @nodemask: nodemask passed to page allocator
>   *
>   * If we run out of memory, we have the choice between either
>   * killing a random task (bad), letting the system crash (worse)
> @@ -676,24 +699,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
>  	 */
>  	constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
>  	read_lock(&tasklist_lock);
> -
> -	switch (constraint) {
> -	case CONSTRAINT_MEMORY_POLICY:
> -		oom_kill_process(current, gfp_mask, order, 0, NULL,
> -				"No available memory (MPOL_BIND)");
> -		break;
> -
> -	case CONSTRAINT_NONE:
> -		if (sysctl_panic_on_oom) {
> +	if (unlikely(sysctl_panic_on_oom)) {
> +		/*
> +		 * panic_on_oom only affects CONSTRAINT_NONE, the kernel
> +		 * should not panic for cpuset or mempolicy induced memory
> +		 * failures.
> +		 */
> +		if (constraint == CONSTRAINT_NONE) {
>  			dump_header(NULL, gfp_mask, order, NULL);
> -			panic("out of memory. panic_on_oom is selected\n");
> +			read_unlock(&tasklist_lock);
> +			panic("Out of memory: panic_on_oom is enabled\n");
>  		}
> -		/* Fall-through */
> -	case CONSTRAINT_CPUSET:
> -		__out_of_memory(gfp_mask, order);
> -		break;
>  	}
> -
> +	__out_of_memory(gfp_mask, order, constraint, nodemask);
>  	read_unlock(&tasklist_lock);
>  
>  	/*

pulled.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2010-06-08 11:42 UTC|newest]

Thread overview: 104+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-06-06 22:33 [patch 00/18] oom killer rewrite David Rientjes
2010-06-06 22:34 ` [patch 01/18] oom: check PF_KTHREAD instead of !mm to skip kthreads David Rientjes
2010-06-07 12:12   ` Balbir Singh
2010-06-07 19:50     ` David Rientjes
2010-06-08 19:33   ` Andrew Morton
2010-06-08 23:40     ` David Rientjes
2010-06-08 23:52       ` Andrew Morton
2010-06-06 22:34 ` [patch 02/18] oom: introduce find_lock_task_mm() to fix !mm false positives David Rientjes
2010-06-07 12:58   ` Balbir Singh
2010-06-07 13:49     ` Minchan Kim
2010-06-07 19:49       ` David Rientjes
2010-06-08 19:42   ` Andrew Morton
2010-06-08 20:14     ` Oleg Nesterov
2010-06-08 20:17       ` Oleg Nesterov
2010-06-08 21:34         ` Andrew Morton
2010-06-08 23:50     ` David Rientjes
2010-06-06 22:34 ` [patch 03/18] oom: dump_tasks use find_lock_task_mm too David Rientjes
2010-06-08 19:55   ` Andrew Morton
2010-06-09  0:06     ` David Rientjes
2010-06-06 22:34 ` [patch 04/18] oom: PF_EXITING check should take mm into account David Rientjes
2010-06-08 20:00   ` Andrew Morton
2010-06-06 22:34 ` [patch 05/18] oom: give current access to memory reserves if it has been killed David Rientjes
2010-06-08 11:41   ` KOSAKI Motohiro
2010-06-08 18:47     ` David Rientjes
2010-06-14 11:08       ` KOSAKI Motohiro
2010-06-08 20:12     ` Andrew Morton
2010-06-13 11:24       ` KOSAKI Motohiro
2010-06-08 20:08   ` Andrew Morton
2010-06-09  0:14     ` David Rientjes
2010-06-06 22:34 ` [patch 06/18] oom: avoid sending exiting tasks a SIGKILL David Rientjes
2010-06-08 11:41   ` KOSAKI Motohiro
2010-06-08 18:48     ` David Rientjes
2010-06-08 20:17   ` Andrew Morton
2010-06-08 20:26   ` Oleg Nesterov
2010-06-09  6:32     ` David Rientjes
2010-06-09 16:25       ` Oleg Nesterov
2010-06-09 19:44         ` David Rientjes
2010-06-09 20:14           ` Oleg Nesterov
2010-06-10  0:15             ` KAMEZAWA Hiroyuki
2010-06-10  1:21               ` Oleg Nesterov
2010-06-10  1:43                 ` KAMEZAWA Hiroyuki
2010-06-10  1:51                   ` Oleg Nesterov
2010-06-06 22:34 ` [patch 07/18] oom: filter tasks not sharing the same cpuset David Rientjes
2010-06-08 11:41   ` KOSAKI Motohiro
2010-06-08 18:51     ` David Rientjes
2010-06-08 19:27       ` Andrew Morton
2010-06-13 11:24         ` KOSAKI Motohiro
2010-07-02 22:35           ` Andrew Morton
2010-07-04 22:08             ` David Rientjes
2010-07-09  3:00             ` KOSAKI Motohiro
2010-06-08 20:23   ` Andrew Morton
2010-06-09  0:25     ` David Rientjes
2010-06-06 22:34 ` [patch 08/18] oom: sacrifice child with highest badness score for parent David Rientjes
2010-06-08 11:41   ` KOSAKI Motohiro
2010-06-08 18:53     ` David Rientjes
2010-06-08 20:33   ` Andrew Morton
2010-06-09  0:30     ` David Rientjes
2010-06-06 22:34 ` [patch 09/18] oom: select task from tasklist for mempolicy ooms David Rientjes
2010-06-08 11:41   ` KOSAKI Motohiro [this message]
2010-06-08 21:08   ` Andrew Morton
2010-06-08 21:17     ` Oleg Nesterov
2010-06-09  0:46     ` David Rientjes
2010-06-08 23:43   ` Andrew Morton
2010-06-09  0:40     ` David Rientjes
2010-06-06 22:34 ` [patch 10/18] oom: enable oom tasklist dump by default David Rientjes
2010-06-08 11:42   ` KOSAKI Motohiro
2010-06-08 18:56     ` David Rientjes
2010-06-08 21:13   ` Andrew Morton
2010-06-09  0:52     ` David Rientjes
2010-06-06 22:34 ` [patch 11/18] oom: avoid oom killer for lowmem allocations David Rientjes
2010-06-08 11:42   ` KOSAKI Motohiro
2010-06-08 21:19   ` Andrew Morton
2010-06-06 22:34 ` [patch 12/18] oom: extract panic helper function David Rientjes
2010-06-08 11:42   ` KOSAKI Motohiro
2010-06-06 22:34 ` [patch 13/18] oom: remove special handling for pagefault ooms David Rientjes
2010-06-08 11:42   ` KOSAKI Motohiro
2010-06-08 18:57     ` David Rientjes
2010-06-08 21:27   ` Andrew Morton
2010-06-06 22:34 ` [patch 14/18] oom: move sysctl declarations to oom.h David Rientjes
2010-06-08 11:42   ` KOSAKI Motohiro
2010-06-06 22:34 ` [patch 15/18] oom: remove unnecessary code and cleanup David Rientjes
2010-06-06 22:34 ` [patch 16/18] oom: badness heuristic rewrite David Rientjes
2010-06-08 11:41   ` KOSAKI Motohiro
2010-06-08 23:02     ` Andrew Morton
2010-06-13 11:24       ` KOSAKI Motohiro
2010-06-17  5:14       ` David Rientjes
2010-06-21 11:45         ` KOSAKI Motohiro
2010-06-21 20:47           ` David Rientjes
2010-06-30  9:26             ` KOSAKI Motohiro
2010-06-17  5:12     ` David Rientjes
2010-06-21 11:45       ` KOSAKI Motohiro
2010-06-08 22:58   ` Andrew Morton
2010-06-17  5:32     ` David Rientjes
2010-06-06 22:34 ` [patch 17/18] oom: add forkbomb penalty to badness heuristic David Rientjes
2010-06-08 11:41   ` KOSAKI Motohiro
2010-06-08 23:15   ` Andrew Morton
2010-06-06 22:35 ` [patch 18/18] oom: deprecate oom_adj tunable David Rientjes
2010-06-08 11:42   ` KOSAKI Motohiro
2010-06-08 19:00     ` David Rientjes
2010-06-08 23:18     ` Andrew Morton
2010-06-13 11:24       ` KOSAKI Motohiro
2010-06-17  3:36         ` David Rientjes
2010-06-21 11:45           ` KOSAKI Motohiro
2010-06-21 20:54             ` David Rientjes

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100608203529.7669.A69D9226@jp.fujitsu.com \
    --to=kosaki.motohiro@jp.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-mm@kvack.org \
    --cc=npiggin@suse.de \
    --cc=oleg@redhat.com \
    --cc=riel@redhat.com \
    --cc=rientjes@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox