linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [patch 1/2] oom: remove oom_disable_count
       [not found]                   ` <alpine.DEB.2.00.1108291611070.32495@chino.kir.corp.google.com>
@ 2011-08-30  7:43                     ` David Rientjes
  2011-08-30  7:43                       ` [patch 2/2] oom: fix race while temporarily setting current's oom_score_adj David Rientjes
  2011-08-30 15:28                       ` [patch 1/2] oom: remove oom_disable_count Oleg Nesterov
  0 siblings, 2 replies; 5+ messages in thread
From: David Rientjes @ 2011-08-30  7:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ying Han, Oleg Nesterov, KOSAKI Motohiro, linux-kernel, linux-mm

This removes mm->oom_disable_count entirely since it's unnecessary and
currently buggy.  The counter was intended to be per-process but it's
currently decremented in the exit path for each thread that exits, causing
it to underflow.

The count was originally intended to prevent oom killing threads that
share memory with threads that cannot be killed since it doesn't lead to
future memory freeing.  The counter could be fixed to represent all
threads sharing the same mm, but it's better to remove the count since:

 - it is possible that the OOM_DISABLE thread sharing memory with the
   victim is waiting on that thread to exit and will actually cause
   future memory freeing, and

 - there is no guarantee that a thread is disabled from oom killing just
   because another thread sharing its mm is oom disabled.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 fs/exec.c                |    4 ----
 fs/proc/base.c           |   13 -------------
 include/linux/mm_types.h |    3 ---
 kernel/exit.c            |    2 --
 kernel/fork.c            |   10 +---------
 mm/oom_kill.c            |   23 +++++------------------
 6 files changed, 6 insertions(+), 49 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -841,10 +841,6 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
-	if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-		atomic_dec(&old_mm->oom_disable_count);
-		atomic_inc(&tsk->mm->oom_disable_count);
-	}
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1107,13 +1107,6 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		goto err_sighand;
 	}
 
-	if (oom_adjust != task->signal->oom_adj) {
-		if (oom_adjust == OOM_DISABLE)
-			atomic_inc(&task->mm->oom_disable_count);
-		if (task->signal->oom_adj == OOM_DISABLE)
-			atomic_dec(&task->mm->oom_disable_count);
-	}
-
 	/*
 	 * Warn that /proc/pid/oom_adj is deprecated, see
 	 * Documentation/feature-removal-schedule.txt.
@@ -1215,12 +1208,6 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		goto err_sighand;
 	}
 
-	if (oom_score_adj != task->signal->oom_score_adj) {
-		if (oom_score_adj == OOM_SCORE_ADJ_MIN)
-			atomic_inc(&task->mm->oom_disable_count);
-		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-			atomic_dec(&task->mm->oom_disable_count);
-	}
 	task->signal->oom_score_adj = oom_score_adj;
 	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
 		task->signal->oom_score_adj_min = oom_score_adj;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -313,9 +313,6 @@ struct mm_struct {
 	unsigned int token_priority;
 	unsigned int last_interval;
 
-	/* How many tasks sharing this mm are OOM_DISABLE */
-	atomic_t oom_disable_count;
-
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */
diff --git a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -681,8 +681,6 @@ static void exit_mm(struct task_struct * tsk)
 	enter_lazy_tlb(mm, current);
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
-	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-		atomic_dec(&mm->oom_disable_count);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -501,7 +501,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	mm->cached_hole_size = ~0UL;
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
-	atomic_set(&mm->oom_disable_count, 0);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
@@ -816,8 +815,6 @@ good_mm:
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
 	mm->last_interval = 0;
-	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-		atomic_inc(&mm->oom_disable_count);
 
 	tsk->mm = mm;
 	tsk->active_mm = mm;
@@ -1391,13 +1388,8 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm) {
-		task_lock(p);
-		if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
-			atomic_dec(&p->mm->oom_disable_count);
-		task_unlock(p);
+	if (p->mm)
 		mmput(p->mm);
-	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		free_signal_struct(p->signal);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,13 +53,7 @@ int test_set_oom_score_adj(int new_val)
 
 	spin_lock_irq(&sighand->siglock);
 	old_val = current->signal->oom_score_adj;
-	if (new_val != old_val) {
-		if (new_val == OOM_SCORE_ADJ_MIN)
-			atomic_inc(&current->mm->oom_disable_count);
-		else if (old_val == OOM_SCORE_ADJ_MIN)
-			atomic_dec(&current->mm->oom_disable_count);
-		current->signal->oom_score_adj = new_val;
-	}
+	current->signal->oom_score_adj = new_val;
 	spin_unlock_irq(&sighand->siglock);
 
 	return old_val;
@@ -172,16 +166,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
 		return 0;
 
 	/*
-	 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
-	 * so the entire heuristic doesn't need to be executed for something
-	 * that cannot be killed.
-	 */
-	if (atomic_read(&p->mm->oom_disable_count)) {
-		task_unlock(p);
-		return 0;
-	}
-
-	/*
 	 * The memory controller may have a limit of 0 bytes, so avoid a divide
 	 * by zero, if necessary.
 	 */
@@ -447,6 +431,9 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 	for_each_process(q)
 		if (q->mm == mm && !same_thread_group(q, p) &&
 		    !(q->flags & PF_KTHREAD)) {
+			if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+				continue;
+
 			task_lock(q);	/* Protect ->comm from prctl() */
 			pr_err("Kill process %d (%s) sharing same memory\n",
 				task_pid_nr(q), q->comm);
@@ -723,7 +710,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	read_lock(&tasklist_lock);
 	if (sysctl_oom_kill_allocating_task &&
 	    !oom_unkillable_task(current, NULL, nodemask) &&
-	    current->mm && !atomic_read(&current->mm->oom_disable_count)) {
+	    current->mm) {
 		/*
 		 * oom_kill_process() needs tasklist_lock held.  If it returns
 		 * non-zero, current could not be killed so we must fallback to

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [patch 2/2] oom: fix race while temporarily setting current's oom_score_adj
  2011-08-30  7:43                     ` [patch 1/2] oom: remove oom_disable_count David Rientjes
@ 2011-08-30  7:43                       ` David Rientjes
  2011-08-30 15:57                         ` Oleg Nesterov
  2011-08-30 15:28                       ` [patch 1/2] oom: remove oom_disable_count Oleg Nesterov
  1 sibling, 1 reply; 5+ messages in thread
From: David Rientjes @ 2011-08-30  7:43 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ying Han, Oleg Nesterov, KOSAKI Motohiro, linux-kernel, linux-mm

test_set_oom_score_adj() was introduced in 72788c385604 (oom: replace
PF_OOM_ORIGIN with toggling oom_score_adj) to temporarily elevate
current's oom_score_adj for ksm and swapoff without requiring an
additional per-process flag.

Using that function to both set oom_score_adj to OOM_SCORE_ADJ_MAX and
then reinstate the previous value is racy since it's possible that
userspace can set the value to something else itself before the old value
is reinstated.  That results in userspace setting current's oom_score_adj
to a different value and then the kernel immediately setting it back to
its previous value without notification.

To fix this, a new compare_swap_oom_score_adj() function is introduced
with the same semantics as the compare and swap CAS instruction, or
CMPXCHG on x86.  It is used to reinstate the previous value of
oom_score_adj if and only if the present value is the same as the old
value.

Signed-off-by: David Rientjes <rientjes@google.com>
---
 include/linux/oom.h |    1 +
 mm/ksm.c            |    3 ++-
 mm/oom_kill.c       |   19 +++++++++++++++++++
 mm/swapfile.c       |    2 +-
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -40,6 +40,7 @@ enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
+extern void compare_swap_oom_score_adj(int old_val, int new_val);
 extern int test_set_oom_score_adj(int new_val);
 
 extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 
 			oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
 			err = unmerge_and_remove_all_rmap_items();
-			test_set_oom_score_adj(oom_score_adj);
+			compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
+								oom_score_adj);
 			if (err) {
 				ksm_run = KSM_RUN_STOP;
 				count = err;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -38,6 +38,25 @@ int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 static DEFINE_SPINLOCK(zone_scan_lock);
 
+/*
+ * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
+ * @old_val: old oom_score_adj for compare
+ * @new_val: new oom_score_adj for swap
+ *
+ * Sets the oom_score_adj value for current to @new_val iff its present value is
+ * @old_val.  Usually used to reinstate a previous value to prevent racing with
+ * userspacing tuning the value in the interim.
+ */
+void compare_swap_oom_score_adj(int old_val, int new_val)
+{
+	struct sighand_struct *sighand = current->sighand;
+
+	spin_lock_irq(&sighand->siglock);
+	if (current->signal->oom_score_adj == old_val)
+		current->signal->oom_score_adj = new_val;
+	spin_unlock_irq(&sighand->siglock);
+}
+
 /**
  * test_set_oom_score_adj() - set current's oom_score_adj and return old value
  * @new_val: new oom_score_adj value
diff --git a/mm/swapfile.c b/mm/swapfile.c
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1617,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
 	err = try_to_unuse(type);
-	test_set_oom_score_adj(oom_score_adj);
+	compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
 
 	if (err) {
 		/*

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch 1/2] oom: remove oom_disable_count
  2011-08-30  7:43                     ` [patch 1/2] oom: remove oom_disable_count David Rientjes
  2011-08-30  7:43                       ` [patch 2/2] oom: fix race while temporarily setting current's oom_score_adj David Rientjes
@ 2011-08-30 15:28                       ` Oleg Nesterov
  2011-08-30 22:06                         ` David Rientjes
  1 sibling, 1 reply; 5+ messages in thread
From: Oleg Nesterov @ 2011-08-30 15:28 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Ying Han, KOSAKI Motohiro, linux-kernel, linux-mm

On 08/30, David Rientjes wrote:
>
> This removes mm->oom_disable_count entirely since it's unnecessary and
> currently buggy.  The counter was intended to be per-process but it's
> currently decremented in the exit path for each thread that exits, causing
> it to underflow.
>
> The count was originally intended to prevent oom killing threads that
> share memory with threads that cannot be killed since it doesn't lead to
> future memory freeing.  The counter could be fixed to represent all
> threads sharing the same mm, but it's better to remove the count since:
>
>  - it is possible that the OOM_DISABLE thread sharing memory with the
>    victim is waiting on that thread to exit and will actually cause
>    future memory freeing, and
>
>  - there is no guarantee that a thread is disabled from oom killing just
>    because another thread sharing its mm is oom disabled.

Great, thanks.

Even _if_ (I hope not) we decide to re-introduce this counter later,
I think it will be much more simple to start from the very beginning
and make the correct patch.

> @@ -447,6 +431,9 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
>  	for_each_process(q)
>  		if (q->mm == mm && !same_thread_group(q, p) &&
>  		    !(q->flags & PF_KTHREAD)) {

(I guess this is on top of -mm patch)

> +			if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> +				continue;
> +

Afaics, this is the only change apart from "removes mm->oom_disable_count
entirely", looks reasonable to me.

Oleg.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch 2/2] oom: fix race while temporarily setting current's oom_score_adj
  2011-08-30  7:43                       ` [patch 2/2] oom: fix race while temporarily setting current's oom_score_adj David Rientjes
@ 2011-08-30 15:57                         ` Oleg Nesterov
  0 siblings, 0 replies; 5+ messages in thread
From: Oleg Nesterov @ 2011-08-30 15:57 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Ying Han, KOSAKI Motohiro, linux-kernel, linux-mm

On 08/30, David Rientjes wrote:
>
> Using that function to both set oom_score_adj to OOM_SCORE_ADJ_MAX and
> then reinstate the previous value is racy since it's possible that
> userspace can set the value to something else itself before the old value
> is reinstated.  That results in userspace setting current's oom_score_adj
> to a different value and then the kernel immediately setting it back to
> its previous value without notification.

Sure,

> To fix this, a new compare_swap_oom_score_adj() function is introduced
> with the same semantics as the compare and swap CAS instruction, or
> CMPXCHG on x86.  It is used to reinstate the previous value of
> oom_score_adj if and only if the present value is the same as the old
> value.

But this can't fix the race completely ?

> +void compare_swap_oom_score_adj(int old_val, int new_val)
> +{
> +	struct sighand_struct *sighand = current->sighand;
> +
> +	spin_lock_irq(&sighand->siglock);
> +	if (current->signal->oom_score_adj == old_val)
> +		current->signal->oom_score_adj = new_val;
> +	spin_unlock_irq(&sighand->siglock);
> +}

So. This is used this way:

	old_val = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);

	do_something();

	compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, old_val);

What if userspace sets oom_score_adj = OOM_SCORE_ADJ_MAX in between?
May be the callers should use OOM_SCORE_ADJ_MAX + 1 instead, this way
we can't confuse old_val with the value from the userspace...




But in fact I am writing this email because I have the question.
Do we really need 2 helpers, and do we really need to allow to set
the arbitrary value?

I mean, perhaps we can do something like

	void set_oom_victim(bool on)
	{
		if (on) {
			oom_score_adj += ADJ_MAX - ADJ_MIN + 1;
		} else if (oom_score_adj > ADJ_MAX) {
			oom_score_adj -= ADJ_MAX - ADJ_MIN + 1;
		}
	}

Not sure this really makes sense, just curious.

Oleg.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch 1/2] oom: remove oom_disable_count
  2011-08-30 15:28                       ` [patch 1/2] oom: remove oom_disable_count Oleg Nesterov
@ 2011-08-30 22:06                         ` David Rientjes
  0 siblings, 0 replies; 5+ messages in thread
From: David Rientjes @ 2011-08-30 22:06 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Andrew Morton, Ying Han, KOSAKI Motohiro, linux-kernel, linux-mm

On Tue, 30 Aug 2011, Oleg Nesterov wrote:

> > @@ -447,6 +431,9 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
> >  	for_each_process(q)
> >  		if (q->mm == mm && !same_thread_group(q, p) &&
> >  		    !(q->flags & PF_KTHREAD)) {
> 
> (I guess this is on top of -mm patch)
> 

Yes, it's based on 
oom-avoid-killing-kthreads-if-they-assume-the-oom-killed-threads-mm.patch 
which I thought would be pushed for the 3.1 rc series, we certainly don't 
want to SIGKILL kthreads :)

> > +			if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
> > +				continue;
> > +
> 
> Afaics, this is the only change apart from "removes mm->oom_disable_count
> entirely", looks reasonable to me.
> 

Yeah, it's necessary because this loop in oom_kill_task() kills all 
user threads in different thread groups unconditionally if they share the 
same mm, so we need to ensure that we aren't sending a SIGKILL to anything 
that is actually oom disabled.  Before, the check in oom_badness() would 
have prevented the task (`p' in this function) from being chosen in the 
first place.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2011-08-30 22:06 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20110727163159.GA23785@redhat.com>
     [not found] ` <20110727163610.GJ23793@redhat.com>
     [not found]   ` <20110727175624.GA3950@redhat.com>
     [not found]     ` <20110728154324.GA22864@redhat.com>
     [not found]       ` <alpine.DEB.2.00.1107281341060.16093@chino.kir.corp.google.com>
     [not found]         ` <20110729141431.GA3501@redhat.com>
     [not found]           ` <20110730143426.GA6061@redhat.com>
     [not found]             ` <20110730152238.GA17424@redhat.com>
     [not found]               ` <4E369372.80105@jp.fujitsu.com>
     [not found]                 ` <20110829183743.GA15216@redhat.com>
     [not found]                   ` <alpine.DEB.2.00.1108291611070.32495@chino.kir.corp.google.com>
2011-08-30  7:43                     ` [patch 1/2] oom: remove oom_disable_count David Rientjes
2011-08-30  7:43                       ` [patch 2/2] oom: fix race while temporarily setting current's oom_score_adj David Rientjes
2011-08-30 15:57                         ` Oleg Nesterov
2011-08-30 15:28                       ` [patch 1/2] oom: remove oom_disable_count Oleg Nesterov
2011-08-30 22:06                         ` David Rientjes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox