linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org, Michal Hocko <mhocko@suse.com>,
	Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>,
	Dmitry Vyukov <dvyukov@google.com>,
	Petr Mladek <pmladek@suse.com>,
	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>,
	Steven Rostedt <rostedt@goodmis.org>
Subject: [PATCH 2/4] mm, oom: Avoid potential RCU stall at dump_tasks().
Date: Wed, 22 May 2019 19:08:04 +0900	[thread overview]
Message-ID: <1558519686-16057-2-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp> (raw)
In-Reply-To: <1558519686-16057-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp>

dump_tasks() calls printk() on each userspace process under RCU which
might result in RCU stall warning. I proposed introducing timeout for
dump_tasks() operation, but Michal Hocko expects that dump_tasks() is
either print all or suppress all [1]. Therefore, this patch changes to
cache all candidates at oom_evaluate_task() and then print the cached
candidates at dump_tasks().

With this patch, dump_tasks() no longer need to call printk() under RCU.
Also, dump_tasks() no longer need to check oom_unkillable_task() by
traversing all userspace processes, for select_bad_process() already
traversed all possible candidates. Also, the OOM killer no longer need to
call put_task_struct() from atomic context, and we can simplify refcount
handling for __oom_kill_process().

This patch has a user-visible side effect that oom_kill_allocating_task
path implies oom_dump_tasks == 0, for oom_evaluate_task() is not called
via select_bad_process(). But since the purpose of enabling
oom_kill_allocating_task is to kill as quick as possible (i.e. tolerate
killing more than needed) whereas the purpose of dump_tasks() which might
take minutes is to understand how the OOM killer selected an OOM victim,
not printing candidates should be acceptable when the administrator asked
the OOM killer to kill current thread.

[1] https://lore.kernel.org/linux-mm/20180906115320.GS14951@dhcp22.suse.cz/

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/sched.h |  1 +
 mm/oom_kill.c         | 44 +++++++++++++++++++-------------------------
 2 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1183741..f1736bf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1180,6 +1180,7 @@ struct task_struct {
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
 #endif
+	struct list_head                oom_candidate_list;
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
 #endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7534046..00b594c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -63,6 +63,7 @@
  * and mark_oom_victim
  */
 DEFINE_MUTEX(oom_lock);
+static LIST_HEAD(oom_candidate_list);
 
 #ifdef CONFIG_NUMA
 /**
@@ -333,6 +334,9 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 		goto abort;
 	}
 
+	get_task_struct(task);
+	list_add_tail(&task->oom_candidate_list, &oom_candidate_list);
+
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
 	 * killed first if it triggers an oom, then select it.
@@ -350,16 +354,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
 		goto next;
 select:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
-	get_task_struct(task);
 	oc->chosen = task;
 	oc->chosen_points = points;
 next:
 	return 0;
 abort:
-	if (oc->chosen)
-		put_task_struct(oc->chosen);
 	oc->chosen = (void *)-1UL;
 	return 1;
 }
@@ -401,11 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 
 	pr_info("Tasks state (memory values in pages):\n");
 	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
-	rcu_read_lock();
-	for_each_process(p) {
-		if (oom_unkillable_task(p, memcg, nodemask))
-			continue;
-
+	list_for_each_entry(p, &oom_candidate_list, oom_candidate_list) {
+		cond_resched();
 		task = find_lock_task_mm(p);
 		if (!task) {
 			/*
@@ -424,7 +420,6 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
 	}
-	rcu_read_unlock();
 }
 
 static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
@@ -455,7 +450,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 		if (is_dump_unreclaim_slabs())
 			dump_unreclaimable_slab();
 	}
-	if (sysctl_oom_dump_tasks)
+	if (sysctl_oom_dump_tasks && !list_empty(&oom_candidate_list))
 		dump_tasks(oc->memcg, oc->nodemask);
 	if (p)
 		dump_oom_summary(oc, p);
@@ -849,17 +844,11 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	struct mm_struct *mm;
 	bool can_oom_reap = true;
 
-	p = find_lock_task_mm(victim);
-	if (!p) {
-		put_task_struct(victim);
-		return;
-	} else if (victim != p) {
-		get_task_struct(p);
-		put_task_struct(victim);
-		victim = p;
-	}
-
 	/* Get a reference to safely compare mm after task_unlock(victim) */
+	victim = find_lock_task_mm(victim);
+	if (!victim)
+		return;
+	get_task_struct(victim);
 	mm = victim->mm;
 	mmgrab(mm);
 
@@ -931,7 +920,6 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
 {
 	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
 	    !is_global_init(task)) {
-		get_task_struct(task);
 		__oom_kill_process(task, message);
 	}
 	return 0;
@@ -954,7 +942,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 		mark_oom_victim(victim);
 		wake_oom_reaper(victim);
 		task_unlock(victim);
-		put_task_struct(victim);
 		return;
 	}
 	task_unlock(victim);
@@ -1077,7 +1064,6 @@ bool out_of_memory(struct oom_control *oc)
 	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
 	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
-		get_task_struct(current);
 		oc->chosen = current;
 		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
 		return true;
@@ -1099,6 +1085,14 @@ bool out_of_memory(struct oom_control *oc)
 	if (oc->chosen && oc->chosen != (void *)-1UL)
 		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
 				 "Memory cgroup out of memory");
+	while (!list_empty(&oom_candidate_list)) {
+		struct task_struct *p = list_first_entry(&oom_candidate_list,
+							 struct task_struct,
+							 oom_candidate_list);
+
+		list_del(&p->oom_candidate_list);
+		put_task_struct(p);
+	}
 	return !!oc->chosen;
 }
 
-- 
1.8.3.1


  reply	other threads:[~2019-05-22 10:46 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-22 10:08 [PATCH 1/4] mm, oom: Remove redundant OOM score normalization at select_bad_process() Tetsuo Handa
2019-05-22 10:08 ` Tetsuo Handa [this message]
2019-05-22 10:08 ` [PATCH 3/4] mm, oom: Wait for OOM victims even if oom_kill_allocating_task case Tetsuo Handa
2019-05-22 10:08 ` [PATCH 4/4] mm, oom: Respect oom_task_origin() " Tetsuo Handa
2019-05-23 22:04 ` [PATCH 5/4] mm, oom: Deduplicate memcg candidates at select_bad_process() Tetsuo Handa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1558519686-16057-2-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp \
    --to=penguin-kernel@i-love.sakura.ne.jp \
    --cc=akpm@linux-foundation.org \
    --cc=dvyukov@google.com \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@suse.com \
    --cc=pmladek@suse.com \
    --cc=rostedt@goodmis.org \
    --cc=sergey.senozhatsky@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox