From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
To: "Arkadiusz Miśkiewicz" <arekm@maven.pl>,
	linux-mm@kvack.org, xfs@oss.sgi.com
Subject: Re: memory reclaim problems on fs usage
Date: Thu, 12 Nov 2015 00:58:03 +0900	[thread overview]
Message-ID: <5643658B.9090206@I-love.SAKURA.ne.jp> (raw)
In-Reply-To: <201511102313.36685.arekm@maven.pl>

On 2015/11/11 7:13, Arkadiusz Miśkiewicz wrote:
> The usual (repeatable) problem is like this:
>
> full dmesg: http://sprunge.us/VEiE (more in it than in the partial log below)

Maybe some task doing a GFP_NOIO allocation, which the XFS driver's
GFP_NOFS allocation is waiting for, is stalling inside the memory
allocator. I think checking which tasks are stalling inside the memory
allocator would help.

Please try to reproduce this problem with the debug printk() patch shown
below applied. This is a patch I used for debugging a silent lockup
problem. It puts each task that enters the page allocator's slowpath on a
list, and a "kmallocwd" kernel thread scans that list every 10 seconds;
when a memory allocation gets stuck, lines with the MemAlloc keyword are
printed.
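
For reference, the stall reports take roughly the following form
(assembled from the pr_warn()/xfs_err() format strings in the patch; the
task names, PIDs, gfp masks and delays below are made up for
illustration, and delay is in jiffies):

  MemAlloc-Info: 2 stalling task, 0 dying task, 0 victim task.
  MemAlloc: kworker/2:1(4583) gfp=0x50 order=0 delay=2512
  MemAlloc: cp(11234) gfp=0xd0 order=0 delay=1004
  XFS: cp(11234) possible memory allocation deadlock in kmem_alloc (mode:0x250)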

---
  fs/xfs/kmem.c          |  10 ++-
  fs/xfs/xfs_buf.c       |   3 +-
  include/linux/mmzone.h |   1 +
  include/linux/vmstat.h |   1 +
  mm/page_alloc.c        | 217 +++++++++++++++++++++++++++++++++++++++++++++++++
  mm/vmscan.c            |  22 +++++
  6 files changed, 249 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a7a3a63..535c136 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
  			return ptr;
  		if (!(++retries % 100))
  			xfs_err(NULL,
-		"possible memory allocation deadlock in %s (mode:0x%x)",
-					__func__, lflags);
+		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
+				__func__, lflags);
  		congestion_wait(BLK_RW_ASYNC, HZ/50);
  	} while (1);
  }
@@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
  			return ptr;
  		if (!(++retries % 100))
  			xfs_err(NULL,
-		"possible memory allocation deadlock in %s (mode:0x%x)",
-					__func__, lflags);
+		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+				current->comm, current->pid,
+				__func__, lflags);
  		congestion_wait(BLK_RW_ASYNC, HZ/50);
  	} while (1);
  }
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1790b00..16322cb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -354,7 +354,8 @@ retry:
  			 */
  			if (!(++retries % 100))
  				xfs_err(NULL,
-		"possible memory allocation deadlock in %s (mode:0x%x)",
+		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+					current->comm, current->pid,
  					__func__, gfp_mask);

  			XFS_STATS_INC(xb_page_retries);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 54d74f6..932a6d6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -527,6 +527,7 @@ struct zone {
  	ZONE_PADDING(_pad3_)
  	/* Zone statistics */
  	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
+	unsigned long stat_last_updated[NR_VM_ZONE_STAT_ITEMS];
  } ____cacheline_internodealigned_in_smp;

  enum zone_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 82e7db7..2488925 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -115,6 +115,7 @@ static inline void zone_page_state_add(long x, struct zone *zone,
  {
  	atomic_long_add(x, &zone->vm_stat[item]);
  	atomic_long_add(x, &vm_stat[item]);
+	zone->stat_last_updated[item] = jiffies;
  }

  static inline unsigned long global_page_state(enum zone_stat_item item)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18490f3..35a46b4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,6 +61,8 @@
  #include <linux/hugetlb.h>
  #include <linux/sched/rt.h>
  #include <linux/page_owner.h>
+#include <linux/nmi.h>
+#include <linux/kthread.h>

  #include <asm/sections.h>
  #include <asm/tlbflush.h>
@@ -2496,6 +2498,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  }
  #endif /* CONFIG_COMPACTION */

+pid_t dump_target_pid;
+
  /* Perform direct synchronous page reclaim */
  static int
  __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -2645,6 +2649,208 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
  	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
  }

+static unsigned long kmallocwd_timeout = 10 * HZ; /* Scan interval. */
+static u8 memalloc_counter_active_index; /* Either 0 or 1. */
+static int memalloc_counter[2]; /* Number of tasks doing memory allocation. */
+
+struct memalloc {
+	struct list_head list; /* Connected to memalloc_list. */
+	struct task_struct *task; /* Initialized to current. */
+	unsigned long start; /* Initialized to jiffies. */
+	unsigned int order;
+	gfp_t gfp;
+	u8 index; /* Initialized to memalloc_counter_active_index. */
+	u8 dumped;
+};
+
+static LIST_HEAD(memalloc_list); /* List of "struct memalloc".*/
+static DEFINE_SPINLOCK(memalloc_list_lock); /* Lock for memalloc_list. */
+
+/*
+ * kmallocwd - A kernel thread for monitoring memory allocation stalls.
+ *
+ * @unused: Not used.
+ *
+ * This kernel thread does not terminate.
+ */
+static int kmallocwd(void *unused)
+{
+	struct memalloc *m;
+	struct task_struct *g, *p;
+	unsigned long now;
+	unsigned int sigkill_pending;
+	unsigned int memdie_pending;
+	unsigned int stalling_tasks;
+	u8 index;
+	pid_t pid;
+
+ not_stalling: /* Healthy case. */
+	/* Switch active counter and wait for timeout duration. */
+	index = memalloc_counter_active_index;
+	spin_lock(&memalloc_list_lock);
+	memalloc_counter_active_index ^= 1;
+	spin_unlock(&memalloc_list_lock);
+	schedule_timeout_interruptible(kmallocwd_timeout);
+	/*
+	 * If memory allocations are working, the counter should remain 0
+	 * because tasks will be able to call both start_memalloc_timer()
+	 * and stop_memalloc_timer() within timeout duration.
+	 */
+	if (likely(!memalloc_counter[index]))
+		goto not_stalling;
+ maybe_stalling: /* Maybe something is wrong. Let's check. */
+	now = jiffies;
+	/* Count stalling tasks, dying and victim tasks. */
+	sigkill_pending = 0;
+	memdie_pending = 0;
+	stalling_tasks = 0;
+	pid = 0;
+	spin_lock(&memalloc_list_lock);
+	list_for_each_entry(m, &memalloc_list, list) {
+		if (time_after(now - m->start, kmallocwd_timeout))
+			stalling_tasks++;
+	}
+	spin_unlock(&memalloc_list_lock);
+	preempt_disable();
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		if (test_tsk_thread_flag(p, TIF_MEMDIE))
+			memdie_pending++;
+		if (fatal_signal_pending(p))
+			sigkill_pending++;
+	}
+	rcu_read_unlock();
+	preempt_enable();
+	cond_resched();
+	pr_warn("MemAlloc-Info: %u stalling task, %u dying task, %u victim task.\n",
+		stalling_tasks, sigkill_pending, memdie_pending);
+	/* Report stalling tasks, dying and victim tasks. */
+	spin_lock(&memalloc_list_lock);
+	list_for_each_entry(m, &memalloc_list, list) {
+		if (time_before(now - m->start, kmallocwd_timeout))
+			continue;
+		p = m->task;
+		pr_warn("MemAlloc: %s(%u) gfp=0x%x order=%u delay=%lu\n",
+			p->comm, p->pid, m->gfp, m->order, now - m->start);
+	}
+	spin_unlock(&memalloc_list_lock);
+	preempt_disable();
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		u8 type = 0;
+
+		if (test_tsk_thread_flag(p, TIF_MEMDIE))
+			type |= 1;
+		if (fatal_signal_pending(p))
+			type |= 2;
+		if (likely(!type))
+			continue;
+		if (p->state & TASK_UNINTERRUPTIBLE)
+			type |= 4;
+		pr_warn("MemAlloc: %s(%u)%s%s%s\n", p->comm, p->pid,
+			(type & 4) ? " uninterruptible" : "",
+			(type & 2) ? " dying" : "",
+			(type & 1) ? " victim" : "");
+	}
+	rcu_read_unlock();
+	preempt_enable();
+	cond_resched();
+	/*
+	 * Show traces of newly reported (or too long) stalling tasks.
+	 *
+	 * Show traces only once per 256 timeouts because their traces
+	 * will likely be the same (e.g. cond_resched() or congestion_wait())
+	 * when they are stalling inside __alloc_pages_slowpath().
+	 */
+	spin_lock(&memalloc_list_lock);
+	list_for_each_entry(m, &memalloc_list, list) {
+		if (time_before(now - m->start, kmallocwd_timeout) ||
+		    m->dumped++)
+			continue;
+		p = m->task;
+		sched_show_task(p);
+		debug_show_held_locks(p);
+		touch_nmi_watchdog();
+		if (!pid)
+			pid = p->pid;
+	}
+	spin_unlock(&memalloc_list_lock);
+	/*
+	 * Show traces of dying tasks (including victim tasks).
+	 *
+	 * Only dying tasks which are in trouble (e.g. blocked at unkillable
+	 * locks held by memory allocating tasks) will be repeatedly shown.
+	 * Therefore, we need to pay attention to tasks repeatedly shown here.
+	 */
+	preempt_disable();
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		if (likely(!fatal_signal_pending(p)))
+			continue;
+		sched_show_task(p);
+		debug_show_held_locks(p);
+		touch_nmi_watchdog();
+	}
+	rcu_read_unlock();
+	preempt_enable();
+	show_workqueue_state();
+	if (!dump_target_pid)
+		dump_target_pid = -pid;
+	/* Wait until next timeout duration. */
+	schedule_timeout_interruptible(kmallocwd_timeout);
+	if (memalloc_counter[index])
+		goto maybe_stalling;
+	goto not_stalling;
+	return 0; /* To suppress "no return statement" compiler warning. */
+}
+
+static int __init start_kmallocwd(void)
+{
+	if (kmallocwd_timeout) {
+		struct task_struct *task = kthread_run(kmallocwd, NULL,
+						       "kmallocwd");
+		BUG_ON(IS_ERR(task));
+	}
+	return 0;
+}
+late_initcall(start_kmallocwd);
+
+static int __init kmallocwd_config(char *str)
+{
+	if (kstrtoul(str, 10, &kmallocwd_timeout) == 0)
+		kmallocwd_timeout = min(kmallocwd_timeout * HZ,
+					(unsigned long) LONG_MAX);
+	return 0;
+}
+__setup("kmallocwd=", kmallocwd_config);
+
+static void start_memalloc_timer(struct memalloc *m, const gfp_t gfp_mask,
+				 const int order)
+{
+	if (!kmallocwd_timeout || m->task)
+		return;
+	m->task = current;
+	m->start = jiffies;
+	m->gfp = gfp_mask;
+	m->order = order;
+	m->dumped = 0;
+	spin_lock(&memalloc_list_lock);
+	m->index = memalloc_counter_active_index;
+	memalloc_counter[m->index]++;
+	list_add_tail(&m->list, &memalloc_list);
+	spin_unlock(&memalloc_list_lock);
+}
+
+static void stop_memalloc_timer(struct memalloc *m)
+{
+	if (!m->task)
+		return;
+	spin_lock(&memalloc_list_lock);
+	memalloc_counter[m->index]--;
+	list_del(&m->list);
+	spin_unlock(&memalloc_list_lock);
+}
+
  static inline struct page *
  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  						struct alloc_context *ac)
@@ -2657,6 +2863,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  	enum migrate_mode migration_mode = MIGRATE_ASYNC;
  	bool deferred_compaction = false;
  	int contended_compaction = COMPACT_CONTENDED_NONE;
+	struct memalloc m = { .task = NULL };

  	/*
  	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2678,6 +2885,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
  		goto nopage;

  retry:
+	if (dump_target_pid == -current->pid)
+		dump_target_pid = -dump_target_pid;
+
  	if (!(gfp_mask & __GFP_NO_KSWAPD))
  		wake_all_kswapds(order, ac);

@@ -2740,6 +2950,8 @@ retry:
  	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
  		goto nopage;

+	start_memalloc_timer(&m, gfp_mask, order);
+
  	/*
  	 * Try direct compaction. The first pass is asynchronous. Subsequent
  	 * attempts after direct reclaim are synchronous
@@ -2798,6 +3010,10 @@ retry:
  		goto got_pg;

  	/* Check if we should retry the allocation */
+	if (dump_target_pid == current->pid) {
+		printk(KERN_INFO "did_some_progress=%lu\n", did_some_progress);
+		dump_target_pid = 0;
+	}
  	pages_reclaimed += did_some_progress;
  	if (should_alloc_retry(gfp_mask, order, did_some_progress,
  						pages_reclaimed)) {
@@ -2834,6 +3050,7 @@ retry:
  nopage:
  	warn_alloc_failed(gfp_mask, order, NULL);
  got_pg:
+	stop_memalloc_timer(&m);
  	return page;
  }

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1a17bd7..c449371 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2432,6 +2432,8 @@ static inline bool compaction_ready(struct zone *zone, int order)
  	return watermark_ok;
  }

+extern pid_t dump_target_pid;
+
  /*
   * This is the direct reclaim path, for page-allocating processes.  We only
   * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -2533,7 +2535,27 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)

  		if (global_reclaim(sc) &&
  		    !reclaimable && zone_reclaimable(zone))
+		{
+			if (dump_target_pid == current->pid) {
+				unsigned long rec = zone_reclaimable_pages(zone);
+				unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+				unsigned long min = min_wmark_pages(zone);
+				unsigned long scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+				unsigned long now = jiffies;
+				unsigned long rec2 = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
+					zone_page_state_snapshot(zone, NR_INACTIVE_FILE);
+				unsigned long free2 = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+				unsigned long scanned2 = zone_page_state_snapshot(zone, NR_PAGES_SCANNED);
+
+				printk(KERN_INFO "%s zone_reclaimable: reclaim:%lu(%lu,%lu,%ld) free:%lu(%lu,%ld) min:%lu pages_scanned:%lu(%lu,%ld) prio:%d\n",
+				       zone->name, rec, now - zone->stat_last_updated[NR_ACTIVE_FILE],
+				       now - zone->stat_last_updated[NR_INACTIVE_FILE], rec - rec2,
+				       free, now - zone->stat_last_updated[NR_FREE_PAGES], free - free2,
+				       min, scanned, now - zone->stat_last_updated[NR_PAGES_SCANNED],
+				       scanned - scanned2, sc->priority);
+			}
  			reclaimable = true;
+		}
  	}

  	/*
-- 
1.8.3.1
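
A note on tuning (my reading of the kmallocwd_config() setup in the patch
above): the watchdog scans every 10 seconds by default, and the
kmallocwd= kernel boot parameter overrides the interval in seconds;
passing 0 keeps the watchdog thread from starting and also skips the
per-allocation accounting in start_memalloc_timer(). For example, a
hypothetical command line for a 30-second interval:

  ... root=/dev/sda1 kmallocwd=30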


