linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	vedran.furac@gmail.com, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org,
	"hugh.dickins@tiscali.co.uk" <hugh.dickins@tiscali.co.uk>,
	"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
	rientjes@google.com
Subject: [RFC][PATCH] oom_kill: avoid depends on total_vm and use real RSS/swap value for oom_score (Re: Memory overcommit
Date: Tue, 27 Oct 2009 16:45:26 +0900	[thread overview]
Message-ID: <20091027164526.da6a23cb.kamezawa.hiroyu@jp.fujitsu.com> (raw)
In-Reply-To: <28c262360910262355p3cac5c1bla4de9d42ea67fb4e@mail.gmail.com>

On Tue, 27 Oct 2009 15:55:26 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> >> Hmm.
> >> I wonder why we consider VM size for OOM kiling.
> >> How about RSS size?
> >>
> >
> > Maybe the current code assumes "Tons of swap have been generated, already" if
> > oom-kill is invoked. Then, just using mm->anon_rss will not be correct.
> >
> > Hm, should we count # of swap entries reference from mm ?....
> 
> In Vedran case, he didn't use swap. So, Only considering vm is the problem.
> I think it would be better to consider both RSS + # of swap entries as
> Kosaki mentioned.
> 
Then, maybe this kind of patch is necessary.
This is on 2.6.31...then I may have to rebase this to mmotm.
Added more CCs.

Vedran, I'm glad if you can test this patch.


==
Now, oom-killer's score uses mm->total_vm as its base value.
But these days, applications like GUI programs tend to use
many shared libraries, and total_vm grows too high even when
pages are not fully mapped.

For example, when running a program "mmap" which allocates 1 GByte of
anonymous memory, the oom_score top 10 on the system will be..

 score  PID     name
 89924	3938	mixer_applet2
 90210	3942	tomboy
 94753	3936	clock-applet
 101994	3919	pulseaudio
 113525	4028	gnome-terminal
 127340	1	init
 128177	3871	nautilus
 151003	11515	bash
 256944	11653	mmap <-----------------use 1G of anon
 425561	3829	gnome-session

No one believes gnome-session is more guilty than "mmap".

Instead of total_vm, we should use anon/file/swap usage of a process, I think.
This patch adds mm->swap_usage and calculates oom_score based on
  anon_rss + file_rss + swap_usage.
Considering usual applications, this will be much better information than
total_vm. After this patch, the score on my desktop is

score   PID     name
4033	3176	gnome-panel
4077	3113	xinit
4526	3190	python
4820	3161	gnome-settings-
4989	3289	gnome-terminal
7105	3271	tomboy
8427	3177	nautilus
17549	3140	gnome-session
128501	3299	bash
256106	3383	mmap

This order is not bad, I think.

Note: This adds a new counter...so new cost is added.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/mm_types.h |    1 +
 mm/memory.c              |   29 +++++++++++++++++++++--------
 mm/oom_kill.c            |   12 +++++++++---
 mm/rmap.c                |    1 +
 mm/swapfile.c            |    1 +
 5 files changed, 33 insertions(+), 11 deletions(-)

Index: linux-2.6.31/include/linux/mm_types.h
===================================================================
--- linux-2.6.31.orig/include/linux/mm_types.h
+++ linux-2.6.31/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
 	 */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
+	mm_counter_t _swap_usage;
 
 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: linux-2.6.31/mm/memory.c
===================================================================
--- linux-2.6.31.orig/mm/memory.c
+++ linux-2.6.31/mm/memory.c
@@ -361,12 +361,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 	return 0;
 }
 
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline
+void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swaps)
 {
 	if (file_rss)
 		add_mm_counter(mm, file_rss, file_rss);
 	if (anon_rss)
 		add_mm_counter(mm, anon_rss, anon_rss);
+	if (swaps)
+		add_mm_counter(mm, swap_usage, swaps);
 }
 
 /*
@@ -562,6 +565,8 @@ copy_one_pte(struct mm_struct *dst_mm, s
 						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
+			if (!is_migration_entry(entry))
+				rss[2]++;
 			if (is_write_migration_entry(entry) &&
 					is_cow_mapping(vm_flags)) {
 				/*
@@ -611,10 +616,10 @@ static int copy_pte_range(struct mm_stru
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
-	int rss[2];
+	int rss[3];
 
 again:
-	rss[1] = rss[0] = 0;
+	rss[2] = rss[1] = rss[0] = 0;
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
@@ -645,7 +650,7 @@ again:
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(src_pte - 1);
-	add_mm_rss(dst_mm, rss[0], rss[1]);
+	add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
 	pte_unmap_unlock(dst_pte - 1, dst_ptl);
 	cond_resched();
 	if (addr != end)
@@ -769,6 +774,7 @@ static unsigned long zap_pte_range(struc
 	spinlock_t *ptl;
 	int file_rss = 0;
 	int anon_rss = 0;
+	int swaps = 0;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -838,13 +844,19 @@ static unsigned long zap_pte_range(struc
 		if (pte_file(ptent)) {
 			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
 				print_bad_pte(vma, addr, ptent, NULL);
-		} else if
-		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
-			print_bad_pte(vma, addr, ptent, NULL);
+		} else {
+			swp_entry_t entry = pte_to_swp_entry(ptent);
+
+			if (!is_migration_entry(entry))
+				swaps++;
+
+			if (unlikely(!free_swap_and_cache(entry)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		}
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
-	add_mm_rss(mm, file_rss, anon_rss);
+	add_mm_rss(mm, file_rss, anon_rss, swaps);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
@@ -2573,6 +2585,7 @@ static int do_swap_page(struct mm_struct
 	 */
 
 	inc_mm_counter(mm, anon_rss);
+	dec_mm_counter(mm, swap_usage);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: linux-2.6.31/mm/rmap.c
===================================================================
--- linux-2.6.31.orig/mm/rmap.c
+++ linux-2.6.31/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page 
 				spin_unlock(&mmlist_lock);
 			}
 			dec_mm_counter(mm, anon_rss);
+			inc_mm_counter(mm, swap_usage);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
Index: linux-2.6.31/mm/swapfile.c
===================================================================
--- linux-2.6.31.orig/mm/swapfile.c
+++ linux-2.6.31/mm/swapfile.c
@@ -830,6 +830,7 @@ static int unuse_pte(struct vm_area_stru
 	}
 
 	inc_mm_counter(vma->vm_mm, anon_rss);
+	dec_mm_counter(vma->vm_mm, swap_usage);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6.31/mm/oom_kill.c
===================================================================
--- linux-2.6.31.orig/mm/oom_kill.c
+++ linux-2.6.31/mm/oom_kill.c
@@ -69,7 +69,8 @@ unsigned long badness(struct task_struct
 	/*
 	 * The memory size of the process is the basis for the badness.
 	 */
-	points = mm->total_vm;
+	points = get_mm_counter(mm, anon_rss) + get_mm_counter(mm, file_rss)
+		 + get_mm_counter(mm, swap_usage);
 
 	/*
 	 * After this unlock we can no longer dereference local variable `mm'
@@ -92,8 +93,13 @@ unsigned long badness(struct task_struct
 	 */
 	list_for_each_entry(child, &p->children, sibling) {
 		task_lock(child);
-		if (child->mm != mm && child->mm)
-			points += child->mm->total_vm/2 + 1;
+		if (child->mm != mm && child->mm) {
+			unsigned long cpoint;
+			/* At considering child, we don't count swap */
+			cpoint = get_mm_counter(child->mm, anon_rss) +
+				 get_mm_counter(child->mm, file_rss);
+			points += cpoint/2 + 1;
+		}
 		task_unlock(child);
 	}
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2009-10-27  7:47 UTC|newest]

Thread overview: 77+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <hav57c$rso$1@ger.gmane.org>
     [not found] ` <20091013120840.a844052d.kamezawa.hiroyu@jp.fujitsu.com>
     [not found]   ` <hb2cfu$r08$2@ger.gmane.org>
     [not found]     ` <20091014135119.e1baa07f.kamezawa.hiroyu@jp.fujitsu.com>
2009-10-20 21:52       ` Vedran Furač
2009-10-26  1:55         ` KAMEZAWA Hiroyuki
2009-10-26 16:16           ` Vedran Furač
2009-10-27  3:22             ` KAMEZAWA Hiroyuki
2009-10-27  6:10               ` KOSAKI Motohiro
2009-10-27  6:34                 ` Minchan Kim
2009-10-27  6:36                   ` KAMEZAWA Hiroyuki
2009-10-27  6:55                     ` Minchan Kim
2009-10-27  7:45                       ` KAMEZAWA Hiroyuki [this message]
2009-10-27  7:56                         ` [RFC][PATCH] oom_kill: avoid depends on total_vm and use real RSS/swap value for oom_score (Re: " Minchan Kim
2009-10-27 12:38                           ` Andrea Arcangeli
2009-10-28  0:22                             ` KAMEZAWA Hiroyuki
2009-10-28  0:45                               ` Vedran Furač
2009-10-27  7:56                         ` KAMEZAWA Hiroyuki
2009-10-27  8:14                           ` Minchan Kim
2009-10-27  8:33                             ` KAMEZAWA Hiroyuki
2009-10-27  8:52                               ` Minchan Kim
2009-10-27  8:56                                 ` KAMEZAWA Hiroyuki
2009-10-27 17:41                         ` Vedran Furač
2009-10-28  0:13                           ` KAMEZAWA Hiroyuki
2009-10-27 18:39                         ` Hugh Dickins
2009-10-27 18:47                           ` Andrea Arcangeli
2009-10-28  0:32                             ` KAMEZAWA Hiroyuki
2009-11-05 19:02                             ` Pavel Machek
2009-10-28  0:28                           ` KAMEZAWA Hiroyuki
2009-10-27  6:46                   ` KOSAKI Motohiro
2009-10-27  6:56                     ` Minchan Kim
2009-10-27 17:12               ` Vedran Furač
2009-10-27 18:02                 ` KOSAKI Motohiro
2009-10-27 18:30                   ` Vedran Furač
2009-10-27 20:44               ` Hugh Dickins
2009-10-27 21:04                 ` David Rientjes
2009-10-28  0:08                   ` Vedran Furač
2009-10-28  0:25                     ` David Rientjes
2009-10-28  0:39                       ` Vedran Furač
2009-10-28  4:08                         ` David Rientjes
2009-10-28  4:55                           ` KAMEZAWA Hiroyuki
2009-10-28  5:13                             ` David Rientjes
2009-10-28  6:05                               ` KAMEZAWA Hiroyuki
2009-10-28  6:17                                 ` David Rientjes
2009-10-28  6:20                                   ` KAMEZAWA Hiroyuki
2009-10-29  8:38                                     ` David Rientjes
2009-10-29 11:11                                       ` Vedran Furač
2009-10-29 19:53                                         ` David Rientjes
2009-10-29 23:48                                           ` KAMEZAWA Hiroyuki
2009-10-30  9:10                                             ` David Rientjes
2009-10-30  9:36                                               ` KAMEZAWA Hiroyuki
2009-11-03 20:49                                                 ` David Rientjes
2009-11-04  0:50                                                   ` KAMEZAWA Hiroyuki
2009-11-04  1:58                                                     ` David Rientjes
2009-11-04  2:17                                                       ` KAMEZAWA Hiroyuki
2009-11-04  3:10                                                         ` David Rientjes
2009-11-04  3:19                                                           ` KAMEZAWA Hiroyuki
2009-10-30 13:59                                           ` Vedran Furač
2009-10-30 19:24                                             ` David Rientjes
2009-11-02 19:58                                               ` Vedran Furač
2009-10-28 13:28                           ` Vedran Furač
2009-10-28 20:10                             ` David Rientjes
2009-10-29  3:05                               ` Vedran Furač
2009-10-29  8:35                                 ` David Rientjes
2009-10-29 11:01                                   ` Vedran Furač
2009-10-29 19:42                                     ` David Rientjes
2009-10-30 13:53                                       ` Vedran Furač
2009-10-30 14:08                                         ` Thomas Fjellstrom
2009-10-30 15:13                                           ` Vedran Furač
2009-10-30 14:12                                         ` Andrea Arcangeli
2009-10-30 14:41                                           ` Vedran Furač
2009-10-30 15:15                                             ` Andrea Arcangeli
2009-10-30 16:24                                               ` Hugh Dickins
2009-11-02 19:56                                               ` Vedran Furač
2009-10-30 19:44                                         ` David Rientjes
2009-11-02 19:56                                           ` Vedran Furač
2009-10-28  0:43                 ` KAMEZAWA Hiroyuki
2009-10-28  2:47                 ` KOSAKI Motohiro
2009-10-28  3:17                   ` KAMEZAWA Hiroyuki
2009-10-28  4:12                   ` David Rientjes
2009-10-28  8:10                     ` Hugh Dickins

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091027164526.da6a23cb.kamezawa.hiroyu@jp.fujitsu.com \
    --to=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=akpm@linux-foundation.org \
    --cc=hugh.dickins@tiscali.co.uk \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=minchan.kim@gmail.com \
    --cc=rientjes@google.com \
    --cc=vedran.furac@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox