From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from max.phys.uu.nl (max.phys.uu.nl [131.211.32.73]) by kvack.org (8.8.7/8.8.7) with ESMTP id QAA26346 for ; Sat, 22 Aug 1998 16:18:44 -0400 Date: Sat, 22 Aug 1998 22:11:21 +0200 (CEST) From: Rik van Riel Reply-To: Rik van Riel Subject: [PATCH] OOM killer, version 2 Message-ID: MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII Sender: owner-linux-mm@kvack.org To: Samuli Kaski Cc: Claus Fischer , Frank Zeroch , Linux Kernel , Linux MM List-ID: Hi, This patch introduces/fixes three things: - out of memory killing - a nice starting point for newbie kernel hackers (mm/oom_kill.c is full of noteworthy notes) - better handling of the maximum page cache & buffer cache size Rik. +-------------------------------------------------------------------+ | Linux memory management tour guide. H.H.vanRiel@phys.uu.nl | | Scouting Vries cubscout leader. http://www.phys.uu.nl/~riel/ | +-------------------------------------------------------------------+ --- mm/Makefile.orig Sun Aug 16 17:26:38 1998 +++ mm/Makefile Sun Aug 16 17:26:57 1998 @@ -9,7 +9,7 @@ O_TARGET := mm.o O_OBJS := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ - vmalloc.o slab.o \ + vmalloc.o slab.o oom_kill.o\ swap.o vmscan.o page_io.o page_alloc.o swap_state.o swapfile.o include $(TOPDIR)/Rules.make --- mm/oom_kill.c.orig Tue Aug 18 19:24:07 1998 +++ mm/oom_kill.c Sat Aug 22 22:05:21 1998 @@ -1 +1,174 @@ +/* + * linux/mm/oom_kill.c + * + * Copyright (C) 1998 Rik van Riel + * Thanks go out to Claus Fischer for some serious inspiration and + * for goading me into coding this file... + * + * The routines in this file are used to kill a process when + * we're seriously out of memory. This gets called from kswapd() + * in linux/mm/vmscan.c when we really run out of memory. + * + * Since we won't call these routines often (on a well-configured + * machine) this file will double as a 'coding guide' and a signpost + * for newbie kernel hackers. 
It features several pointers to major + * kernel subsystems and hints as to where to find out what things do. + */ +#include +#include +#include +#include +#include +#include + +#define DEBUG +/* Hmm, I remember a global declaration. Haven't found + * it though... + */ +#define min(a,b) (((a)<(b))?(a):(b)) + +/* + * These definitions should move to linux/include/linux/swapctl.h + * but I want to change as few files as possible while the patch + * is still in alpha -- this will have to change before submission + * however -- Rik. + */ +typedef struct vm_kill_t +{ + unsigned int ram; + unsigned int total; +} vm_kill_t; + +struct vm_kill_t vm_kill = {25, 3}; + +/* + * Wow, black magic :) [read closely, the TCP code is hairier] + */ +inline int int_sqrt(unsigned int x) +{ + unsigned int out = x; + while (x & ~(unsigned int)1) x >>=2, out >>=1; + if (x) out -= out >> 2; + return (out ? out : 1); +} + +/* + * Basically, points = size / (sqrt(CPU_used) * sqrt(sqrt(time_running))) + * with some bonuses/penalties. + * + * The definition of the task_struct, the structure describing the state + * of each process, can be found in include/linux/sched.h. For + * capability info, you should read include/linux/capability.h. + */ + +inline int badness(struct task_struct *p) +{ + int points = p->mm->total_vm; + points /= int_sqrt((p->times.tms_utime + p->times.tms_stime) >> (SHIFT_HZ + 3)); + points /= int_sqrt(int_sqrt((jiffies - p->start_time) >> (SHIFT_HZ + 10))); +/* + * DEF_PRIORITY is the length of the standard process priority; + * see include/linux/sched.h for more info. + */ + if (p->priority < DEF_PRIORITY) + points <<= 1; +/* + * p->(e)uid is the process User ID, ID 0 is root, the super user. Since + * the super user can do anything, and does almost nothing (on a proper + * system), we have to assume that the process is trusted/good. + * Besides, the super user usually runs important system services, which + * we don't want to kill... 
+ */ + if (p->uid == 0 || p->euid == 0 || p->cap_effective.cap & CAP_TO_MASK(CAP_SYS_ADMIN)) + points >>= 2; +/* + * NEVER, EVER kill a process with direct hardware access. Since + * they function almost as a device driver, killing one of those + * might hang the system -- which is something we need to prevent + * at all cost... + */ + if (p->cap_effective.cap & CAP_TO_MASK(CAP_SYS_RAWIO) +#ifdef __i386__ + || p->tss.bitmap == offsetof(struct thread_struct, io_bitmap) +#endif + ) + points = 0; +#ifdef DEBUG + printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", + p->pid, p->comm, points); +#endif + return points; +} + +inline struct task_struct * select_bad_process(void) +{ + int points = 0, maxpoints = 0; + struct task_struct *p = NULL; + struct task_struct *chosen = NULL; +/* + * These locks are used to prevent modification of critical + * structures while we're working with them. Remember that + * Linux is a multitasking (and sometimes SMP) system. + * -- Luckily these nice macros are made available so we don't + * have to do cumbersome locking ourselves :) + */ + read_lock(&tasklist_lock); + for_each_task(p) + if (p->pid) + points = badness(p); + if (points > maxpoints) { + chosen = p; + maxpoints = points; + } + read_unlock(&tasklist_lock); + return chosen; +} + +/* + * The SCHED_FIFO magic should make sure that the killed context + * gets absolute priority when killing itself. This should prevent + * a looping kswapd from interfering with the process killing. + * Read kernel/sched.c::goodness() and kernel/sched.c::schedule() + * for more info. + */ +void oom_kill(void) +{ + + struct task_struct *p = select_bad_process(); + if (p == NULL) + return; + printk(KERN_ERR "Out of Memory: Killed process %d (%s).", p->pid, p->comm); + force_sig(SIGKILL, p); + p->policy = SCHED_FIFO; + p->rt_priority = 1000; + current->policy |= SCHED_YIELD; + schedule(); + return; +} + +/* + * Are we out of memory? 
+ * + * We ignore swap cache pages and simplify the situation a bit. + * This won't do any damage, because we're only called when kswapd + * is already failing to free pages and when that is happening we + * can assume that the swap cache is very small. See the test in + * mm/vmscan.c::kswapd() for more info. + */ + +int out_of_memory(void) +{ + struct sysinfo val; + int free_vm, kill_limit; + si_meminfo(&val); + si_swapinfo(&val); + kill_limit = min(vm_kill.ram * (val.totalram >> PAGE_SHIFT), + vm_kill.total * ((val.totalram + val.totalswap) >> PAGE_SHIFT)); + free_vm = ((val.freeram + val.bufferram + val.freeswap) >> + PAGE_SHIFT) + page_cache_size - (page_cache.min_percent + + buffer_mem.min_percent) * num_physpages; + if (free_vm * 100 < kill_limit) + return 1; + return 0; +} --- mm/vmscan.c.orig Sat Aug 22 21:35:53 1998 +++ mm/vmscan.c Sat Aug 22 21:47:53 1998 @@ -28,6 +28,12 @@ #include #include +/* + * OOM kill declarations. Move to .h file before submission ;) + */ +extern int out_of_memory(void); +extern void oom_kill(void); + /* * When are we next due for a page scan? */ @@ -467,7 +473,10 @@ case 0: if (shrink_mmap(i, gfp_mask)) return 1; - state = 1; + /* Don't allow a mode change when page cache or buffermem is over max */ + if (((buffermem >> PAGE_SHIFT) * 100 < buffer_mem.max_percent * num_physpages) && + (page_cache_size * 100 < page_cache.max_percent * num_physpages)) + state = 1; case 1: if (shm_swap(i, gfp_mask)) return 1; @@ -546,7 +555,7 @@ init_swap_timer(); add_wait_queue(&kswapd_wait, &wait); while (1) { - int tries; + int tries, tried, success; current->state = TASK_INTERRUPTIBLE; flush_signals(current); @@ -572,18 +581,23 @@ */ tries = pager_daemon.tries_base; tries >>= 4*free_memory_available(); + tried = success = 0; do { - do_try_to_free_page(0); + if (do_try_to_free_page(0)) + success++; + tried++; /* * Syncing large chunks is faster than swapping * synchronously (less head movement). -- Rik. 
*/ if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster) run_task_queue(&tq_disk); - if (free_memory_available() > 1) + if (free_memory_available() > 1 && tried > pager_daemon.tries_min) break; } while (--tries > 0); + if (success * 4 < tried && out_of_memory()) + oom_kill(); } /* As if we could ever get here - maybe we want to make this killable */ remove_wait_queue(&kswapd_wait, &wait); -- This is a majordomo managed list. To unsubscribe, send a message with the body 'unsubscribe linux-mm me@address' to: majordomo@kvack.org