* [RFC][PATCH 1/3] forkbomb: introduce mm recorder
2011-03-23 4:23 [RFC][PATCH 0/3] A forkbomb killer and mm tracking system KAMEZAWA Hiroyuki
@ 2011-03-23 4:26 ` KAMEZAWA Hiroyuki
2011-03-23 4:29 ` [RFC][PATCH 2/3] forkbomb: forgetting useless information KAMEZAWA Hiroyuki
2011-03-23 4:30 ` [RFC][PATCH 3/3] forkbomb: forkbomb killer KAMEZAWA Hiroyuki
2 siblings, 0 replies; 4+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-03-23 4:26 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-kernel, linux-mm, kosaki.motohiro, rientjes, akpm,
Oleg Nesterov, minchan.kim, avagin, kirill
One famous kind of fork bomb which is hard to catch is a fork bomb
whose processes call exit(). For example, the following shell script
causes such a fork bomb:
== (from Wikipedia)
#!/bin/bash
forkbomb(){ forkbomb|forkbomb & } ; forkbomb
==
In this program, by the time OOM happens, most of the root tasks of the
fork bomb are already dead (their children have become orphans), so it is
hard for the kernel to track all of its tasks.
This patch implements a link between mm_structs. The link does not
disappear until all children of a task are dead, even after the task
itself has died and its mm_struct has been freed. (This can leak memory;
a following patch adds some aging.)
The fork-bomb killer in a following patch allows only one scan at a time,
so this patch uses a kind of read-write lock:
- write vs. write is guarded by small spin locks,
- write vs. scanning is guarded by a big lock, with per-cpu support.
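To illustrate the lifetime rule above (a record outlives its task and is
freed only once it is dead and childless, with the free then propagating
toward the root), here is a minimal userspace sketch; the names and the
toy main() are illustrative only and are not the kernel code:
==
/* simplified userspace sketch of the mm_record lifetime rule; no locking */
#include <stdio.h>
#include <stdlib.h>

struct rec {
	struct rec *parent;
	int nr_children;
	int alive;
};

static struct rec *rec_fork(struct rec *parent)
{
	struct rec *r = calloc(1, sizeof(*r));

	r->parent = parent;
	r->alive = 1;
	if (parent)
		parent->nr_children++;
	return r;
}

static void rec_exit(struct rec *r)
{
	r->alive = 0;
	/* free this record, then any dead ancestor that just lost its last child */
	while (r && !r->alive && r->nr_children == 0) {
		struct rec *parent = r->parent;

		if (parent)
			parent->nr_children--;
		free(r);
		r = parent;
	}
}

int main(void)
{
	struct rec *root = rec_fork(NULL);
	struct rec *child = rec_fork(root);
	struct rec *grandchild = rec_fork(child);

	rec_exit(child);	/* record kept: grandchild still links to it */
	printf("dead child still has %d recorded child\n", child->nr_children);
	rec_exit(grandchild);	/* frees grandchild's and then child's record */
	rec_exit(root);
	return 0;
}
==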
This patch is part of the series and includes:
- hooks for fork/exit/exec
- structure definitions
- add/delete code
- scan code
- Kconfig
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
fs/exec.c | 1
include/linux/mm_types.h | 3
include/linux/oom.h | 37 ++++++++
kernel/fork.c | 2
mm/Kconfig | 13 +++
mm/oom_kill.c | 201 +++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 257 insertions(+)
Index: mm-work/include/linux/mm_types.h
===================================================================
--- mm-work.orig/include/linux/mm_types.h
+++ mm-work/include/linux/mm_types.h
@@ -317,6 +317,9 @@ struct mm_struct {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
+#ifdef CONFIG_FORKBOMB_KILLER
+ struct mm_record *record;
+#endif
};
/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
Index: mm-work/include/linux/oom.h
===================================================================
--- mm-work.orig/include/linux/oom.h
+++ mm-work/include/linux/oom.h
@@ -25,6 +25,43 @@
#include <linux/types.h>
#include <linux/nodemask.h>
+/*
+ * For tracking where a task came from. This will be used by
+ * fork-bomb detection. It tracks tasks even after they are dead.
+ * This struct is per-mm, not per-task.
+ */
+
+#ifdef CONFIG_FORKBOMB_KILLER
+struct mm_record {
+ spinlock_t lock;
+ struct mm_struct *mm; /* NULL if process is not alive. */
+ struct mm_record *parent;
+ struct list_head siblings;
+ struct list_head children;
+ /* A memo for fork-bomb detection */
+ unsigned int oom_score;
+ unsigned int oom_family;
+ unsigned long start_time;
+ char need_to_kill;
+};
+extern void record_mm(struct mm_struct *new, struct mm_struct *parent);
+extern void del_mm_record(struct mm_struct *mm);
+extern void mm_record_exec(struct mm_struct *mm, struct mm_struct *old);
+#else
+static inline
+void record_mm(struct mm_struct *new, struct mm_struct *parent)
+{
+}
+static inline void del_mm_record(struct mm_struct *mm)
+{
+}
+static inline void mm_record_exec(struct mm_struct *new, struct mm_struct *old)
+{
+}
+#endif
+
+
+
struct zonelist;
struct notifier_block;
struct mem_cgroup;
Index: mm-work/kernel/fork.c
===================================================================
--- mm-work.orig/kernel/fork.c
+++ mm-work/kernel/fork.c
@@ -566,6 +566,7 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
+ del_mm_record(mm);
if (mm->binfmt)
module_put(mm->binfmt->module);
mmdrop(mm);
@@ -705,6 +706,7 @@ struct mm_struct *dup_mm(struct task_str
if (mm->binfmt && !try_module_get(mm->binfmt->module))
goto free_pt;
+ record_mm(mm, oldmm);
return mm;
Index: mm-work/mm/Kconfig
===================================================================
--- mm-work.orig/mm/Kconfig
+++ mm-work/mm/Kconfig
@@ -340,6 +340,19 @@ choice
benefit.
endchoice
+config FORKBOMB_KILLER
+ bool "Enable fork-bomb-killer, a serial killer in OOM"
+ default n
+ help
+ When a forkbomb happens, it is hard to recover because fork() is
+ much faster than killing, and the OOM killer only kills one child
+ per OOM. This forkbomb killer tries to detect a forkbomb and kill
+ it if one is found. Because this is based on heuristics, it may
+ kill a family of memory-eating tasks which is not a bomb, and it
+ adds some overhead to track the memory usage of a possible bomb.
+ Say 'y' if you are brave.
+
+
#
# UP and nommu archs use km based percpu allocator
#
Index: mm-work/mm/oom_kill.c
===================================================================
--- mm-work.orig/mm/oom_kill.c
+++ mm-work/mm/oom_kill.c
@@ -31,12 +31,213 @@
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
+#include <linux/cpu.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
+#ifdef CONFIG_FORKBOMB_KILLER
+struct mm_record init_rec = {
+ .lock = __SPIN_LOCK_UNLOCKED(init_rec.lock),
+ .siblings = LIST_HEAD_INIT(init_rec.siblings),
+ .children = LIST_HEAD_INIT(init_rec.children),
+};
+
+struct mm_record_info {
+ int scan_lock; /* set to 1 while someone scanning */
+};
+DEFINE_PER_CPU(struct mm_record_info, pcpu_rec_info);
+static DEFINE_MUTEX(oom_rec_scan_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(oom_rec_scan_waitq);
+
+/*
+ * When running a scan, it's better to take a lock that disables
+ * add/remove of entries, rather than using a lockless approach.
+ * We do this with a per-cpu flag + mutex.
+ */
+
+static void mm_rec_lock(void)
+{
+ DEFINE_WAIT(wait);
+retry:
+ rcu_read_lock(); /* Using rcu just for synchronization. */
+ if (this_cpu_read(pcpu_rec_info.scan_lock)) {
+ prepare_to_wait(&oom_rec_scan_waitq,
+ &wait, TASK_UNINTERRUPTIBLE);
+ rcu_read_unlock();
+ if (this_cpu_read(pcpu_rec_info.scan_lock))
+ schedule();
+ finish_wait(&oom_rec_scan_waitq, &wait);
+ goto retry;
+ }
+}
+
+static void mm_rec_unlock(void)
+{
+ rcu_read_unlock();
+}
+
+/* Only one scanner is allowed */
+static void mm_rec_scan_lock(void)
+{
+ int cpu;
+ mutex_lock(&oom_rec_scan_mutex);
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ struct mm_record_info *info = &per_cpu(pcpu_rec_info, cpu);
+ info->scan_lock = 1;
+ }
+ put_online_cpus();
+ synchronize_rcu();
+}
+
+static void mm_rec_scan_unlock(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ struct mm_record_info *info = &per_cpu(pcpu_rec_info, cpu);
+ info->scan_lock = 0;
+ }
+ put_online_cpus();
+ wake_up_all(&oom_rec_scan_waitq);
+ mutex_unlock(&oom_rec_scan_mutex);
+}
+
+void record_mm(struct mm_struct *new, struct mm_struct *parent)
+{
+ struct mm_record *rec, *prec;
+
+ rec = kmalloc(sizeof(*rec), GFP_KERNEL);
+ if (!rec) {
+ new->record = NULL;
+ return;
+ }
+ spin_lock_init(&rec->lock);
+ INIT_LIST_HEAD(&rec->children);
+ rec->mm = new;
+ /* task can be freed before mm...then we just record pid. */
+ mm_rec_lock();
+ rec->start_time = jiffies;
+ if (parent)
+ prec = parent->record;
+ else
+ prec = NULL;
+ if (!prec)
+ prec = &init_rec;
+ new->record = rec;
+ rec->parent = prec; /* never cleared */
+
+ spin_lock(&prec->lock);
+ list_add_tail(&rec->siblings, &prec->children);
+ spin_unlock(&prec->lock);
+ mm_rec_unlock();
+ return;
+}
+
+void del_mm_record(struct mm_struct *mm)
+{
+ struct mm_record *rec = mm->record;
+ bool nochild = false;
+
+ if (!rec) /* happens after exec() */
+ return;
+ mm_rec_lock();
+ spin_lock(&rec->lock);
+ rec->mm = NULL;
+ if (list_empty(&rec->children))
+ nochild = true;
+ mm->record = NULL;
+ spin_unlock(&rec->lock);
+ while (nochild && rec != &init_rec) {
+ struct mm_record *prec;
+
+ nochild = false;
+ prec = rec->parent;
+ spin_lock(&prec->lock);
+ list_del(&rec->siblings);
+ if (prec->mm == NULL && list_empty(&prec->children))
+ nochild = true;
+ spin_unlock(&prec->lock);
+ kfree(rec);
+ rec = prec;
+ }
+ mm_rec_unlock();
+}
+
+void mm_record_exec(struct mm_struct *new, struct mm_struct *old)
+{
+ /*
+ * This means there is a redundant link at exec because
+ * "old" will be dropped after this.
+ * But this is required to handle vfork().
+ */
+ record_mm(new, old);
+}
+
+/* Because we hold the global scan lock, no locking is needed while scanning. */
+static struct mm_record* __first_child(struct mm_record *p)
+{
+ if (list_empty(&p->children))
+ return NULL;
+ return list_first_entry(&p->children, struct mm_record, siblings);
+}
+
+static struct mm_record* __next_sibling(struct mm_record *p)
+{
+ if (p->siblings.next == &p->parent->children)
+ return NULL;
+ return list_first_entry(&p->siblings, struct mm_record, siblings);
+}
+
+static struct mm_record *first_deepest_child(struct mm_record *p)
+{
+ struct mm_record *tmp;
+
+ do {
+ tmp = __first_child(p);
+ if (!tmp)
+ return p;
+ p = tmp;
+ } while (1);
+}
+
+static struct mm_record *mm_record_scan_start(struct mm_record *rec)
+{
+ return first_deepest_child(rec);
+}
+
+static struct mm_record *mm_record_scan_next(struct mm_record *pos)
+{
+ struct mm_record *tmp;
+
+ tmp = __next_sibling(pos);
+ if (!tmp)
+ return pos->parent;
+ pos = tmp;
+ pos = first_deepest_child(pos);
+ return pos;
+}
+
+/*
+ * Scan leaf children first, then visit their parents and ancestors.
+ * rcu_read_lock() must be held.
+ */
+#define for_each_mm_record(pos)\
+ for (pos = mm_record_scan_start(&init_rec);\
+ pos != &init_rec;\
+ pos = mm_record_scan_next(pos))
+
+#define for_each_mm_record_under(pos, root)\
+ for (pos = mm_record_scan_start(root);\
+ pos != root;\
+ pos = mm_record_scan_next(pos))
+
+#endif
+
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
Index: mm-work/fs/exec.c
===================================================================
--- mm-work.orig/fs/exec.c
+++ mm-work/fs/exec.c
@@ -801,6 +801,7 @@ static int exec_mmap(struct mm_struct *m
atomic_inc(&tsk->mm->oom_disable_count);
}
task_unlock(tsk);
+ mm_record_exec(mm, old_mm);
arch_pick_mmap_layout(mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
--
* [RFC][PATCH 2/3] forkbomb: forgetting useless information
2011-03-23 4:23 [RFC][PATCH 0/3] A forkbomb killer and mm tracking system KAMEZAWA Hiroyuki
2011-03-23 4:26 ` [RFC][PATCH 1/3] forkbomb: introduce mm recorder KAMEZAWA Hiroyuki
@ 2011-03-23 4:29 ` KAMEZAWA Hiroyuki
2011-03-23 4:30 ` [RFC][PATCH 3/3] forkbomb: forkbomb killer KAMEZAWA Hiroyuki
2 siblings, 0 replies; 4+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-03-23 4:29 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-kernel, linux-mm, kosaki.motohiro, rientjes, akpm,
Oleg Nesterov, minchan.kim, avagin, kirill
This patch adds control knobs for mm_record, the forkbomb tracking.
/sys/kernel/mm/oom/mm_tracker_enable
controls enabling/disabling tracking of mm_structs.
/sys/kernel/mm/oom/mm_tracker_reset_interval_msecs
controls aging of mm_records.
Old enough mm_records are freed when, for 30 secs (default),
- nr_processes() doesn't increase,
- kswapd doesn't run, and
- try_to_free_pages() doesn't run.
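As an illustration of the interface, here is a minimal userspace sketch
(a hypothetical helper, not part of this patch) that simply reads the two
knobs; it assumes CONFIG_FORKBOMB_KILLER=y and the sysfs group registered
by this patch:
==
/* read the two oom/mm-tracker knobs added by this patch */
#include <stdio.h>

static void show(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s\n", path, buf);
	fclose(f);
}

int main(void)
{
	show("/sys/kernel/mm/oom/mm_tracker_enable");
	show("/sys/kernel/mm/oom/mm_tracker_reset_interval_msecs");
	return 0;
}
==
The knobs can also be poked from a shell, e.g. by writing "enable",
"disable", or a millisecond value into them.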
Note: a Makefile change is required because the initcall for mm_kobj
must run before the initcall for oom.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
Documentation/vm/forkbomb.txt | 53 ++++++++++++++
mm/Makefile | 4 -
mm/oom_kill.c | 150 ++++++++++++++++++++++++++++++++++++++++--
3 files changed, 201 insertions(+), 6 deletions(-)
Index: mm-work/mm/oom_kill.c
===================================================================
--- mm-work.orig/mm/oom_kill.c
+++ mm-work/mm/oom_kill.c
@@ -32,6 +32,7 @@
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/cpu.h>
+#include <linux/sysfs.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
@@ -51,6 +52,7 @@ struct mm_record_info {
DEFINE_PER_CPU(struct mm_record_info, pcpu_rec_info);
static DEFINE_MUTEX(oom_rec_scan_mutex);
static DECLARE_WAIT_QUEUE_HEAD(oom_rec_scan_waitq);
+int mm_tracking_enabled = 1;
/*
* When running scan, it's better to have lock to disable
@@ -111,6 +113,10 @@ void record_mm(struct mm_struct *new, st
{
struct mm_record *rec, *prec;
+ if (!mm_tracking_enabled) {
+ new->record = NULL;
+ return;
+ }
rec = kmalloc(sizeof(*rec), GFP_KERNEL);
if (!rec) {
new->record = NULL;
@@ -138,20 +144,23 @@ void record_mm(struct mm_struct *new, st
return;
}
-void del_mm_record(struct mm_struct *mm)
+static void __del_mm_record(struct mm_record *rec, bool scan)
{
- struct mm_record *rec = mm->record;
bool nochild = false;
+ struct mm_struct *mm;
if (!rec) /* happens after exec() */
return;
- mm_rec_lock();
+
spin_lock(&rec->lock);
+ mm = rec->mm;
rec->mm = NULL;
if (list_empty(&rec->children))
nochild = true;
- mm->record = NULL;
+ if (mm)
+ mm->record = NULL;
spin_unlock(&rec->lock);
+
while (nochild && rec != &init_rec) {
struct mm_record *prec;
@@ -164,12 +173,22 @@ void del_mm_record(struct mm_struct *mm)
spin_unlock(&prec->lock);
kfree(rec);
rec = prec;
+ if (scan)
+ break;
}
+}
+
+void del_mm_record(struct mm_struct *mm)
+{
+ mm_rec_lock();
+ __del_mm_record(mm->record, false);
mm_rec_unlock();
}
void mm_record_exec(struct mm_struct *new, struct mm_struct *old)
{
+ if (!mm_tracking_enabled)
+ return;
/*
* This means there is a redundant link at exec because
* "old" will be droppped after this.
@@ -236,6 +255,12 @@ static struct mm_record *mm_record_scan_
pos != root;\
pos = mm_record_scan_next(pos))
+#define for_each_mm_record_safe(pos, tmp)\
+ for (pos = mm_record_scan_start(&init_rec),\
+ tmp = mm_record_scan_next(pos);\
+ pos != &init_rec;\
+ pos = tmp, tmp = mm_record_scan_next(tmp))
+
#endif
#ifdef CONFIG_NUMA
@@ -962,3 +987,120 @@ void pagefault_out_of_memory(void)
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
+
+#ifdef CONFIG_FORKBOMB_KILLER
+
+static unsigned long reset_interval_jiffies = 30*HZ;
+unsigned long last_nr_procs;
+unsigned long last_pageout_run;
+unsigned long last_allocstall;
+static void reset_mm_tracking(struct work_struct *w);
+DECLARE_DELAYED_WORK(reset_mm_tracking_work, reset_mm_tracking);
+
+static void reset_mm_tracking(struct work_struct *w)
+{
+ struct mm_record *pos, *tmp;
+ unsigned long nr_procs;
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ bool forget = true;
+
+ nr_procs = nr_processes();
+ if (nr_procs > last_nr_procs)
+ forget = false;
+ last_nr_procs = nr_procs;
+
+ all_vm_events(events);
+ if (last_pageout_run != events[PAGEOUTRUN])
+ forget = false;
+ last_pageout_run = events[PAGEOUTRUN];
+ if (last_allocstall != events[ALLOCSTALL])
+ forget = false;
+ last_allocstall = events[ALLOCSTALL];
+
+ if (forget) {
+ unsigned long thresh = jiffies - reset_interval_jiffies;
+ mm_rec_scan_lock();
+ for_each_mm_record_safe(pos, tmp) {
+ if (time_before(pos->start_time, thresh))
+ __del_mm_record(pos, true);
+ }
+ mm_rec_scan_unlock();
+ }
+ schedule_delayed_work(&reset_mm_tracking_work, reset_interval_jiffies);
+ return;
+}
+
+
+
+#define OOM_ATTR(_name)\
+ static struct kobj_attribute _name##_attr =\
+ __ATTR(_name, 0644, _name##_show, _name##_store)
+
+
+static ssize_t mm_tracker_reset_interval_msecs_show(struct kobject *obj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u", jiffies_to_msecs(reset_interval_jiffies));
+}
+
+static ssize_t mm_tracker_reset_interval_msecs_store(struct kobject *obj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ reset_interval_jiffies = msecs_to_jiffies(msecs);
+ return count;
+}
+OOM_ATTR(mm_tracker_reset_interval_msecs);
+
+static ssize_t mm_tracker_enable_show(struct kobject *obj,
+ struct kobj_attribute *attr, char *buf)
+{
+ if (mm_tracking_enabled)
+ return sprintf(buf, "enabled");
+ return sprintf(buf, "disabled");
+}
+
+static ssize_t mm_tracker_enable_store(struct kobject *obj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ if (!memcmp("disable", buf, min(sizeof("disable")-1, count)))
+ mm_tracking_enabled = 0;
+ else if (!memcmp("enable", buf, min(sizeof("enable")-1, count)))
+ mm_tracking_enabled = 1;
+ else
+ return -EINVAL;
+ return count;
+}
+OOM_ATTR(mm_tracker_enable);
+
+static struct attribute *oom_attrs[] = {
+ &mm_tracker_reset_interval_msecs_attr.attr,
+ &mm_tracker_enable_attr.attr,
+ NULL,
+};
+
+static struct attribute_group oom_attr_group = {
+ .attrs = oom_attrs,
+ .name = "oom",
+};
+
+static int __init init_mm_record(void)
+{
+ int err = 0;
+
+#ifdef CONFIG_SYSFS
+ err = sysfs_create_group(mm_kobj, &oom_attr_group);
+ if (err)
+ printk("failed to register mm history tracking for oom \n");
+#endif
+ schedule_delayed_work(&reset_mm_tracking_work, reset_interval_jiffies);
+ return 0;
+}
+module_init(init_mm_record);
+#endif
Index: mm-work/mm/Makefile
===================================================================
--- mm-work.orig/mm/Makefile
+++ mm-work/mm/Makefile
@@ -7,11 +7,11 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o
-obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y := mm_init.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o mm_init.o mmu_context.o percpu.o \
+ page_isolation.o mmu_context.o percpu.o \
$(mmu-y)
obj-y += init-mm.o
Index: mm-work/Documentation/vm/forkbomb.txt
===================================================================
--- /dev/null
+++ mm-work/Documentation/vm/forkbomb.txt
@@ -0,0 +1,53 @@
+mm_record and FORKBOMB_KILLER.
+
+1. Introduction
+
+There are several types of forkbomb. When a forkbomb happens, it is
+sometimes hard to kill all of its threads by hand (with pkill or similar).
+In bad cases, we cannot catch the whole process-tree image of the forkbomb
+because parent tasks may exit before their children. So, having the kernel
+kill forkbombs is helpful.
+
+(example)
+# forkbomb(){ forkbomb|forkbomb & } ; forkbomb
+
+The kernel provides FORKBOMB_KILLER, which uses its own task tracking
+system that can chase the tree of dead processes.
+
+2. mm_record
+
+mm_record is a tag to track the process tree. It is allocated when a new
+mm_struct comes into use. The mm_records form a tree which is similar to
+the process tree.
+
+The system workqueue removes old enough mm_records when
+ - nr_processes() doesn't seem to be changing,
+ - no allocation stall happens, and
+ - kswapd doesn't run.
+
+So, the number of mm_records will be very small in an idle system. Once
+someone starts some work, the process tree of all new tasks will be
+tracked by mm_records.
+
+3. forkbomb detection
+
+The forkbomb killer kills the processes under the mm_record subtree which
+has the worst oom_kill badness score. If the number of threads is very
+small, it is a job for the OOM killer rather than the forkbomb killer, and
+the forkbomb killer does nothing.
+
+4. controls
+
+/sys/kernel/mm/oom/mm_tracker_enable
+ If enabled, the forkbomb killer and mm tracking are enabled.
+ Default is enabled.
+
+/sys/kernel/mm/oom/mm_tracker_reset_interval_msecs
+ This is an interface to control aging of mm_records. At each interval
+ specified by this value, the system status is checked and the system
+ forgets old enough mm_records. Default is 30000 (30 sec).
+
+
+
+
+
--
* [RFC][PATCH 3/3] forkbomb: forkbomb killer
2011-03-23 4:23 [RFC][PATCH 0/3] A forkbomb killer and mm tracking system KAMEZAWA Hiroyuki
2011-03-23 4:26 ` [RFC][PATCH 1/3] forkbomb: introduce mm recorder KAMEZAWA Hiroyuki
2011-03-23 4:29 ` [RFC][PATCH 2/3] forkbomb: forgetting useless information KAMEZAWA Hiroyuki
@ 2011-03-23 4:30 ` KAMEZAWA Hiroyuki
2 siblings, 0 replies; 4+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-03-23 4:30 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: linux-kernel, linux-mm, kosaki.motohiro, rientjes, akpm,
Oleg Nesterov, minchan.kim, avagin, kirill
A forkbomb killer.
This routine walks all mm_records in child -> parent direction and
calculates a badness score for each mm_record subtree. It then selects
the worst mm_record subtree and sends SIGKILL to all processes in it.
The mm_records of tasks are subject to the aging system, so this will not
kill tasks that have been living long enough (in a stable system).
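To illustrate the scoring pass, here is a toy userspace simulation with
made-up numbers (not the kernel code): records are visited leaves-first,
each record's accumulated score is folded into its parent, and the record
with the highest total is taken as the suspected bomb root.
==
/* toy simulation of the leaf-first badness propagation */
#include <stdio.h>

#define NR 7

/* node 0 plays the role of init_rec; children always have larger indices */
static const int parent[NR] = { -1, 0, 1, 1, 2, 2, 3 };
static const int alive[NR]  = {  0, 1, 0, 1, 1, 1, 1 };	/* node 2 already exited */
static const int own[NR]    = {  0, 5, 0, 4, 3, 3, 2 };	/* per-task badness */

int main(void)
{
	int score[NR] = { 0 }, family[NR] = { 0 };
	int i, bomb = -1;

	for (i = 0; i < NR; i++)
		family[i] = alive[i];

	/* leaves first: walk from the largest index down to the root's children */
	for (i = NR - 1; i >= 1; i--) {
		int p = parent[i];

		if (alive[i])
			score[i] += own[i];
		/* +1 marks a live child as an extra burden on its parent */
		score[p] += score[i] + (alive[i] ? 1 : 0);
		family[p] += family[i];
	}
	for (i = 1; i < NR; i++)
		if (bomb < 0 || score[i] > score[bomb])
			bomb = i;

	printf("suspected bomb root: node %d (score %d, family %d)\n",
	       bomb, score[bomb], family[bomb]);
	return 0;
}
==
The real killer additionally declines to act when the winning subtree has
oom_family < 10, leaving such small cases to the regular OOM killer.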
Tested with
# forkbomb(){ forkbomb|forkbomb & } ; forkbomb
# make -j kernel
and other bombs.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
mm/oom_kill.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 122 insertions(+)
Index: mm-work/mm/oom_kill.c
===================================================================
--- mm-work.orig/mm/oom_kill.c
+++ mm-work/mm/oom_kill.c
@@ -570,6 +570,121 @@ static struct task_struct *select_bad_pr
return chosen;
}
+#ifdef CONFIG_FORKBOMB_KILLER
+atomic_t forkbomb_killing;
+bool nobomb = false;
+
+void clear_forkbomb_killing(struct work_struct *w)
+{
+ atomic_set(&forkbomb_killing, 0);
+ nobomb = false;
+}
+DECLARE_DELAYED_WORK(fork_bomb_work, clear_forkbomb_killing);
+
+void reset_forkbomb_killing(void)
+{
+ schedule_delayed_work(&fork_bomb_work, 10*HZ);
+}
+
+static void get_badness_score(struct mm_record *pos, struct mem_cgroup *mem,
+ const nodemask_t *nodemask, unsigned long totalpages)
+{
+ struct task_struct *task;
+
+ if (!pos->mm)
+ return;
+ /* task_struct is freed by RCU and we're under rcu_read_lock() */
+ task = pos->mm->owner;
+ if (task && !oom_unkillable_task(task, mem, nodemask))
+ pos->oom_score += oom_badness(task, mem, nodemask, totalpages);
+}
+
+static void propagate_oom_info(struct mm_record *pos)
+{
+ struct mm_record *ppos;
+
+ ppos = pos->parent;
+ if (ppos == &init_rec) /* deadlink by timeout */
+ return;
+ /* +1 means that the child is a burden of the parent */
+ if (pos->mm) {
+ ppos->oom_score += pos->oom_score + 1;
+ ppos->oom_family += pos->oom_family;
+ } else {
+ ppos->oom_score += pos->oom_score;
+ ppos->oom_family += pos->oom_family;
+ }
+}
+
+static bool fork_bomb_killer(unsigned long totalpages, struct mem_cgroup *mem,
+ const nodemask_t *nodemask)
+{
+ struct mm_record *pos, *bomb;
+ unsigned int max_score;
+ struct task_struct *p;
+
+ if (nobomb)
+ return false;
+
+ if (atomic_inc_return(&forkbomb_killing) != 1)
+ return true;
+ /* reset information */
+ mm_rec_scan_lock();
+ nobomb = false;
+ pr_err("forkbomb detection running....\n");
+ for_each_mm_record(pos) {
+ pos->oom_score = 0;
+ if (pos->mm)
+ pos->oom_family = 1;
+ pos->need_to_kill = 0;
+ }
+ max_score = 0;
+ bomb = NULL;
+ for_each_mm_record(pos) {
+ get_badness_score(pos, mem, nodemask, totalpages);
+ propagate_oom_info(pos);
+ if (pos->oom_score > max_score) {
+ bomb = pos;
+ max_score = pos->oom_score;
+ }
+ }
+ if (!bomb || bomb->oom_family < 10) {
+ mm_rec_scan_unlock();
+ nobomb = true;
+ reset_forkbomb_killing();
+ pr_err("no forkbomb found \n");
+ return false;
+ }
+
+ pr_err("Possible forkbomb. Killing _all_ doubtful tasks\n");
+ for_each_mm_record_under(pos, bomb) {
+ pos->need_to_kill = 1;
+ }
+ read_lock(&tasklist_lock);
+ for_each_process(p) {
+ if (!p->mm || oom_unkillable_task(p, mem, nodemask))
+ continue;
+ if (p->signal->oom_score_adj == -1000)
+ continue;
+ if (p->mm->record && p->mm->record->need_to_kill) {
+ pr_err("kill %d(%s)->%d\n", task_pid_nr(p),
+ p->comm, p->mm->record->oom_score);
+ force_sig(SIGKILL, p);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ mm_rec_scan_unlock();
+ reset_forkbomb_killing();
+ return true;
+}
+#else
+static bool fork_bomb_killer(unsigned long totalpages, struct mem_cgroup *mem,
+ nodemask_t *nodemask)
+{
+ return false;
+}
+#endif
+
/**
* dump_tasks - dump current memory state of all system tasks
* @mem: current's memory controller, if constrained
@@ -767,6 +882,9 @@ void mem_cgroup_out_of_memory(struct mem
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
+
+ if (fork_bomb_killer(limit, mem, NULL))
+ goto out;
read_lock(&tasklist_lock);
retry:
p = select_bad_process(&points, limit, mem, NULL);
@@ -930,6 +1048,10 @@ void out_of_memory(struct zonelist *zone
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
+ if (!sysctl_oom_kill_allocating_task) {
+ if (fork_bomb_killer(totalpages, NULL, mpol_mask))
+ return;
+ }
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
--