* [PATCH] mem notifications v3
@ 2007-12-24 20:32 Marcelo Tosatti
2007-12-25 3:47 ` KOSAKI Motohiro
0 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2007-12-24 20:32 UTC (permalink / raw)
To: linux-mm; +Cc: KOSAKI Motohiro, Daniel Spång, Rik van Riel, Andrew Morton
Follows updated version of mem-notify.
This changes the notification point to happen whenever the VM moves an
anonymous page to the inactive list - this is a pretty good indication
that there are unused anonymous pages present which will be very likely
swapped out soon.
Since the notification happens at shrink_zone() which can be called very
often the wakeups are rate limited to 5 times per second (on each CPU).
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/Documentation/devices.txt
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/Documentation/devices.txt
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/Documentation/devices.txt
@@ -96,6 +96,7 @@ Your cooperation is appreciated.
11 = /dev/kmsg Writes to this come out as printk's
12 = /dev/oldmem Used by crashdump kernels to access
the memory of the kernel that crashed.
+ 13 = /dev/mem_notify Low memory notification.
1 block RAM disk
0 = /dev/ram0 First RAM disk
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/drivers/char/mem.c
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/drivers/char/mem.c
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/drivers/char/mem.c
@@ -34,6 +34,8 @@
# include <linux/efi.h>
#endif
+extern struct file_operations mem_notify_fops;
+
/*
* Architectures vary in how they handle caching for addresses
* outside of main memory.
@@ -854,6 +856,9 @@ static int memory_open(struct inode * in
filp->f_op = &oldmem_fops;
break;
#endif
+ case 13:
+ filp->f_op = &mem_notify_fops;
+ break;
default:
return -ENXIO;
}
@@ -886,6 +891,7 @@ static const struct {
#ifdef CONFIG_CRASH_DUMP
{12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
#endif
+ {13,"mem_notify", S_IRUGO, &mem_notify_fops},
};
static struct class *mem_class;
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/include/linux/swap.h
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/include/linux/swap.h
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/include/linux/swap.h
@@ -213,6 +213,9 @@ extern int shmem_unuse(swp_entry_t entry
extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
+/* linux/mm/mem_notify.c */
+void mem_notify_userspace(void);
+
#ifdef CONFIG_SWAP
/* linux/mm/page_io.c */
extern int swap_readpage(struct file *, struct page *);
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/Makefile
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/mm/Makefile
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o
page_alloc.o page-writeback.o pdflush.o \
readahead.o swap.o truncate.o vmscan.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o $(mmu-y)
+ page_isolation.o mem_notify.o $(mmu-y)
obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
obj-$(CONFIG_BOUNCE) += bounce.o
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/mem_notify.c
===================================================================
--- /dev/null
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/mem_notify.c
@@ -0,0 +1,80 @@
+/*
+ * Notify applications of memory pressure via /dev/mem_notify
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+
+static unsigned long mem_notify_status = 0;
+
+static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
+static DEFINE_PER_CPU(unsigned long, last_mem_notify) = INITIAL_JIFFIES;
+
+/* maximum 5 notifications per second per cpu */
+void mem_notify_userspace(void)
+{
+ unsigned long target;
+ unsigned long now = jiffies;
+
+ target = __get_cpu_var(last_mem_notify) + (HZ/5);
+
+ if (time_after(now, target)) {
+ __get_cpu_var(last_mem_notify) = now;
+ mem_notify_status = 1;
+ wake_up(&mem_wait);
+ }
+}
+
+static int mem_notify_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
+static int mem_notify_release(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
+static unsigned int mem_notify_poll(struct file *file, poll_table *wait)
+{
+ unsigned int val = 0;
+
+ poll_wait(file, &mem_wait, wait);
+
+ if (mem_notify_status) {
+ struct zone *zone;
+ int pages_high, pages_free, pages_reserve;
+
+ mem_notify_status = 0;
+
+ /* check if its not a spurious/stale notification */
+ pages_high = pages_free = pages_reserve = 0;
+ for_each_zone(zone) {
+ if (!populated_zone(zone) || is_highmem(zone))
+ continue;
+ pages_high += zone->pages_high;
+ pages_free += zone_page_state(zone, NR_FREE_PAGES);
+ pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
+ }
+
+ if (pages_free < (pages_high+pages_reserve)*2)
+ val = POLLIN;
+ }
+
+ return val;
+}
+
+struct file_operations mem_notify_fops = {
+ .open = mem_notify_open,
+ .release = mem_notify_release,
+ .poll = mem_notify_poll,
+};
+EXPORT_SYMBOL(mem_notify_fops);
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/vmscan.c
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/mm/vmscan.c
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/vmscan.c
@@ -960,7 +960,7 @@ static inline int zone_is_near_oom(struc
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+static bool shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct scan_control *sc, int priority)
{
unsigned long pgmoved;
@@ -972,6 +972,7 @@ static void shrink_active_list(unsigned
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
+ bool inactivated_anon = 0;
if (sc->may_swap) {
long mapped_ratio;
@@ -1078,6 +1079,13 @@ force_reclaim_mapped:
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
page_referenced(page, 0, sc->mem_cgroup)) {
+ /* deal with the case where there is no
+ * swap but an anonymous page would be
+ * moved to the inactive list.
+ */
+ if (!total_swap_pages && reclaim_mapped &&
+ PageAnon(page))
+ inactivated_anon = 1;
list_add(&page->lru, &l_active);
continue;
}
@@ -1085,6 +1093,8 @@ force_reclaim_mapped:
list_add(&page->lru, &l_active);
continue;
}
+ if (PageAnon(page))
+ inactivated_anon = 1;
list_add(&page->lru, &l_inactive);
}
@@ -1146,6 +1156,7 @@ force_reclaim_mapped:
spin_unlock_irq(&zone->lru_lock);
pagevec_release(&pvec);
+ return inactivated_anon;
}
/*
@@ -1158,6 +1169,7 @@ static unsigned long shrink_zone(int pri
unsigned long nr_inactive;
unsigned long nr_to_scan;
unsigned long nr_reclaimed = 0;
+ bool inactivated_anon = 0;
/*
* Add one to `nr_to_scan' just to make sure that the kernel will
@@ -1184,7 +1196,8 @@ static unsigned long shrink_zone(int pri
nr_to_scan = min(nr_active,
(unsigned long)sc->swap_cluster_max);
nr_active -= nr_to_scan;
- shrink_active_list(nr_to_scan, zone, sc, priority);
+ if (shrink_active_list(nr_to_scan, zone, sc, priority))
+ inactivated_anon = 1;
}
if (nr_inactive) {
@@ -1196,6 +1209,9 @@ static unsigned long shrink_zone(int pri
}
}
+ if (inactivated_anon)
+ mem_notify_userspace();
+
throttle_vm_writeout(sc->gfp_mask);
return nr_reclaimed;
}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] mem notifications v3
2007-12-24 20:32 [PATCH] mem notifications v3 Marcelo Tosatti
@ 2007-12-25 3:47 ` KOSAKI Motohiro
2007-12-25 4:56 ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 3:47 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
Hi
> +/* maximum 5 notifications per second per cpu */
> +void mem_notify_userspace(void)
> +{
> + unsigned long target;
> + unsigned long now = jiffies;
> +
> + target = __get_cpu_var(last_mem_notify) + (HZ/5);
> +
> + if (time_after(now, target)) {
> + __get_cpu_var(last_mem_notify) = now;
> + mem_notify_status = 1;
> + wake_up(&mem_wait);
> + }
> +}
Hmm,
unfortunately, wake_up() wakes up all processes,
because
1. the poll method uses poll_wait().
2. poll_wait() uses add_wait_queue(), not add_wait_queue_exclusive().
3. wake_up() wakes up one exclusive task *and* every item queued by add_wait_queue().
Conclusion:
this code wakes up all processes up to 5 * #cpus times per second under high memory pressure.
It is too much.
BTW: I propose adding poll_wait_exclusive() to the kernel ;-p
> + /* check if its not a spurious/stale notification */
> + pages_high = pages_free = pages_reserve = 0;
> + for_each_zone(zone) {
> + if (!populated_zone(zone) || is_highmem(zone))
> + continue;
I think ignoring highmem is a very good improvement over the previous version :-D
> + pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
Hmm...
this may not work well.
MAX_NR_ZONES is determined at compile time by the distribution vendor's config,
but the real highest zone is determined by the machine's total memory.
e.g.
CONFIG_HIGHMEM is enabled but total memory < 4GB.
CONFIG_DMA32 is enabled but total memory < 4GB.
> + if (pages_free < (pages_high+pages_reserve)*2)
> + val = POLLIN;
Why did you choose the formula (pages_high+pages_reserve)*2 ?
> -static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> +static bool shrink_active_list(unsigned long nr_pages, struct zone *zone,
> struct scan_control *sc, int priority)
This type change is unnecessary.
Calling mem_notify_userspace() directly in shrink_active_list() works well too,
because notification rate control can be implemented by mem_notify_userspace() and mem_notify_poll().
last_mem_notify works better.
/kosaki
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [RFC] add poll_wait_exclusive() API
2007-12-25 3:47 ` KOSAKI Motohiro
@ 2007-12-25 4:56 ` KOSAKI Motohiro
2007-12-27 21:05 ` Marcelo Tosatti
2007-12-25 8:31 ` [PATCH] mem notifications v3 KOSAKI Motohiro
2007-12-27 20:13 ` [PATCH] mem notifications v3 Marcelo Tosatti
2 siblings, 1 reply; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 4:56 UTC (permalink / raw)
To: linux-mm
Cc: kosaki.motohiro, Marcelo Tosatti, Daniel Spång,
Rik van Riel, Andrew Morton
Hi
There are two ways to add an item to a wait queue: add_wait_queue() and add_wait_queue_exclusive().
Unfortunately, a poll method can only use poll_wait(), which queues non-exclusively.
poll_wait_exclusive() works similarly to add_wait_queue_exclusive().
Caution:
this patch is compile-tested only.
It is intended for discussion only.
/kosaki
Index: b/fs/eventpoll.c
===================================================================
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -677,7 +677,7 @@ out_unlock:
* target file wakeup lists.
*/
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
- poll_table *pt)
+ poll_table *pt, int exclusive)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
@@ -686,7 +686,10 @@ static void ep_ptable_queue_proc(struct
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
- add_wait_queue(whead, &pwq->wait);
+ if (exclusive)
+ add_wait_queue_exclusive(whead, &pwq->wait);
+ else
+ add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
Index: b/fs/select.c
===================================================================
--- a/fs/select.c
+++ b/fs/select.c
@@ -48,7 +48,7 @@ struct poll_table_page {
* poll table.
*/
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p);
+ poll_table *p, int exclusive);
void poll_initwait(struct poll_wqueues *pwq)
{
@@ -117,7 +117,7 @@ static struct poll_table_entry *poll_get
/* Add a new entry */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
- poll_table *p)
+ poll_table *p, int exclusive)
{
struct poll_table_entry *entry = poll_get_entry(p);
if (!entry)
@@ -126,7 +126,10 @@ static void __pollwait(struct file *filp
entry->filp = filp;
entry->wait_address = wait_address;
init_waitqueue_entry(&entry->wait, current);
- add_wait_queue(wait_address, &entry->wait);
+ if (exclusive)
+ add_wait_queue_exclusive(wait_address, &entry->wait);
+ else
+ add_wait_queue(wait_address, &entry->wait);
}
#define FDS_IN(fds, n) (fds->in + n)
Index: b/include/linux/poll.h
===================================================================
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -28,18 +28,26 @@ struct poll_table_struct;
/*
* structures and helpers for f_op->poll implementations
*/
-typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
+typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
+ struct poll_table_struct *, int exclusive);
typedef struct poll_table_struct {
poll_queue_proc qproc;
} poll_table;
-static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
+static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
if (p && wait_address)
- p->qproc(filp, wait_address, p);
+ p->qproc(filp, wait_address, p, 0);
}
+static inline void poll_wait_exclusive(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+ if (p && wait_address)
+ p->qproc(filp, wait_address, p, 1);
+}
+
+
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
pt->qproc = qproc;
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] mem notifications v3
2007-12-25 3:47 ` KOSAKI Motohiro
2007-12-25 4:56 ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
@ 2007-12-25 8:31 ` KOSAKI Motohiro
2007-12-25 10:31 ` [RFC][patch 1/2] mem notifications v3 improvement for large system KOSAKI Motohiro
2007-12-25 10:31 ` [RFC][patch 2/2] " KOSAKI Motohiro
2007-12-27 20:13 ` [PATCH] mem notifications v3 Marcelo Tosatti
2 siblings, 2 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 8:31 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
[-- Attachment #1: Type: text/plain, Size: 16799 bytes --]
Hi Marcelo-san
I tested your patch,
but unfortunately it doesn't work so well on a large system.
description:
--------------------------------------------------------------
test machine
CPU: Itanium2 x 4
MEM: 16GB(8GB node x 2 NUMA system)
SWAP: 2GB
test program:
mem_notify_test.c
see attachment
m.sh
--------------
$ cat m.sh
#!/bin/sh
num=${1:-1}
mem=${2:-1}
echo $num $mem
for i in `seq 1 $num`; do
./mem_notify_test -m $mem &
done
--------------------------------------
1. run >10000 process test
console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
console2# sh m.sh 12500
Wed Dec 26 02:00:14 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
3 0 0 1561 7 12213 0 0 35 268 12 203 1 3 95 1 0
Wed Dec 26 02:00:15 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 0 0 602 7 13025 0 0 35 268 12 203 1 3 95 1 0
!! here 7 sec soft lockup !!
Wed Dec 26 02:00:22 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
8 1 12 42 68 13427 0 0 35 268 82 206 1 3 95 1 0
Wed Dec 26 02:00:23 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
69 0 17 38 64 13438 0 0 35 268 93 207 1 3 95 1 0
Wed Dec 26 02:00:24 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
4955 3 21 36 61 13443 0 0 35 268 103 208 1 3 95 1 0
Wed Dec 26 02:00:25 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
4138 2 28 42 57 13438 0 0 35 268 113 209 1 3 95 1 0
Wed Dec 26 02:00:26 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
5466 1 41 48 514 12975 0 0 35 269 119 211 1 3 95 1 0
Wed Dec 26 02:00:27 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
6082 0 78 38 896 12596 0 0 35 270 124 214 1 3 95 1 0
Wed Dec 26 02:00:28 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 5 132 38 1209 12280 0 0 35 271 128 217 1 3 95 1 0
Wed Dec 26 02:00:29 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 0 169 38 1484 12003 0 0 35 272 132 219 1 3 95 1 0
Wed Dec 26 02:00:30 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 31 248 36 2651 10822 0 0 35 274 133 222 1 3 95 1 0
Wed Dec 26 02:00:32 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1729 3 323 42 3138 10325 0 0 35 277 134 227 1 3 95 1 0
Wed Dec 26 02:00:33 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 2 410 50 3473 9968 0 0 35 279 134 230 1 3 95 1 0
Wed Dec 26 02:00:34 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
574 4 487 52 3737 9690 0 0 35 281 135 234 1 3 95 1 0
Wed Dec 26 02:00:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
5203 0 490 42 3794 9641 0 0 35 281 135 236 1 3 95 1 0
Wed Dec 26 02:00:37 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2256 2 568 36 4025 9400 0 0 35 283 136 240 1 3 95 1 0
Wed Dec 26 02:00:38 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 3 631 55 4233 9167 0 0 35 285 136 243 1 3 95 1 0
Wed Dec 26 02:00:41 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
11 2 819 44 4675 8707 0 0 35 290 138 251 1 3 95 1 0
Wed Dec 26 02:00:42 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 4 876 48 4782 8588 0 0 35 292 138 254 1 3 95 1 0
Wed Dec 26 02:00:43 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 6 945 36 4923 8450 0 0 35 294 139 257 1 3 95 1 0
Wed Dec 26 02:00:44 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
859 6 1001 50 5030 8317 0 0 35 296 139 260 1 3 95 1 0
Wed Dec 26 02:00:46 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 2 1099 45 5190 8146 0 0 35 299 140 266 1 3 95 1 0
Wed Dec 26 02:00:47 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2516 3 1190 47 5314 8009 0 0 35 301 141 270 1 3 95 1 0
Wed Dec 26 02:00:48 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2141 3 1259 54 5406 7903 0 0 35 303 141 274 1 3 95 1 0
Wed Dec 26 02:00:49 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
3074 2 1314 44 5467 7844 0 0 35 305 141 277 1 3 95 1 0
Wed Dec 26 02:00:50 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 1 1314 45 5465 7840 0 0 35 305 142 278 1 3 95 1 0
Wed Dec 26 02:00:51 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 1313 44 5466 7840 0 0 35 305 142 278 1 3 95 1 0
!! thundering herd restoration after 30sec at swap out start !!
result: a lot of swap-out occurred.
# pgrep mem_notify|wc -l
12193
about 300 processes received the notification.
problems
o thundering herd occurred multiple times, on and off.
o soft lockup occurred.
o too few processes received the notification.
o swap-out occurred.
2. after test1, run file I/O
console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
console2# dd if=/dev/zero of=tmp bs=100M count=10
$ LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
Wed Dec 26 02:21:35 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 1615 51 6048 7235 0 0 34 281 158 265 1 3 95 1 0
Wed Dec 26 02:21:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 1615 51 6048 7235 0 0 34 281 158 265 1 3 95 1 0
Wed Dec 26 02:21:37 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 1615 51 6048 7235 0 0 34 281 158 265 1 3 95 1 0
Wed Dec 26 02:21:38 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 1615 52 6048 7235 0 0 34 281 158 265 1 3 95 1 0
Wed Dec 26 02:21:39 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
124 6 1683 35 6174 7121 0 0 34 282 159 267 1 3 95 1 0
Wed Dec 26 02:21:40 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1262 3 1738 53 6293 6982 0 0 34 284 159 270 1 3 95 1 0
Wed Dec 26 02:21:41 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1546 1 1794 52 6404 6870 0 0 34 285 159 272 1 3 95 1 0
Wed Dec 26 02:21:42 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 5 1857 36 6525 6762 0 0 34 287 160 275 1 3 95 1 0
Wed Dec 26 02:21:43 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
3761 7 1896 35 6571 6718 0 0 34 289 160 276 1 3 95 1 0
Wed Dec 26 02:21:44 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 6 1898 43 6623 6654 0 0 34 291 160 277 1 3 95 1 0
Wed Dec 26 02:21:45 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 9 1921 36 6670 6614 0 0 34 293 160 279 1 3 95 1 0
Wed Dec 26 02:21:46 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
710 4 1944 52 6689 6582 0 0 34 294 161 280 1 3 95 1 0
Wed Dec 26 02:21:47 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 9 1958 42 6731 6549 0 0 34 295 161 281 1 3 95 1 0
Wed Dec 26 02:21:48 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 6 1978 44 6782 6498 0 0 34 297 161 284 1 3 95 1 0
!! time leap 4 sec !!
Wed Dec 26 02:21:52 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 10 2014 47 6864 6414 0 0 34 301 162 289 1 3 95 1 0
Wed Dec 26 02:21:53 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 10 2016 38 6881 6407 0 0 34 303 162 290 1 3 95 1 0
Wed Dec 26 02:21:54 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 5 2020 45 6884 6399 0 0 34 303 162 291 1 3 95 1 0
Wed Dec 26 02:21:56 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 7 2039 43 6932 6359 0 0 34 303 162 295 1 3 95 1 0
Wed Dec 26 02:21:57 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 9 2047 36 6777 6529 0 0 34 306 162 297 1 3 95 1 0
Wed Dec 26 02:21:58 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1501 1 2047 88 6699 6569 0 0 34 307 163 301 1 3 95 1 0
Wed Dec 26 02:21:59 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 5 2047 39 6588 6733 0 0 34 307 164 302 1 3 95 1 0
Wed Dec 26 02:22:00 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 4 2047 42 6275 7035 0 0 34 308 164 303 1 3 95 1 0
Wed Dec 26 02:22:01 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 3 2047 41 6277 7036 0 0 34 310 164 303 1 3 95 1 0
Wed Dec 26 02:22:02 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 2047 42 6277 7036 0 0 34 310 164 303 1 3 95 1 0
Wed Dec 26 02:22:03 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 1 2047 44 6277 7035 0 0 34 313 164 303 1 3 95 1 0
Wed Dec 26 02:22:04 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 1 2047 46 6277 7035 0 0 34 314 164 303 1 3 95 1 0
Wed Dec 26 02:22:05 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 1 2047 46 6277 7035 0 0 34 316 164 303 1 3 95 1 0
Wed Dec 26 02:22:06 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 2047 51 6277 7035 0 0 34 316 164 302 1 3 95 1 0
Wed Dec 26 02:22:07 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 2047 54 6277 7035 0 0 34 316 164 302 1 3 95 1 0
Some processes received the notification and exited,
but too few, and it didn't prevent swap-out.
[kosaki@n3234224 mem_notify]$ pgrep mem_notify|wc -l
11992
-> about 200 processes received the notification.
The requirement is 1000 processes (= 1GB / 1MB).
/kosaki
[-- Attachment #2: mem_notify_test.c --]
[-- Type: application/octet-stream, Size: 1773 bytes --]
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <sys/mman.h>
#include <pthread.h>
#include <unistd.h>
#include <string.h>
#define DEFAULT_ALLOC_SIZE (1)
/* Amount of anonymous memory to allocate, in MB; overridden by -m. */
long alloc_size = DEFAULT_ALLOC_SIZE;

/*
 * Test client for /dev/mem_notify.
 *
 * Allocates and touches alloc_size MB of private anonymous memory, then
 * blocks in poll() on /dev/mem_notify until the descriptor reports an
 * event, prints what was received and exits.
 *
 * Usage: mem_notify_test [-m <megabytes>]
 *
 * Unknown options are ignored.  The exit status is 1 on every path.
 */
int main(int argc, char** argv)
{
	const size_t bytes_per_mb = 1024 * 1024;
	struct pollfd polltable = { .fd = -1 };
	void *ptr;
	size_t size;
	int fd;
	int err;
	int c;

	while ((c = getopt(argc, argv, "m:")) != -1) {
		switch (c) {
		case 'm': {
			char *end;

			/* strtol() lets us reject "abc" or "12x" instead of
			 * silently treating them as 0 or 12. */
			alloc_size = strtol(optarg, &end, 10);
			if (end == optarg || *end != '\0') {
				fprintf(stderr, "invalid size: %s\n", optarg);
				exit(1);
			}
			break;
		}
		default:
			break;
		}
	}

	/*
	 * Convert MB to bytes in unsigned arithmetic and verify the product
	 * round-trips, so an oversized request is reported instead of
	 * overflowing.  A zero or negative size is rejected as well: mmap()
	 * cannot map it.
	 */
	size = (size_t)alloc_size * bytes_per_mb;
	if (alloc_size <= 0 || size / bytes_per_mb != (size_t)alloc_size) {
		fprintf(stderr, "size out of range: %ld MB\n", alloc_size);
		exit(1);
	}

	fd = open("/dev/mem_notify", O_RDONLY);
	if (fd < 0) {
		perror("open ");
		exit(1);
	}

	printf("try %ld MB\n", alloc_size);

	/* Anonymous mappings take fd -1; some systems require it. */
	ptr = mmap(NULL, size, PROT_READ|PROT_WRITE,
		   MAP_PRIVATE|MAP_ANON|MAP_POPULATE, -1, 0);
	if (ptr == MAP_FAILED) {
		perror("mmap ");
		exit(1);
	}
	/* Touch every page so the memory is really in use. */
	memset(ptr, 0, size);

	polltable.fd = fd;
	polltable.events = POLLIN;
	polltable.revents = 0;

	/* Block without timeout until the device reports an event. */
	err = poll(&polltable, 1, -1);
	if (err < 0) {
		/* revents is not meaningful after a failed poll(). */
		perror("poll ");
		exit(1);
	}

	if (polltable.revents) {
		time_t atime;
		int readbuf;

		/* The read() result is only reported, not interpreted. */
		err = (int)read(fd, &readbuf, sizeof(int));
		printf("read %d\n", err);
		atime = time(NULL);
		printf("poll ret %x %s\n", (unsigned int)polltable.revents,
		       ctime(&atime));
		exit(1);
	}

	printf("mem_notify exit\n");
	exit(1);
}
^ permalink raw reply [flat|nested] 13+ messages in thread
* [RFC][patch 1/2] mem notifications v3 improvement for large system
2007-12-25 8:31 ` [PATCH] mem notifications v3 KOSAKI Motohiro
@ 2007-12-25 10:31 ` KOSAKI Motohiro
2007-12-27 21:04 ` Marcelo Tosatti
2007-12-25 10:31 ` [RFC][patch 2/2] " KOSAKI Motohiro
1 sibling, 1 reply; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 10:31 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
Hi
I tried to resolve the too-few-notifications problem.
The mem_notify_status global variable means only 1 process is woken up.
That is too few.
Improvement step 1:
- add a read method and wake up all processes.
1. run >10000 process test
console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
console2# sh m.sh 12500
result:
- waking up all processes caused neither a thundering herd nor a soft lock-up.
- no swap-out occurred.
- but too much memory was freed ;-)
in my test-case, over 5GB freed.
Wed Dec 26 03:19:20 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
7 0 0 605 209 12778 0 0 143 11 1458 183 14 10 76 1 0
Wed Dec 26 03:19:21 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
6 0 0 2687 209 10769 0 0 142 11 1459 188 14 10 75 1 0
Wed Dec 26 03:19:22 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 0 0 4560 209 8968 0 0 142 11 1459 191 14 10 75 1 0
Wed Dec 26 03:19:23 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5857 209 7724 0 0 142 11 1457 192 14 10 75 1 0
Wed Dec 26 03:19:24 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5872 209 7724 0 0 141 11 1454 192 14 10 75 1 0
Wed Dec 26 03:19:25 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5884 209 7724 0 0 141 11 1451 192 14 10 75 1 0
Wed Dec 26 03:19:26 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5895 209 7724 0 0 140 11 1448 191 14 10 75 1 0
Wed Dec 26 03:19:27 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5904 209 7724 0 0 140 11 1445 191 14 10 75 1 0
Wed Dec 26 03:19:28 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5912 209 7724 0 0 140 11 1442 190 13 10 75 1 0
Wed Dec 26 03:19:29 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5920 209 7724 0 0 139 11 1439 190 13 10 75 1 0
Wed Dec 26 03:19:30 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 1 0 5929 209 7724 0 0 139 11 1436 189 13 10 75 1 0
Wed Dec 26 03:19:32 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5935 209 7724 0 0 139 11 1433 189 13 10 75 1 0
Wed Dec 26 03:19:33 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1 0 0 5940 209 7724 0 0 138 11 1430 188 13 10 75 1 0
Wed Dec 26 03:19:34 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 1 0 5948 209 7725 0 0 138 11 1427 188 13 10 75 1 0
Wed Dec 26 03:19:35 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 0 5676 209 8005 0 0 138 11 1425 188 13 10 75 1 0
Wed Dec 26 03:19:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 1 0 5676 209 8006 0 0 137 11 1422 188 13 10 75 1 0
Index: linux-2.6.23-mem_notify_v3/mm/mem_notify.c
===================================================================
--- linux-2.6.23-mem_notify_v3.orig/mm/mem_notify.c
+++ linux-2.6.23-mem_notify_v3/mm/mem_notify.c
@@ -13,7 +13,11 @@
#include <linux/percpu.h>
#include <linux/timer.h>
-static unsigned long mem_notify_status = 0;
+struct mem_notify_file_info {
+ long last_event;
+};
+
+atomic_t mem_notify_event = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
static DEFINE_PER_CPU(unsigned long, last_mem_notify) = INITIAL_JIFFIES;
@@ -28,53 +32,81 @@ void mem_notify_userspace(void)
if (time_after(now, target)) {
__get_cpu_var(last_mem_notify) = now;
- mem_notify_status = 1;
+ atomic_inc(&mem_notify_event);
wake_up(&mem_wait);
}
}
static int mem_notify_open(struct inode *inode, struct file *file)
{
- return 0;
+ struct mem_notify_file_info *ptr;
+ int err = 0;
+
+ ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
+ if (!ptr) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ptr->last_event = atomic_read(&mem_notify_event);
+ file->private_data = ptr;
+
+out:
+ return err;
}
static int mem_notify_release(struct inode *inode, struct file *file)
{
+ kfree(file->private_data);
+
return 0;
}
static unsigned int mem_notify_poll(struct file *file, poll_table *wait)
{
unsigned int val = 0;
+ struct zone *zone;
+ int pages_high, pages_free, pages_reserve;
+ struct mem_notify_file_info *file_info = file->private_data;
poll_wait(file, &mem_wait, wait);
- if (mem_notify_status) {
- struct zone *zone;
- int pages_high, pages_free, pages_reserve;
-
- mem_notify_status = 0;
-
- /* check if its not a spurious/stale notification */
- pages_high = pages_free = pages_reserve = 0;
- for_each_zone(zone) {
- if (!populated_zone(zone) || is_highmem(zone))
- continue;
- pages_high += zone->pages_high;
- pages_free += zone_page_state(zone, NR_FREE_PAGES);
- pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
- }
+ if (file_info->last_event == atomic_read(&mem_notify_event))
+ goto out;
- if (pages_free < (pages_high+pages_reserve)*2)
- val = POLLIN;
+ /* check if its not a spurious/stale notification */
+ pages_high = pages_free = pages_reserve = 0;
+ for_each_zone(zone) {
+ if (!populated_zone(zone) || is_highmem(zone))
+ continue;
+ pages_high += zone->pages_high;
+ pages_free += zone_page_state(zone, NR_FREE_PAGES);
+ pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
}
-
+
+ if (pages_free < (pages_high+pages_reserve)*2)
+ val = POLLIN;
+
+out:
return val;
}
+static ssize_t mem_notify_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct mem_notify_file_info *file_info = file->private_data;
+ if (!file_info)
+ return -EINVAL;
+
+ file_info->last_event = atomic_read(&mem_notify_event);
+
+ return 0;
+}
+
struct file_operations mem_notify_fops = {
.open = mem_notify_open,
.release = mem_notify_release,
.poll = mem_notify_poll,
+ .read = mem_notify_read,
};
EXPORT_SYMBOL(mem_notify_fops);
/kosaki
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [RFC][patch 2/2] mem notifications v3 improvement for large system
2007-12-25 8:31 ` [PATCH] mem notifications v3 KOSAKI Motohiro
2007-12-25 10:31 ` [RFC][patch 1/2] mem notifications v3 improvement for large system KOSAKI Motohiro
@ 2007-12-25 10:31 ` KOSAKI Motohiro
2007-12-25 10:41 ` KOSAKI Motohiro
2007-12-27 4:49 ` [RFC][patch] mem_notify more faster reduce load average KOSAKI Motohiro
1 sibling, 2 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 10:31 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
2nd improvement
- add wakeup rate control
1. run >10000 process test
console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
console2# sh m.sh 12500
result
- no swap-out occurred.
- no time leap occurred.
- max runqueue length shrank to about 1/10.
- excessive freeing did not occur.
very good.
Wed Dec 26 04:23:10 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
4 0 0 4122 190 9890 0 0 207 15 297 113 17 6 75 2 0
Wed Dec 26 04:23:11 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
3 0 0 3038 190 10809 0 0 206 15 299 117 17 7 75 2 0
Wed Dec 26 04:23:12 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 0 0 2004 190 11687 0 0 206 15 301 120 17 7 75 2 0
Wed Dec 26 04:23:13 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 0 0 1009 190 12530 0 0 205 15 303 124 17 7 74 2 0
Wed Dec 26 04:23:14 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 0 0 69 190 13327 0 0 204 15 305 127 17 7 74 2 0
Wed Dec 26 04:23:15 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
1109 0 0 88 199 13294 0 0 203 15 404 297 17 7 74 2 0
Wed Dec 26 04:23:16 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
285 0 0 86 199 13295 0 0 203 15 404 541 17 7 74 2 0
Wed Dec 26 04:23:17 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
258 0 0 88 199 13294 0 0 202 15 404 779 17 7 74 2 0
Wed Dec 26 04:23:18 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
185 0 0 88 199 13294 0 0 201 15 403 1012 17 7 74 2 0
Wed Dec 26 04:23:19 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
454 0 0 87 199 13296 0 0 200 15 403 1240 17 7 74 2 0
Wed Dec 26 04:23:21 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
216 0 0 87 199 13295 0 0 200 15 403 1463 17 7 74 2 0
Wed Dec 26 04:23:22 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
402 0 0 87 199 13297 0 0 199 15 403 1681 17 7 74 2 0
Wed Dec 26 04:23:23 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
716 0 0 86 199 13293 0 0 198 15 403 1893 17 7 74 2 0
Wed Dec 26 04:23:24 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
131 0 0 86 199 13294 0 0 197 15 402 2101 17 7 74 2 0
Wed Dec 26 04:23:25 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
417 0 0 87 199 13294 0 0 197 14 402 2301 17 8 74 2 0
Wed Dec 26 04:23:26 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
42 0 0 87 199 13294 0 0 196 14 402 2502 17 8 74 2 0
Wed Dec 26 04:23:27 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
968 0 0 88 199 13291 0 0 195 14 402 2697 17 8 74 2 0
Wed Dec 26 04:23:28 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
335 0 0 86 199 13295 0 0 195 14 402 2887 17 8 74 2 0
Wed Dec 26 04:23:29 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
386 0 0 87 199 13293 0 0 194 14 401 3071 17 8 74 2 0
Wed Dec 26 04:23:30 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
658 0 0 89 199 13292 0 0 193 14 401 3254 17 8 74 2 0
Wed Dec 26 04:23:31 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
72 0 0 87 199 13295 0 0 192 14 401 3439 16 8 74 2 0
Wed Dec 26 04:23:32 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
697 0 0 86 199 13295 0 0 192 14 401 3612 16 8 74 2 0
Wed Dec 26 04:23:33 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
289 0 0 87 199 13293 0 0 191 14 400 3780 16 8 74 2 0
Wed Dec 26 04:23:34 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
633 0 0 87 199 13294 0 0 190 14 400 3944 16 8 74 2 0
Wed Dec 26 04:23:35 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
2 0 0 86 199 13295 0 0 190 14 400 4101 16 8 74 2 0
Wed Dec 26 04:23:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
94 1 0 88 199 13293 0 0 189 14 400 4253 16 8 74 2 0
Wed Dec 26 04:23:37 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
384 0 0 88 199 13293 0 0 188 14 400 4402 16 8 74 2 0
Wed Dec 26 04:23:38 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
256 0 0 86 199 13293 0 0 188 14 399 4546 16 8 74 2 0
Wed Dec 26 04:23:39 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 0 90 199 13288 0 0 187 14 399 4686 16 8 74 2 0
Wed Dec 26 04:23:40 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 0 0 90 199 13288 0 0 187 14 398 4822 16 8 74 2 0
Wed Dec 26 04:23:41 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
0 1 0 90 199 13288 0 0 186 14 398 4953 16 8 74 2 0
Wed Dec 26 04:23:42 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free inact active si so bi bo in cs us sy id wa st
289 0 0 91 199 13288 0 0 185 14 397 5077 16 8 74 2 0
$ quilt diff
Index: linux-2.6.23-mem_notify_v3/mm/mem_notify.c
===================================================================
--- linux-2.6.23-mem_notify_v3.orig/mm/mem_notify.c
+++ linux-2.6.23-mem_notify_v3/mm/mem_notify.c
@@ -12,6 +12,9 @@
#include <linux/vmstat.h>
#include <linux/percpu.h>
#include <linux/timer.h>
+#include <linux/delay.h>
+
+#define MSLEEP_BONUS_SHIFT 4
struct mem_notify_file_info {
long last_event;
@@ -20,7 +23,9 @@ struct mem_notify_file_info {
atomic_t mem_notify_event = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
-static DEFINE_PER_CPU(unsigned long, last_mem_notify) = INITIAL_JIFFIES;
+static atomic_long_t last_mem_notify = ATOMIC_LONG_INIT(INITIAL_JIFFIES);
+static atomic_long_t last_task_wakeup = ATOMIC_LONG_INIT(INITIAL_JIFFIES);
+static atomic_t mem_notify_timeout_bonus = ATOMIC_INIT(0);
/* maximum 5 notifications per second per cpu */
void mem_notify_userspace(void)
@@ -28,10 +33,10 @@ void mem_notify_userspace(void)
unsigned long target;
unsigned long now = jiffies;
- target = __get_cpu_var(last_mem_notify) + (HZ/5);
+ target = atomic_long_read(&last_mem_notify) + (HZ/5);
if (time_after(now, target)) {
- __get_cpu_var(last_mem_notify) = now;
+ atomic_long_set(&last_mem_notify, now);
atomic_inc(&mem_notify_event);
wake_up(&mem_wait);
}
@@ -68,12 +73,35 @@ static unsigned int mem_notify_poll(stru
struct zone *zone;
int pages_high, pages_free, pages_reserve;
struct mem_notify_file_info *file_info = file->private_data;
+ unsigned long bonus;
+ unsigned long now;
+ unsigned long last;
poll_wait(file, &mem_wait, wait);
if (file_info->last_event == atomic_read(&mem_notify_event))
goto out;
+retry:
+ /* Ugly trick:
+ when too many task wakeup,
+ control function exit rate for prevent too much freed.
+ */
+ now = jiffies;
+ last = (unsigned long)atomic_long_read(&last_task_wakeup);
+ if (time_before_eq(now, last)) {
+ bonus = atomic_read(&mem_notify_timeout_bonus) >>
+ MSLEEP_BONUS_SHIFT;
+ msleep_interruptible(1 + bonus);
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ goto out;
+ atomic_inc(&mem_notify_timeout_bonus);
+ goto retry;
+ }
+ atomic_set(&mem_notify_timeout_bonus, 0);
+ atomic_long_set(&last_task_wakeup, now);
+
/* check if its not a spurious/stale notification */
pages_high = pages_free = pages_reserve = 0;
for_each_zone(zone) {
/kosaki
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC][patch 2/2] mem notifications v3 improvement for large system
2007-12-25 10:31 ` [RFC][patch 2/2] " KOSAKI Motohiro
@ 2007-12-25 10:41 ` KOSAKI Motohiro
2007-12-27 4:49 ` [RFC][patch] mem_notify more faster reduce load average KOSAKI Motohiro
1 sibling, 0 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 10:41 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
2nd improvement
test2
2. after test1, run file I/O
console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
console2# dd if=/dev/zero of=tmp bs=100M count=10
result:
- no swap-out occurred.
- cache increased by about 1GB.
- about 1GB of anon memory was freed.
very good!
$ pgrep mem_notify|wc -l
11079
$ dd if=/dev/zero of=tmp1 bs=100M count=10
$ pgrep mem_notify|wc -l
10307
$ LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M ; done
Wed Dec 26 04:36:19 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 70 42 211 0 0 54 425 785 3145 5 4 89 1 0
Wed Dec 26 04:36:20 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 70 42 211 0 0 54 424 784 3142 5 4 89 1 0
Wed Dec 26 04:36:21 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 70 42 211 0 0 54 424 784 3139 5 4 89 1 0
Wed Dec 26 04:36:22 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 70 42 211 0 0 54 424 783 3136 5 4 89 1 0
Wed Dec 26 04:36:23 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 1 0 70 42 211 0 0 54 423 783 3133 5 4 89 1 0
Wed Dec 26 04:36:24 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 70 42 211 0 0 54 423 782 3130 5 4 89 1 0
Wed Dec 26 04:36:25 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 70 42 211 0 0 54 422 782 3128 5 4 89 1 0
Wed Dec 26 04:36:26 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 0 0 70 42 211 0 0 54 422 781 3125 5 4 89 1 0
Wed Dec 26 04:36:35 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
820 6 0 89 45 1052 0 0 53 482 1133 3466 5 5 89 1 0
Wed Dec 26 04:36:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
355 6 0 87 45 1124 0 0 53 497 1132 3521 5 5 89 1 0
Wed Dec 26 04:36:37 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
241 6 0 88 45 1188 0 0 53 512 1132 3576 5 5 89 1 0
Wed Dec 26 04:36:38 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 3 0 93 45 1208 0 0 53 529 1131 3632 5 5 89 1 0
Wed Dec 26 04:36:39 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
0 4 0 93 45 1208 0 0 53 545 1130 3687 5 5 89 1 0
Wed Dec 26 04:36:40 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
830 4 0 93 45 1208 0 0 53 560 1129 3741 5 5 89 2 0
Wed Dec 26 04:36:41 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
103 4 0 93 45 1208 0 0 53 575 1128 3794 5 5 89 2 0
Wed Dec 26 04:36:42 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
133 4 0 94 45 1208 0 0 53 587 1128 3846 5 5 89 2 0
Wed Dec 26 04:36:43 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
r b swpd free buff cache si so bi bo in cs us sy id wa st
59 4 0 97 45 1208 0 0 53 603 1127 3898 5 5 88 2 0
/kosaki
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [RFC][patch] mem_notify more faster reduce load average
2007-12-25 10:31 ` [RFC][patch 2/2] " KOSAKI Motohiro
2007-12-25 10:41 ` KOSAKI Motohiro
@ 2007-12-27 4:49 ` KOSAKI Motohiro
1 sibling, 0 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-27 4:49 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
Hi, Marcelo-san
This patch is a small improvement on top of my mem notifications large-system patch.
My original patch was too slow to reduce the load average after free memory increased.
This patch fixes it.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Index: linux-2.6.23-mem_notify_v3/mm/mem_notify.c
===================================================================
--- linux-2.6.23-mem_notify_v3.orig/mm/mem_notify.c
+++ linux-2.6.23-mem_notify_v3/mm/mem_notify.c
@@ -20,7 +20,8 @@ struct mem_notify_file_info {
long last_event;
};
-atomic_t mem_notify_event = ATOMIC_INIT(0);
+static atomic_t mem_notify_event = ATOMIC_INIT(0);
+static atomic_t mem_notify_event_end = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
static atomic_long_t last_mem_notify = ATOMIC_LONG_INIT(INITIAL_JIFFIES);
@@ -76,13 +77,18 @@ static unsigned int mem_notify_poll(stru
unsigned long bonus;
unsigned long now;
unsigned long last;
+ unsigned long event;
poll_wait(file, &mem_wait, wait);
- if (file_info->last_event == atomic_read(&mem_notify_event))
+retry:
+ event = atomic_read(&mem_notify_event);
+ if (event == file_info->last_event)
goto out;
-retry:
+ if (event == atomic_read(&mem_notify_event_end))
+ goto out;
+
/* Ugly trick:
when too many task wakeup,
control function exit rate for prevent too much freed.
@@ -114,6 +120,8 @@ retry:
if (pages_free < (pages_high+pages_reserve)*2)
val = POLLIN;
+ else
+ atomic_set(&mem_notify_event_end, event);
out:
- kosaki
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] mem notifications v3
2007-12-25 3:47 ` KOSAKI Motohiro
2007-12-25 4:56 ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
2007-12-25 8:31 ` [PATCH] mem notifications v3 KOSAKI Motohiro
@ 2007-12-27 20:13 ` Marcelo Tosatti
2007-12-28 1:44 ` KOSAKI Motohiro
2 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2007-12-27 20:13 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Marcelo Tosatti, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
Hi Kosaki,
On Tue, Dec 25, 2007 at 12:47:49PM +0900, KOSAKI Motohiro wrote:
> Hi
>
>
> > +/* maximum 5 notifications per second per cpu */
> > +void mem_notify_userspace(void)
> > +{
> > + unsigned long target;
> > + unsigned long now = jiffies;
> > +
> > + target = __get_cpu_var(last_mem_notify) + (HZ/5);
> > +
> > + if (time_after(now, target)) {
> > + __get_cpu_var(last_mem_notify) = now;
> > + mem_notify_status = 1;
> > + wake_up(&mem_wait);
> > + }
> > +}
>
> Hmm,
> unfotunately, wake_up() wake up all process.
> because
> 1. poll method use poll_wait().
> 2. poll_wait() not add_wait_queue_exclusive() but add_wait_queue() is used.
> 3. wake_up() function wake up 1 task *and* queueud item by add_wait_queue().
>
> Conclusion:
> this code intention wakeup all process HZ/5 * #cpus times at high memory pressure.
> it is too much.
>
>
> BTW: I propose add to poll_wait_exclusive() in kernel ;-p
>
>
> > + /* check if its not a spurious/stale notification */
> > + pages_high = pages_free = pages_reserve = 0;
> > + for_each_zone(zone) {
> > + if (!populated_zone(zone) || is_highmem(zone))
> > + continue;
>
> i think highmem ignoreed is very good improvement from before version :-D
>
>
> > + pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
>
> Hmm...
> may be, don't works well.
>
> MAX_NR_ZONES determined at compile time and determined by distribution vendor.
> but real highest zone is determined by box total memory.
>
> ex.
> CONFIG_HIGHMEM config on but total memory < 4GB.
> CONFIG_DMA32 config on but total memory < 4GB.
That is OK because the calculation of lowmem reserves will take into account
all zones (mm/page_alloc.c::setup_per_zone_lowmem_reserve).
But it might be better to use the precalculated totalreserve_pages instead.
>
> > + if (pages_free < (pages_high+pages_reserve)*2)
> > + val = POLLIN;
>
> why do you choice fomula of (pages_high+pages_reserve)*2 ?
Just to make sure it's not sending a spurious notification in the case the system
has enough free memory already.
> > -static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > +static bool shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > struct scan_control *sc, int priority)
>
> unnecessary type change.
> if directly call mem_notify_userspace() in shrink_active_list, works well too.
> because notify rate control can implement by mem_notify_userspace() and mem_notify_poll().
Yes, and doing that should also guarantee that the notification is sent
before swapout is performed (right now it sends the notification after
shrink_inactive_list(), which is performing swapout).
> last_mem_notify works better.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC][patch 1/2] mem notifications v3 improvement for large system
2007-12-25 10:31 ` [RFC][patch 1/2] mem notifications v3 improvement for large system KOSAKI Motohiro
@ 2007-12-27 21:04 ` Marcelo Tosatti
2007-12-28 0:38 ` KOSAKI Motohiro
0 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2007-12-27 21:04 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Marcelo Tosatti, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
On Tue, Dec 25, 2007 at 07:31:14PM +0900, KOSAKI Motohiro wrote:
> Hi
>
> I tried resolve too few notification problem.
>
> mem_notify_status global variable mean wakeup 1 process.
> it is too few.
>
> improvement step1:
> - add read method and wake up all process.
>
> 1. run >10000 process test
> console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
> console2# sh m.sh 12500
>
> result:
> - wakeup all unoccur neither thundering herd nor soft lock-up.
> - no swap out occured.
> - but too much free ;-)
> in my test-case, over 5GB freed.
>
>
> Wed Dec 26 03:19:20 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 7 0 0 605 209 12778 0 0 143 11 1458 183 14 10 76 1 0
> Wed Dec 26 03:19:21 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 6 0 0 2687 209 10769 0 0 142 11 1459 188 14 10 75 1 0
> Wed Dec 26 03:19:22 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 2 0 0 4560 209 8968 0 0 142 11 1459 191 14 10 75 1 0
> Wed Dec 26 03:19:23 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5857 209 7724 0 0 142 11 1457 192 14 10 75 1 0
> Wed Dec 26 03:19:24 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5872 209 7724 0 0 141 11 1454 192 14 10 75 1 0
> Wed Dec 26 03:19:25 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5884 209 7724 0 0 141 11 1451 192 14 10 75 1 0
> Wed Dec 26 03:19:26 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5895 209 7724 0 0 140 11 1448 191 14 10 75 1 0
> Wed Dec 26 03:19:27 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5904 209 7724 0 0 140 11 1445 191 14 10 75 1 0
> Wed Dec 26 03:19:28 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5912 209 7724 0 0 140 11 1442 190 13 10 75 1 0
> Wed Dec 26 03:19:29 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5920 209 7724 0 0 139 11 1439 190 13 10 75 1 0
> Wed Dec 26 03:19:30 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 1 0 5929 209 7724 0 0 139 11 1436 189 13 10 75 1 0
> Wed Dec 26 03:19:32 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5935 209 7724 0 0 139 11 1433 189 13 10 75 1 0
> Wed Dec 26 03:19:33 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 1 0 0 5940 209 7724 0 0 138 11 1430 188 13 10 75 1 0
> Wed Dec 26 03:19:34 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 2 1 0 5948 209 7725 0 0 138 11 1427 188 13 10 75 1 0
> Wed Dec 26 03:19:35 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 0 0 0 5676 209 8005 0 0 138 11 1425 188 13 10 75 1 0
> Wed Dec 26 03:19:36 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
> r b swpd free inact active si so bi bo in cs us sy id wa st
> 0 1 0 5676 209 8006 0 0 137 11 1422 188 13 10 75 1 0
>
>
> Index: linux-2.6.23-mem_notify_v3/mm/mem_notify.c
> ===================================================================
> --- linux-2.6.23-mem_notify_v3.orig/mm/mem_notify.c
> +++ linux-2.6.23-mem_notify_v3/mm/mem_notify.c
> @@ -13,7 +13,11 @@
> #include <linux/percpu.h>
> #include <linux/timer.h>
>
> -static unsigned long mem_notify_status = 0;
> +struct mem_notify_file_info {
> + long last_event;
> +};
> +
> +atomic_t mem_notify_event = ATOMIC_INIT(0);
>
> static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
> static DEFINE_PER_CPU(unsigned long, last_mem_notify) = INITIAL_JIFFIES;
> @@ -28,53 +32,81 @@ void mem_notify_userspace(void)
>
> if (time_after(now, target)) {
> __get_cpu_var(last_mem_notify) = now;
> - mem_notify_status = 1;
> + atomic_inc(&mem_notify_event);
> wake_up(&mem_wait);
> }
> }
>
> static int mem_notify_open(struct inode *inode, struct file *file)
> {
> - return 0;
> + struct mem_notify_file_info *ptr;
> + int err = 0;
> +
> + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
> + if (!ptr) {
> + err = -ENOMEM;
> + goto out;
> + }
> +
> + ptr->last_event = atomic_read(&mem_notify_event);
> + file->private_data = ptr;
> +
> +out:
> + return err;
> }
>
> static int mem_notify_release(struct inode *inode, struct file *file)
> {
> + kfree(file->private_data);
> +
> return 0;
> }
>
> static unsigned int mem_notify_poll(struct file *file, poll_table *wait)
> {
> unsigned int val = 0;
> + struct zone *zone;
> + int pages_high, pages_free, pages_reserve;
> + struct mem_notify_file_info *file_info = file->private_data;
>
> poll_wait(file, &mem_wait, wait);
>
> - if (mem_notify_status) {
> - struct zone *zone;
> - int pages_high, pages_free, pages_reserve;
> -
> - mem_notify_status = 0;
By clearing mem_notify_status you prevent other processes going
through mem_notify_poll() from getting a spurious notification if
memory is not exhausted. So the real point of notification is the
call from shrink_zone() (or shrink_active_list()), not the pages <
(pages_high+pages_reserve)*2 formula.
So something like the following sounds better:
- have your poll_wait_exclusive() patch in place
- pass a "status" parameter to mem_notify_userspace() and have it clear
mem_notify_status in case status is zero, so to stop sending POLLIN to processes.
- call mem_notify_userspace(0) from mm/vmscan.c when ZONE_NORMAL reclaim_mapped
is false (that seems a good indication that VM is out of trouble).
- test for mem_notify_status in mem_notify_poll(), but do not clear it.
- at mem_notify_userspace(), use wake_up_nr(number of mem_notify users/10) (10
meaning a small percentage of registered users).
> -
> - /* check if its not a spurious/stale notification */
> - pages_high = pages_free = pages_reserve = 0;
> - for_each_zone(zone) {
> - if (!populated_zone(zone) || is_highmem(zone))
> - continue;
> - pages_high += zone->pages_high;
> - pages_free += zone_page_state(zone, NR_FREE_PAGES);
> - pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
> - }
> + if (file_info->last_event == atomic_read(&mem_notify_event))
> + goto out;
What exactly are you trying to deal with by using last_event?
>
> - if (pages_free < (pages_high+pages_reserve)*2)
> - val = POLLIN;
> + /* check if its not a spurious/stale notification */
> + pages_high = pages_free = pages_reserve = 0;
> + for_each_zone(zone) {
> + if (!populated_zone(zone) || is_highmem(zone))
> + continue;
> + pages_high += zone->pages_high;
> + pages_free += zone_page_state(zone, NR_FREE_PAGES);
> + pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
> }
> -
> +
> + if (pages_free < (pages_high+pages_reserve)*2)
> + val = POLLIN;
> +
> +out:
> return val;
> }
>
> +static ssize_t mem_notify_read(struct file *file, char __user *buf,
> + size_t count, loff_t *ppos)
> +{
> + struct mem_notify_file_info *file_info = file->private_data;
> + if (!file_info)
> + return -EINVAL;
> +
> + file_info->last_event = atomic_read(&mem_notify_event);
> +
> + return 0;
> +}
You're then using read() to affect poll() behavior. I don't like it.
> +
> struct file_operations mem_notify_fops = {
> .open = mem_notify_open,
> .release = mem_notify_release,
> .poll = mem_notify_poll,
> + .read = mem_notify_read,
> };
> EXPORT_SYMBOL(mem_notify_fops);
>
>
>
>
>
>
> /kosaki
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC] add poll_wait_exclusive() API
2007-12-25 4:56 ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
@ 2007-12-27 21:05 ` Marcelo Tosatti
0 siblings, 0 replies; 13+ messages in thread
From: Marcelo Tosatti @ 2007-12-27 21:05 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: linux-mm, Marcelo Tosatti, Daniel Spång, Rik van Riel,
Andrew Morton
On Tue, Dec 25, 2007 at 01:56:24PM +0900, KOSAKI Motohiro wrote:
> Hi
>
> add item to wait queue exist 2 way, add_wait_queue() and add_wait_queue_exclusive().
> but unfortunately, we only able to use poll_wait in poll method.
>
> poll_wait_exclusive() works similar as add_wait_queue_exclusive()
>
>
> caution:
> this patch is compile test only.
> my purpose is discussion only.
Looks good.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [RFC][patch 1/2] mem notifications v3 improvement for large system
2007-12-27 21:04 ` Marcelo Tosatti
@ 2007-12-28 0:38 ` KOSAKI Motohiro
0 siblings, 0 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-28 0:38 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
Hi Marcelo-san
thank you for your advice.
> So something like the following sounds better:
>
> - have your poll_wait_exclusive() patch in place
> - pass a "status" parameter to mem_notify_userspace() and have it clear
> mem_notify_status in case status is zero, so to stop sending POLLIN to processes.
> - call mem_notify_userspace(0) from mm/vmscan.c when ZONE_NORMAL reclaim_mapped
> is false (that seems a good indication that VM is out of trouble).
> - test for mem_notify_status in mem_notify_poll(), but do not clear it.
> - at mem_notify_userspace(), use wake_up_nr(number of mem_notify users/10) (10
> meaning a small percentage of registered users).
feel nice idea.
OK. I will try it about new year.
> > + if (file_info->last_event == atomic_read(&mem_notify_event))
> > + goto out;
>
> What exactly are you trying to deal with by using last_event?
to be honest, read() and last_event is daniel-san's idea.
it is part of sysfs code in his patch.
my patch intent the same behavior as his.
1. read() method is deletable if you dislike.
I will delete at next post :)
2. last_event is not deletable; it is important.
under strong and long memory pressure,
a process that received the notification calls poll() again after freeing
its own cache but before the system is out of trouble.
at that point, the process should not be woken up because it has already freed memory.
(in other words, poll should return 0.)
- kosaki
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] mem notifications v3
2007-12-27 20:13 ` [PATCH] mem notifications v3 Marcelo Tosatti
@ 2007-12-28 1:44 ` KOSAKI Motohiro
0 siblings, 0 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-28 1:44 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
Andrew Morton
Hi Marcelo-san
> > > + pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
> >
> > Hmm...
> > may be, don't works well.
> >
> > MAX_NR_ZONES determined at compile time and determined by distribution vendor.
> > but real highest zone is determined by box total memory.
>
> That is OK because the calculation of lowmem reserves will take into account
> all zones (mm/page_alloc.c::setup_per_zone_lowmem_reserve).
really?
sorry, I will check again.
> But it might be better to use the precalculated totalreserve_pages instead.
Hmm...
unfortunately, accumulating the memory of all zones is incompatible with NUMA awareness.
please think again.
> > > + if (pages_free < (pages_high+pages_reserve)*2)
> > > + val = POLLIN;
> >
> > why do you choice fomula of (pages_high+pages_reserve)*2 ?
>
> Just to make sure its not sending a spurious notification in the case the system
> has enough free memory already.
Can I think "*2" is your experimental rule?
if so, I agree your experience.
> > > -static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > > +static bool shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > > struct scan_control *sc, int priority)
> >
> > unnecessary type change.
> > if directly call mem_notify_userspace() in shrink_active_list, works well too.
> > because notify rate control can implement by mem_notify_userspace() and mem_notify_poll().
>
> Yes, and doing that should also guarantee that the notification is sent
> before swapout is performed (right now it sends the notification after
> shrink_inactive_list(), which is performing swapout).
Agreed.
- kosaki
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2007-12-28 1:44 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-12-24 20:32 [PATCH] mem notifications v3 Marcelo Tosatti
2007-12-25 3:47 ` KOSAKI Motohiro
2007-12-25 4:56 ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
2007-12-27 21:05 ` Marcelo Tosatti
2007-12-25 8:31 ` [PATCH] mem notifications v3 KOSAKI Motohiro
2007-12-25 10:31 ` [RFC][patch 1/2] mem notifications v3 improvement for large system KOSAKI Motohiro
2007-12-27 21:04 ` Marcelo Tosatti
2007-12-28 0:38 ` KOSAKI Motohiro
2007-12-25 10:31 ` [RFC][patch 2/2] " KOSAKI Motohiro
2007-12-25 10:41 ` KOSAKI Motohiro
2007-12-27 4:49 ` [RFC][patch] mem_notify more faster reduce load average KOSAKI Motohiro
2007-12-27 20:13 ` [PATCH] mem notifications v3 Marcelo Tosatti
2007-12-28 1:44 ` KOSAKI Motohiro
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox