linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mem notifications v3
@ 2007-12-24 20:32 Marcelo Tosatti
  2007-12-25  3:47 ` KOSAKI Motohiro
  0 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2007-12-24 20:32 UTC (permalink / raw)
  To: linux-mm; +Cc: KOSAKI Motohiro, Daniel Spång, Rik van Riel, Andrew Morton

An updated version of mem-notify follows.

This changes the notification point to happen whenever the VM moves an
anonymous page to the inactive list - this is a pretty good indication
that there are unused anonymous pages present which will be very likely
swapped out soon.

Since the notification happens at shrink_zone() which can be called very
often the wakeups are rate limited to 5 times per second (on each CPU).


Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/Documentation/devices.txt
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/Documentation/devices.txt
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/Documentation/devices.txt
@@ -96,6 +96,7 @@ Your cooperation is appreciated.
 		 11 = /dev/kmsg		Writes to this come out as printk's
 		 12 = /dev/oldmem	Used by crashdump kernels to access
 					the memory of the kernel that crashed.
+		 13 = /dev/mem_notify   Low memory notification.
 
   1 block	RAM disk
 		  0 = /dev/ram0		First RAM disk
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/drivers/char/mem.c
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/drivers/char/mem.c
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/drivers/char/mem.c
@@ -34,6 +34,8 @@
 # include <linux/efi.h>
 #endif
 
+extern struct file_operations mem_notify_fops;
+
 /*
  * Architectures vary in how they handle caching for addresses
  * outside of main memory.
@@ -854,6 +856,9 @@ static int memory_open(struct inode * in
 			filp->f_op = &oldmem_fops;
 			break;
 #endif
+		case 13:
+			filp->f_op = &mem_notify_fops;
+			break;
 		default:
 			return -ENXIO;
 	}
@@ -886,6 +891,7 @@ static const struct {
 #ifdef CONFIG_CRASH_DUMP
 	{12,"oldmem",    S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops},
 #endif
+	{13,"mem_notify", S_IRUGO, &mem_notify_fops},
 };
 
 static struct class *mem_class;
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/include/linux/swap.h
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/include/linux/swap.h
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/include/linux/swap.h
@@ -213,6 +213,9 @@ extern int shmem_unuse(swp_entry_t entry
 
 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
 
+/* linux/mm/mem_notify.c */
+void mem_notify_userspace(void);
+
 #ifdef CONFIG_SWAP
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct file *, struct page *);
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/Makefile
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/mm/Makefile
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/Makefile
@@ -11,7 +11,7 @@ obj-y			:= bootmem.o filemap.o mempool.o
 			   page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o $(mmu-y)
+			   page_isolation.o mem_notify.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/mem_notify.c
===================================================================
--- /dev/null
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/mem_notify.c
@@ -0,0 +1,80 @@
+/*
+ * Notify applications of memory pressure via /dev/mem_notify
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+
+static unsigned long mem_notify_status = 0;
+
+static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
+static DEFINE_PER_CPU(unsigned long, last_mem_notify) = INITIAL_JIFFIES;
+
+/* maximum 5 notifications per second per cpu */
+void mem_notify_userspace(void)
+{
+	unsigned long target;
+	unsigned long now = jiffies;
+
+	target = __get_cpu_var(last_mem_notify) + (HZ/5);
+
+	if (time_after(now, target)) {
+		__get_cpu_var(last_mem_notify) = now;
+		mem_notify_status = 1;
+		wake_up(&mem_wait);
+	}
+}
+
+static int mem_notify_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static int mem_notify_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static unsigned int mem_notify_poll(struct file *file, poll_table *wait)
+{
+	unsigned int val = 0;
+
+	poll_wait(file, &mem_wait, wait);
+
+	if (mem_notify_status) {
+		struct zone *zone;
+		int pages_high, pages_free, pages_reserve;
+
+		mem_notify_status = 0;
+
+		/* check if its not a spurious/stale notification */
+		pages_high = pages_free = pages_reserve = 0;
+		for_each_zone(zone) { 
+			if (!populated_zone(zone) || is_highmem(zone))
+				continue;
+			pages_high += zone->pages_high;
+			pages_free += zone_page_state(zone, NR_FREE_PAGES);
+			pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
+		}
+
+		if (pages_free < (pages_high+pages_reserve)*2) 
+			val = POLLIN;
+	}
+		
+	return val;
+}
+
+struct file_operations mem_notify_fops = {
+	.open = mem_notify_open,
+	.release = mem_notify_release,
+	.poll = mem_notify_poll,
+};
+EXPORT_SYMBOL(mem_notify_fops);
Index: marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/vmscan.c
===================================================================
--- marcelo.orig/dev/mm/linux-2.6.24-rc2-mm1/mm/vmscan.c
+++ marcelo/dev/mm/linux-2.6.24-rc2-mm1/mm/vmscan.c
@@ -960,7 +960,7 @@ static inline int zone_is_near_oom(struc
  * The downside is that we have to touch page->_count against each page.
  * But we had to alter page->flags anyway.
  */
-static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+static bool shrink_active_list(unsigned long nr_pages, struct zone *zone,
 				struct scan_control *sc, int priority)
 {
 	unsigned long pgmoved;
@@ -972,6 +972,7 @@ static void shrink_active_list(unsigned 
 	struct page *page;
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
+	bool inactivated_anon = 0;
 
 	if (sc->may_swap) {
 		long mapped_ratio;
@@ -1078,6 +1079,13 @@ force_reclaim_mapped:
 			if (!reclaim_mapped ||
 			    (total_swap_pages == 0 && PageAnon(page)) ||
 			    page_referenced(page, 0, sc->mem_cgroup)) {
+				/* deal with the case where there is no 
+ 				 * swap but an anonymous page would be
+ 				 * moved to the inactive list.
+ 				 */
+				if (!total_swap_pages && reclaim_mapped &&
+				    PageAnon(page))
+					inactivated_anon = 1;
 				list_add(&page->lru, &l_active);
 				continue;
 			}
@@ -1085,6 +1093,8 @@ force_reclaim_mapped:
 			list_add(&page->lru, &l_active);
 			continue;
 		}
+		if (PageAnon(page))
+			inactivated_anon = 1;
 		list_add(&page->lru, &l_inactive);
 	}
 
@@ -1146,6 +1156,7 @@ force_reclaim_mapped:
 	spin_unlock_irq(&zone->lru_lock);
 
 	pagevec_release(&pvec);
+	return inactivated_anon;
 }
 
 /*
@@ -1158,6 +1169,7 @@ static unsigned long shrink_zone(int pri
 	unsigned long nr_inactive;
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
+	bool inactivated_anon = 0;
 
 	/*
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
@@ -1184,7 +1196,8 @@ static unsigned long shrink_zone(int pri
 			nr_to_scan = min(nr_active,
 					(unsigned long)sc->swap_cluster_max);
 			nr_active -= nr_to_scan;
-			shrink_active_list(nr_to_scan, zone, sc, priority);
+			if (shrink_active_list(nr_to_scan, zone, sc, priority))
+				inactivated_anon = 1;
 		}
 
 		if (nr_inactive) {
@@ -1196,6 +1209,9 @@ static unsigned long shrink_zone(int pri
 		}
 	}
 
+	if (inactivated_anon)
+		mem_notify_userspace();
+
 	throttle_vm_writeout(sc->gfp_mask);
 	return nr_reclaimed;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mem notifications v3
  2007-12-24 20:32 [PATCH] mem notifications v3 Marcelo Tosatti
@ 2007-12-25  3:47 ` KOSAKI Motohiro
  2007-12-25  4:56   ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
                     ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25  3:47 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

Hi


> +/* maximum 5 notifications per second per cpu */
> +void mem_notify_userspace(void)
> +{
> +	unsigned long target;
> +	unsigned long now = jiffies;
> +
> +	target = __get_cpu_var(last_mem_notify) + (HZ/5);
> +
> +	if (time_after(now, target)) {
> +		__get_cpu_var(last_mem_notify) = now;
> +		mem_notify_status = 1;
> +		wake_up(&mem_wait);
> +	}
> +}

Hmm,
unfortunately, wake_up() wakes up all processes,
because
 1. the poll method uses poll_wait().
 2. poll_wait() uses add_wait_queue(), not add_wait_queue_exclusive().
 3. wake_up() wakes up one exclusive task *and* every item queued by add_wait_queue().

Conclusion:
the effect of this code is to wake up all processes HZ/5 * #cpus times under high memory pressure.
That is too much.


BTW: I propose add to poll_wait_exclusive() in kernel ;-p


> +		/* check if its not a spurious/stale notification */
> +		pages_high = pages_free = pages_reserve = 0;
> +		for_each_zone(zone) { 
> +			if (!populated_zone(zone) || is_highmem(zone))
> +				continue;

I think ignoring highmem is a very good improvement over the previous version :-D


> +			pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];

Hmm...
maybe this doesn't work well.

MAX_NR_ZONES is determined at compile time, by the distribution vendor's config,
but the real highest zone is determined by the machine's total memory.

ex.
CONFIG_HIGHMEM config on but total memory < 4GB.
CONFIG_DMA32 config on but total memory < 4GB.


> +		if (pages_free < (pages_high+pages_reserve)*2) 
> +			val = POLLIN;

Why did you choose the formula (pages_high+pages_reserve)*2 ?


> -static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> +static bool shrink_active_list(unsigned long nr_pages, struct zone *zone,
>  				struct scan_control *sc, int priority)

Unnecessary type change.
Calling mem_notify_userspace() directly in shrink_active_list() works well too,
because notification rate control can be implemented in mem_notify_userspace() and mem_notify_poll().

last_mem_notify works better.


/kosaki



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC] add poll_wait_exclusive() API
  2007-12-25  3:47 ` KOSAKI Motohiro
@ 2007-12-25  4:56   ` KOSAKI Motohiro
  2007-12-27 21:05     ` Marcelo Tosatti
  2007-12-25  8:31   ` [PATCH] mem notifications v3 KOSAKI Motohiro
  2007-12-27 20:13   ` [PATCH] mem notifications v3 Marcelo Tosatti
  2 siblings, 1 reply; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25  4:56 UTC (permalink / raw)
  To: linux-mm
  Cc: kosaki.motohiro, Marcelo Tosatti, Daniel Spång,
	Rik van Riel, Andrew Morton

Hi

There are two ways to add an item to a wait queue: add_wait_queue() and add_wait_queue_exclusive().
Unfortunately, a poll method is only able to use poll_wait().

poll_wait_exclusive() works similarly to add_wait_queue_exclusive().


caution:
  this patch is compile test only.
  my purpose is discussion only.


/kosaki


Index: b/fs/eventpoll.c
===================================================================
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -677,7 +677,7 @@ out_unlock:
  * target file wakeup lists.
  */
 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
-				 poll_table *pt)
+				 poll_table *pt, int exclusive)
 {
 	struct epitem *epi = ep_item_from_epqueue(pt);
 	struct eppoll_entry *pwq;
@@ -686,7 +686,10 @@ static void ep_ptable_queue_proc(struct 
 		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
 		pwq->whead = whead;
 		pwq->base = epi;
-		add_wait_queue(whead, &pwq->wait);
+		if (exclusive)
+			add_wait_queue_exclusive(whead, &pwq->wait);
+		else
+			add_wait_queue(whead, &pwq->wait);
 		list_add_tail(&pwq->llink, &epi->pwqlist);
 		epi->nwait++;
 	} else {
Index: b/fs/select.c
===================================================================
--- a/fs/select.c
+++ b/fs/select.c
@@ -48,7 +48,7 @@ struct poll_table_page {
  * poll table.
  */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
-		       poll_table *p);
+		       poll_table *p, int exclusive);
 
 void poll_initwait(struct poll_wqueues *pwq)
 {
@@ -117,7 +117,7 @@ static struct poll_table_entry *poll_get
 
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
-				poll_table *p)
+		       poll_table *p, int exclusive)
 {
 	struct poll_table_entry *entry = poll_get_entry(p);
 	if (!entry)
@@ -126,7 +126,10 @@ static void __pollwait(struct file *filp
 	entry->filp = filp;
 	entry->wait_address = wait_address;
 	init_waitqueue_entry(&entry->wait, current);
-	add_wait_queue(wait_address, &entry->wait);
+	if (exclusive)
+		add_wait_queue_exclusive(wait_address, &entry->wait);
+	else
+		add_wait_queue(wait_address, &entry->wait);
 }
 
 #define FDS_IN(fds, n)		(fds->in + n)
Index: b/include/linux/poll.h
===================================================================
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -28,18 +28,26 @@ struct poll_table_struct;
 /* 
  * structures and helpers for f_op->poll implementations
  */
-typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
+typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
+				struct poll_table_struct *, int exclusive);
 
 typedef struct poll_table_struct {
 	poll_queue_proc qproc;
 } poll_table;
 
-static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
+static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
 {
 	if (p && wait_address)
-		p->qproc(filp, wait_address, p);
+		p->qproc(filp, wait_address, p, 0);
 }
 
+static inline void poll_wait_exclusive(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
+{
+	if (p && wait_address)
+		p->qproc(filp, wait_address, p, 1);
+}
+
+
 static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 {
 	pt->qproc = qproc;



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mem notifications v3
  2007-12-25  3:47 ` KOSAKI Motohiro
  2007-12-25  4:56   ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
@ 2007-12-25  8:31   ` KOSAKI Motohiro
  2007-12-25 10:31     ` [RFC][patch 1/2] mem notifications v3 improvement for large system KOSAKI Motohiro
  2007-12-25 10:31     ` [RFC][patch 2/2] " KOSAKI Motohiro
  2007-12-27 20:13   ` [PATCH] mem notifications v3 Marcelo Tosatti
  2 siblings, 2 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25  8:31 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 16799 bytes --]

Hi Marcelo-san

I tested your patch.
Unfortunately, it doesn't work so well on a large system.

description:
--------------------------------------------------------------
test machine
  CPU: Itanium2 x 4
  MEM: 16GB(8GB node x 2 NUMA system)
  SWAP: 2GB

test program:
  mem_notify_test.c
     see attachement
  m.sh
  --------------
$ cat m.sh
#!/bin/sh

num=${1:-1}
mem=${2:-1}

echo $num $mem

for i in `seq 1 $num`; do
    ./mem_notify_test -m $mem &
done
--------------------------------------

1. run >10000 process test
   console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
   console2# sh m.sh 12500


Wed Dec 26 02:00:14 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 3  0      0   1561      7  12213    0    0    35   268   12  203  1  3 95  1  0
Wed Dec 26 02:00:15 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  0      0    602      7  13025    0    0    35   268   12  203  1  3 95  1  0

   !! here 7 sec soft lockup !!
Wed Dec 26 02:00:22 JST 2007   
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 8  1     12     42     68  13427    0    0    35   268   82  206  1  3 95  1  0
Wed Dec 26 02:00:23 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
69  0     17     38     64  13438    0    0    35   268   93  207  1  3 95  1  0
Wed Dec 26 02:00:24 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
4955  3     21     36     61  13443    0    0    35   268  103  208  1  3 95  1  0
Wed Dec 26 02:00:25 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
4138  2     28     42     57  13438    0    0    35   268  113  209  1  3 95  1  0
Wed Dec 26 02:00:26 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
5466  1     41     48    514  12975    0    0    35   269  119  211  1  3 95  1  0
Wed Dec 26 02:00:27 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
6082  0     78     38    896  12596    0    0    35   270  124  214  1  3 95  1  0
Wed Dec 26 02:00:28 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  5    132     38   1209  12280    0    0    35   271  128  217  1  3 95  1  0
Wed Dec 26 02:00:29 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  0    169     38   1484  12003    0    0    35   272  132  219  1  3 95  1  0
Wed Dec 26 02:00:30 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0 31    248     36   2651  10822    0    0    35   274  133  222  1  3 95  1  0
Wed Dec 26 02:00:32 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
1729  3    323     42   3138  10325    0    0    35   277  134  227  1  3 95  1  0
Wed Dec 26 02:00:33 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  2    410     50   3473   9968    0    0    35   279  134  230  1  3 95  1  0
Wed Dec 26 02:00:34 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
574  4    487     52   3737   9690    0    0    35   281  135  234  1  3 95  1  0
Wed Dec 26 02:00:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
5203  0    490     42   3794   9641    0    0    35   281  135  236  1  3 95  1  0
Wed Dec 26 02:00:37 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
2256  2    568     36   4025   9400    0    0    35   283  136  240  1  3 95  1  0
Wed Dec 26 02:00:38 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  3    631     55   4233   9167    0    0    35   285  136  243  1  3 95  1  0
Wed Dec 26 02:00:41 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
11  2    819     44   4675   8707    0    0    35   290  138  251  1  3 95  1  0
Wed Dec 26 02:00:42 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  4    876     48   4782   8588    0    0    35   292  138  254  1  3 95  1  0
Wed Dec 26 02:00:43 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  6    945     36   4923   8450    0    0    35   294  139  257  1  3 95  1  0
Wed Dec 26 02:00:44 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
859  6   1001     50   5030   8317    0    0    35   296  139  260  1  3 95  1  0
Wed Dec 26 02:00:46 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  2   1099     45   5190   8146    0    0    35   299  140  266  1  3 95  1  0
Wed Dec 26 02:00:47 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
2516  3   1190     47   5314   8009    0    0    35   301  141  270  1  3 95  1  0
Wed Dec 26 02:00:48 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
2141  3   1259     54   5406   7903    0    0    35   303  141  274  1  3 95  1  0
Wed Dec 26 02:00:49 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
3074  2   1314     44   5467   7844    0    0    35   305  141  277  1  3 95  1  0
Wed Dec 26 02:00:50 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  1   1314     45   5465   7840    0    0    35   305  142  278  1  3 95  1  0
Wed Dec 26 02:00:51 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0   1313     44   5466   7840    0    0    35   305  142  278  1  3 95  1  0

!! thundering herd restoration after 30sec at swap out start !!
   result: many swap out occured.

# pgrep mem_notify|wc -l
12193

about 300 process receive notify.

problems
  o thundering herd occurred multiple times, on and off.
  o soft lockup occurred.
  o too few processes received the notification.
  o swap out occurred.
  

2. after test1, run file I/O
   console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
   console2# dd if=/dev/zero of=tmp bs=100M count=10


$ LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
Wed Dec 26 02:21:35 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0   1615     51   6048   7235    0    0    34   281  158  265  1  3 95  1  0
Wed Dec 26 02:21:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0   1615     51   6048   7235    0    0    34   281  158  265  1  3 95  1  0
Wed Dec 26 02:21:37 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0   1615     51   6048   7235    0    0    34   281  158  265  1  3 95  1  0
Wed Dec 26 02:21:38 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0   1615     52   6048   7235    0    0    34   281  158  265  1  3 95  1  0
Wed Dec 26 02:21:39 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
124  6   1683     35   6174   7121    0    0    34   282  159  267  1  3 95  1  0
Wed Dec 26 02:21:40 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
1262  3   1738     53   6293   6982    0    0    34   284  159  270  1  3 95  1  0
Wed Dec 26 02:21:41 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
1546  1   1794     52   6404   6870    0    0    34   285  159  272  1  3 95  1  0
Wed Dec 26 02:21:42 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  5   1857     36   6525   6762    0    0    34   287  160  275  1  3 95  1  0
Wed Dec 26 02:21:43 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
3761  7   1896     35   6571   6718    0    0    34   289  160  276  1  3 95  1  0
Wed Dec 26 02:21:44 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  6   1898     43   6623   6654    0    0    34   291  160  277  1  3 95  1  0
Wed Dec 26 02:21:45 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  9   1921     36   6670   6614    0    0    34   293  160  279  1  3 95  1  0
Wed Dec 26 02:21:46 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
710  4   1944     52   6689   6582    0    0    34   294  161  280  1  3 95  1  0
Wed Dec 26 02:21:47 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  9   1958     42   6731   6549    0    0    34   295  161  281  1  3 95  1  0
Wed Dec 26 02:21:48 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  6   1978     44   6782   6498    0    0    34   297  161  284  1  3 95  1  0

!! time leap 4 sec !!
Wed Dec 26 02:21:52 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0 10   2014     47   6864   6414    0    0    34   301  162  289  1  3 95  1  0
Wed Dec 26 02:21:53 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0 10   2016     38   6881   6407    0    0    34   303  162  290  1  3 95  1  0
Wed Dec 26 02:21:54 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  5   2020     45   6884   6399    0    0    34   303  162  291  1  3 95  1  0
Wed Dec 26 02:21:56 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  7   2039     43   6932   6359    0    0    34   303  162  295  1  3 95  1  0
Wed Dec 26 02:21:57 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  9   2047     36   6777   6529    0    0    34   306  162  297  1  3 95  1  0
Wed Dec 26 02:21:58 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
1501  1   2047     88   6699   6569    0    0    34   307  163  301  1  3 95  1  0
Wed Dec 26 02:21:59 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  5   2047     39   6588   6733    0    0    34   307  164  302  1  3 95  1  0
Wed Dec 26 02:22:00 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  4   2047     42   6275   7035    0    0    34   308  164  303  1  3 95  1  0
Wed Dec 26 02:22:01 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  3   2047     41   6277   7036    0    0    34   310  164  303  1  3 95  1  0
Wed Dec 26 02:22:02 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0   2047     42   6277   7036    0    0    34   310  164  303  1  3 95  1  0
Wed Dec 26 02:22:03 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  1   2047     44   6277   7035    0    0    34   313  164  303  1  3 95  1  0
Wed Dec 26 02:22:04 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  1   2047     46   6277   7035    0    0    34   314  164  303  1  3 95  1  0
Wed Dec 26 02:22:05 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  1   2047     46   6277   7035    0    0    34   316  164  303  1  3 95  1  0
Wed Dec 26 02:22:06 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0   2047     51   6277   7035    0    0    34   316  164  302  1  3 95  1  0
Wed Dec 26 02:22:07 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0   2047     54   6277   7035    0    0    34   316  164  302  1  3 95  1  0

Some processes received the notification and exited,
but too few, and it didn't prevent swap out.

[kosaki@n3234224 mem_notify]$ pgrep mem_notify|wc -l
11992

   -> about 200 process receive notify.
      requirement is 1000 process(= 1GB / 1MB).


/kosaki

[-- Attachment #2: mem_notify_test.c --]
[-- Type: application/octet-stream, Size: 1773 bytes --]

#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <sys/mman.h>
#include <pthread.h>
#include <unistd.h>
#include <string.h>

/* Default amount of anonymous memory to allocate, in MB. */
#define DEFAULT_ALLOC_SIZE (1)
/* Allocation size in MB; main() overrides it from the -m command-line option. */
long alloc_size = DEFAULT_ALLOC_SIZE;

/*
 * Test client for /dev/mem_notify.
 *
 * Opens the device, maps and touches alloc_size MB of anonymous memory, then
 * blocks in poll() until the device reports an event. When an event arrives
 * it performs one read() on the device, prints the poll result with a
 * timestamp, and exits.
 *
 * Options:
 *   -m <MB>   number of megabytes to allocate (must be a positive integer
 *             whose byte count fits in size_t); defaults to alloc_size.
 *
 * Every path terminates through exit(1), exactly as the original test did.
 */
int main(int argc, char** argv)
{
        const size_t bytes_per_mb = (size_t)1024 * 1024;
        /* Largest MB count whose byte size still fits in size_t. */
        const size_t max_mb = (size_t)-1 / bytes_per_mb;
        struct pollfd polltable;
        int fd;
        int err;
        void* ptr = NULL;
        int c;
        size_t size;

        while((c = getopt(argc, argv, "m:")) != -1){
                switch(c){
                case 'm': {
                        char *end;
                        long mb = strtol(optarg, &end, 10);

                        /*
                         * Reject empty, non-numeric, non-positive and
                         * overflowing values; strtol() saturates at LONG_MAX
                         * on overflow, which the max_mb bound also catches.
                         */
                        if( end == optarg || *end != '\0' || mb <= 0 ||
                            (size_t)mb > max_mb ){
                                fprintf(stderr, "invalid -m value: %s\n", optarg);
                                exit(1);
                        }
                        alloc_size = mb;
                        break;
                }
                default:
                        break;
                }
        }

        fd = open("/dev/mem_notify", O_RDONLY);
        if( fd < 0 ){
                perror("open ");
                exit(1);
        }

        printf("try %ld MB\n", alloc_size);
        size = (size_t)alloc_size * bytes_per_mb;

        /* Anonymous mappings take fd -1; the previous fd 0 was not portable. */
        ptr = mmap(NULL, size, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANON|MAP_POPULATE, -1, 0);
        if( ptr == MAP_FAILED ){
                /* Never hand MAP_FAILED to memset(). */
                perror("mmap ");
                exit(1);
        }
        /* Touch every page so the memory is really backed by anonymous pages. */
        memset(ptr, 0, size);

        polltable.fd = fd;
        polltable.events = POLLIN;
        /* poll() leaves revents untouched on failure, so give it a defined value. */
        polltable.revents = 0;
        err = poll(&polltable, 1, -1);
        if( err < 0 ){
                perror("poll ");
        }
        if(polltable.revents){
                time_t atime;
                int readbuf;

                err = read(fd, &readbuf, sizeof(int));
                printf("read %d\n", err);

                atime = time(NULL);
                printf("poll ret %x %s\n", polltable.revents, ctime(&atime));
                exit(1);
        }

        printf("mem_notify exit\n");
        exit(1);
}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC][patch 1/2] mem notifications v3 improvement for large system
  2007-12-25  8:31   ` [PATCH] mem notifications v3 KOSAKI Motohiro
@ 2007-12-25 10:31     ` KOSAKI Motohiro
  2007-12-27 21:04       ` Marcelo Tosatti
  2007-12-25 10:31     ` [RFC][patch 2/2] " KOSAKI Motohiro
  1 sibling, 1 reply; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 10:31 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

Hi

I tried to resolve the problem of too few notifications.

The mem_notify_status global variable means that only 1 process is woken up.
That is too few.

Improvement step 1:
- add a read method and wake up all processes.

1. run >10000 process test
   console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
   console2# sh m.sh 12500

result:
 - waking up all processes caused neither a thundering herd nor a soft lock-up.
 - no swap-out occurred.
 - but too much memory was freed ;-)
   In my test case, over 5GB was freed.


Wed Dec 26 03:19:20 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 7  0      0    605    209  12778    0    0   143    11 1458  183 14 10 76  1  0
Wed Dec 26 03:19:21 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 6  0      0   2687    209  10769    0    0   142    11 1459  188 14 10 75  1  0
Wed Dec 26 03:19:22 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  0      0   4560    209   8968    0    0   142    11 1459  191 14 10 75  1  0
Wed Dec 26 03:19:23 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5857    209   7724    0    0   142    11 1457  192 14 10 75  1  0
Wed Dec 26 03:19:24 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5872    209   7724    0    0   141    11 1454  192 14 10 75  1  0
Wed Dec 26 03:19:25 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5884    209   7724    0    0   141    11 1451  192 14 10 75  1  0
Wed Dec 26 03:19:26 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5895    209   7724    0    0   140    11 1448  191 14 10 75  1  0
Wed Dec 26 03:19:27 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5904    209   7724    0    0   140    11 1445  191 14 10 75  1  0
Wed Dec 26 03:19:28 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5912    209   7724    0    0   140    11 1442  190 13 10 75  1  0
Wed Dec 26 03:19:29 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5920    209   7724    0    0   139    11 1439  190 13 10 75  1  0
Wed Dec 26 03:19:30 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  1      0   5929    209   7724    0    0   139    11 1436  189 13 10 75  1  0
Wed Dec 26 03:19:32 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5935    209   7724    0    0   139    11 1433  189 13 10 75  1  0
Wed Dec 26 03:19:33 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 1  0      0   5940    209   7724    0    0   138    11 1430  188 13 10 75  1  0
Wed Dec 26 03:19:34 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  1      0   5948    209   7725    0    0   138    11 1427  188 13 10 75  1  0
Wed Dec 26 03:19:35 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0      0   5676    209   8005    0    0   138    11 1425  188 13 10 75  1  0
Wed Dec 26 03:19:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  1      0   5676    209   8006    0    0   137    11 1422  188 13 10 75  1  0


Index: linux-2.6.23-mem_notify_v3/mm/mem_notify.c
===================================================================
--- linux-2.6.23-mem_notify_v3.orig/mm/mem_notify.c
+++ linux-2.6.23-mem_notify_v3/mm/mem_notify.c
@@ -13,7 +13,11 @@
 #include <linux/percpu.h>
 #include <linux/timer.h>

-static unsigned long mem_notify_status = 0;
+struct mem_notify_file_info {
+        long          last_event;
+};
+
+atomic_t mem_notify_event = ATOMIC_INIT(0);

 static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
 static DEFINE_PER_CPU(unsigned long, last_mem_notify) = INITIAL_JIFFIES;
@@ -28,53 +32,81 @@ void mem_notify_userspace(void)

        if (time_after(now, target)) {
                __get_cpu_var(last_mem_notify) = now;
-               mem_notify_status = 1;
+               atomic_inc(&mem_notify_event);
                wake_up(&mem_wait);
        }
 }

 static int mem_notify_open(struct inode *inode, struct file *file)
 {
-       return 0;
+        struct mem_notify_file_info *ptr;
+        int    err = 0;
+
+        ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
+        if (!ptr) {
+                err = -ENOMEM;
+                goto out;
+        }
+
+        ptr->last_event = atomic_read(&mem_notify_event);
+        file->private_data = ptr;
+
+out:
+        return err;
 }

 static int mem_notify_release(struct inode *inode, struct file *file)
 {
+        kfree(file->private_data);
+
        return 0;
 }

 static unsigned int mem_notify_poll(struct file *file, poll_table *wait)
 {
        unsigned int val = 0;
+       struct zone *zone;
+       int pages_high, pages_free, pages_reserve;
+        struct mem_notify_file_info *file_info = file->private_data;

        poll_wait(file, &mem_wait, wait);

-       if (mem_notify_status) {
-               struct zone *zone;
-               int pages_high, pages_free, pages_reserve;
-
-               mem_notify_status = 0;
-
-               /* check if its not a spurious/stale notification */
-               pages_high = pages_free = pages_reserve = 0;
-               for_each_zone(zone) {
-                       if (!populated_zone(zone) || is_highmem(zone))
-                               continue;
-                       pages_high += zone->pages_high;
-                       pages_free += zone_page_state(zone, NR_FREE_PAGES);
-                       pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
-               }
+        if (file_info->last_event == atomic_read(&mem_notify_event))
+                goto out;

-               if (pages_free < (pages_high+pages_reserve)*2)
-                       val = POLLIN;
+       /* check if its not a spurious/stale notification */
+       pages_high = pages_free = pages_reserve = 0;
+       for_each_zone(zone) {
+               if (!populated_zone(zone) || is_highmem(zone))
+                       continue;
+               pages_high += zone->pages_high;
+               pages_free += zone_page_state(zone, NR_FREE_PAGES);
+               pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
        }
-
+
+       if (pages_free < (pages_high+pages_reserve)*2)
+               val = POLLIN;
+
+out:
        return val;
 }

+static ssize_t mem_notify_read(struct file *file, char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+        struct mem_notify_file_info *file_info = file->private_data;
+        if (!file_info)
+                return -EINVAL;
+
+        file_info->last_event = atomic_read(&mem_notify_event);
+
+        return 0;
+}
+
 struct file_operations mem_notify_fops = {
        .open = mem_notify_open,
        .release = mem_notify_release,
        .poll = mem_notify_poll,
+        .read = mem_notify_read,
 };
 EXPORT_SYMBOL(mem_notify_fops);






/kosaki

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC][patch 2/2] mem notifications v3 improvement for large system
  2007-12-25  8:31   ` [PATCH] mem notifications v3 KOSAKI Motohiro
  2007-12-25 10:31     ` [RFC][patch 1/2] mem notifications v3 improvement for large system KOSAKI Motohiro
@ 2007-12-25 10:31     ` KOSAKI Motohiro
  2007-12-25 10:41       ` KOSAKI Motohiro
  2007-12-27  4:49       ` [RFC][patch] mem_notify more faster reduce load average KOSAKI Motohiro
  1 sibling, 2 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 10:31 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

2nd improvement
  - add wakeup rate control

1. run >10000 process test
   console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
   console2# sh m.sh 12500

result:
   - no swap-out occurred.
   - no time leap occurred.
   - the maximum run queue length shrank to about 1/10.
   - memory was no longer freed excessively.

very good.





Wed Dec 26 04:23:10 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 4  0      0   4122    190   9890    0    0   207    15  297  113 17  6 75  2  0
Wed Dec 26 04:23:11 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 3  0      0   3038    190  10809    0    0   206    15  299  117 17  7 75  2  0
Wed Dec 26 04:23:12 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  0      0   2004    190  11687    0    0   206    15  301  120 17  7 75  2  0
Wed Dec 26 04:23:13 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  0      0   1009    190  12530    0    0   205    15  303  124 17  7 74  2  0
Wed Dec 26 04:23:14 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  0      0     69    190  13327    0    0   204    15  305  127 17  7 74  2  0
Wed Dec 26 04:23:15 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
1109  0      0     88    199  13294    0    0   203    15  404  297 17  7 74  2  0
Wed Dec 26 04:23:16 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
285  0      0     86    199  13295    0    0   203    15  404  541 17  7 74  2  0
Wed Dec 26 04:23:17 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
258  0      0     88    199  13294    0    0   202    15  404  779 17  7 74  2  0
Wed Dec 26 04:23:18 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
185  0      0     88    199  13294    0    0   201    15  403 1012 17  7 74  2  0
Wed Dec 26 04:23:19 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
454  0      0     87    199  13296    0    0   200    15  403 1240 17  7 74  2  0
Wed Dec 26 04:23:21 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
216  0      0     87    199  13295    0    0   200    15  403 1463 17  7 74  2  0
Wed Dec 26 04:23:22 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
402  0      0     87    199  13297    0    0   199    15  403 1681 17  7 74  2  0
Wed Dec 26 04:23:23 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
716  0      0     86    199  13293    0    0   198    15  403 1893 17  7 74  2  0
Wed Dec 26 04:23:24 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
131  0      0     86    199  13294    0    0   197    15  402 2101 17  7 74  2  0
Wed Dec 26 04:23:25 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
417  0      0     87    199  13294    0    0   197    14  402 2301 17  8 74  2  0
Wed Dec 26 04:23:26 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
42  0      0     87    199  13294    0    0   196    14  402 2502 17  8 74  2  0
Wed Dec 26 04:23:27 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
968  0      0     88    199  13291    0    0   195    14  402 2697 17  8 74  2  0
Wed Dec 26 04:23:28 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
335  0      0     86    199  13295    0    0   195    14  402 2887 17  8 74  2  0
Wed Dec 26 04:23:29 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
386  0      0     87    199  13293    0    0   194    14  401 3071 17  8 74  2  0
Wed Dec 26 04:23:30 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
658  0      0     89    199  13292    0    0   193    14  401 3254 17  8 74  2  0
Wed Dec 26 04:23:31 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
72  0      0     87    199  13295    0    0   192    14  401 3439 16  8 74  2  0
Wed Dec 26 04:23:32 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
697  0      0     86    199  13295    0    0   192    14  401 3612 16  8 74  2  0
Wed Dec 26 04:23:33 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
289  0      0     87    199  13293    0    0   191    14  400 3780 16  8 74  2  0
Wed Dec 26 04:23:34 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
633  0      0     87    199  13294    0    0   190    14  400 3944 16  8 74  2  0
Wed Dec 26 04:23:35 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 2  0      0     86    199  13295    0    0   190    14  400 4101 16  8 74  2  0
Wed Dec 26 04:23:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
94  1      0     88    199  13293    0    0   189    14  400 4253 16  8 74  2  0
Wed Dec 26 04:23:37 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
384  0      0     88    199  13293    0    0   188    14  400 4402 16  8 74  2  0
Wed Dec 26 04:23:38 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
256  0      0     86    199  13293    0    0   188    14  399 4546 16  8 74  2  0
Wed Dec 26 04:23:39 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     90    199  13288    0    0   187    14  399 4686 16  8 74  2  0
Wed Dec 26 04:23:40 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     90    199  13288    0    0   187    14  398 4822 16  8 74  2  0
Wed Dec 26 04:23:41 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
 0  1      0     90    199  13288    0    0   186    14  398 4953 16  8 74  2  0
Wed Dec 26 04:23:42 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
289  0      0     91    199  13288    0    0   185    14  397 5077 16  8 74  2  0



$ quilt diff
Index: linux-2.6.23-mem_notify_v3/mm/mem_notify.c
===================================================================
--- linux-2.6.23-mem_notify_v3.orig/mm/mem_notify.c
+++ linux-2.6.23-mem_notify_v3/mm/mem_notify.c
@@ -12,6 +12,9 @@
 #include <linux/vmstat.h>
 #include <linux/percpu.h>
 #include <linux/timer.h>
+#include <linux/delay.h>
+
+#define MSLEEP_BONUS_SHIFT 4

 struct mem_notify_file_info {
         long          last_event;
@@ -20,7 +23,9 @@ struct mem_notify_file_info {
 atomic_t mem_notify_event = ATOMIC_INIT(0);

 static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
-static DEFINE_PER_CPU(unsigned long, last_mem_notify) = INITIAL_JIFFIES;
+static atomic_long_t last_mem_notify = ATOMIC_LONG_INIT(INITIAL_JIFFIES);
+static atomic_long_t last_task_wakeup = ATOMIC_LONG_INIT(INITIAL_JIFFIES);
+static atomic_t mem_notify_timeout_bonus = ATOMIC_INIT(0);

 /* maximum 5 notifications per second per cpu */
 void mem_notify_userspace(void)
@@ -28,10 +33,10 @@ void mem_notify_userspace(void)
        unsigned long target;
        unsigned long now = jiffies;

-       target = __get_cpu_var(last_mem_notify) + (HZ/5);
+       target = atomic_long_read(&last_mem_notify) + (HZ/5);

        if (time_after(now, target)) {
-               __get_cpu_var(last_mem_notify) = now;
+               atomic_long_set(&last_mem_notify, now);
                atomic_inc(&mem_notify_event);
                wake_up(&mem_wait);
        }
@@ -68,12 +73,35 @@ static unsigned int mem_notify_poll(stru
        struct zone *zone;
        int pages_high, pages_free, pages_reserve;
         struct mem_notify_file_info *file_info = file->private_data;
+       unsigned long bonus;
+       unsigned long now;
+       unsigned long last;

        poll_wait(file, &mem_wait, wait);

         if (file_info->last_event == atomic_read(&mem_notify_event))
                 goto out;

+retry:
+       /* Ugly trick:
+          when too many task wakeup,
+          control function exit rate for prevent too much freed.
+       */
+       now = jiffies;
+       last = (unsigned long)atomic_long_read(&last_task_wakeup);
+        if (time_before_eq(now, last)) {
+               bonus = atomic_read(&mem_notify_timeout_bonus) >>
+                       MSLEEP_BONUS_SHIFT;
+                msleep_interruptible(1 + bonus);
+               set_current_state(TASK_INTERRUPTIBLE);
+                if (signal_pending(current))
+                        goto out;
+                atomic_inc(&mem_notify_timeout_bonus);
+                goto retry;
+        }
+        atomic_set(&mem_notify_timeout_bonus, 0);
+        atomic_long_set(&last_task_wakeup, now);
+
        /* check if its not a spurious/stale notification */
        pages_high = pages_free = pages_reserve = 0;
        for_each_zone(zone) {


/kosaki


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][patch 2/2] mem notifications v3 improvement for large system
  2007-12-25 10:31     ` [RFC][patch 2/2] " KOSAKI Motohiro
@ 2007-12-25 10:41       ` KOSAKI Motohiro
  2007-12-27  4:49       ` [RFC][patch] mem_notify more faster reduce load average KOSAKI Motohiro
  1 sibling, 0 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-25 10:41 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

2nd improvement
test2

2. after test1, run file I/O
   console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
   console2# dd if=/dev/zero of=tmp bs=100M count=10

result:
   - no swap-out occurred.
   - the page cache increased by about 1GB.
   - about 1GB of anonymous memory was freed.

very good!


$ pgrep mem_notify|wc -l
11079
$ dd if=/dev/zero of=tmp1 bs=100M count=10
$ pgrep mem_notify|wc -l
10307

$ LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M ; done
Wed Dec 26 04:36:19 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     70     42    211    0    0    54   425  785 3145  5  4 89  1  0
Wed Dec 26 04:36:20 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     70     42    211    0    0    54   424  784 3142  5  4 89  1  0
Wed Dec 26 04:36:21 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     70     42    211    0    0    54   424  784 3139  5  4 89  1  0
Wed Dec 26 04:36:22 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     70     42    211    0    0    54   424  783 3136  5  4 89  1  0
Wed Dec 26 04:36:23 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  1      0     70     42    211    0    0    54   423  783 3133  5  4 89  1  0
Wed Dec 26 04:36:24 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     70     42    211    0    0    54   423  782 3130  5  4 89  1  0
Wed Dec 26 04:36:25 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     70     42    211    0    0    54   422  782 3128  5  4 89  1  0
Wed Dec 26 04:36:26 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  0      0     70     42    211    0    0    54   422  781 3125  5  4 89  1  0
Wed Dec 26 04:36:35 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
820  6      0     89     45   1052    0    0    53   482 1133 3466  5  5 89  1  0
Wed Dec 26 04:36:36 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
355  6      0     87     45   1124    0    0    53   497 1132 3521  5  5 89  1  0
Wed Dec 26 04:36:37 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
241  6      0     88     45   1188    0    0    53   512 1132 3576  5  5 89  1  0
Wed Dec 26 04:36:38 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  3      0     93     45   1208    0    0    53   529 1131 3632  5  5 89  1  0
Wed Dec 26 04:36:39 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 0  4      0     93     45   1208    0    0    53   545 1130 3687  5  5 89  1  0
Wed Dec 26 04:36:40 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
830  4      0     93     45   1208    0    0    53   560 1129 3741  5  5 89  2  0
Wed Dec 26 04:36:41 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
103  4      0     93     45   1208    0    0    53   575 1128 3794  5  5 89  2  0
Wed Dec 26 04:36:42 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
133  4      0     94     45   1208    0    0    53   587 1128 3846  5  5 89  2  0
Wed Dec 26 04:36:43 JST 2007
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
59  4      0     97     45   1208    0    0    53   603 1127 3898  5  5 88  2  0


/kosaki

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [RFC][patch] mem_notify more faster reduce load average
  2007-12-25 10:31     ` [RFC][patch 2/2] " KOSAKI Motohiro
  2007-12-25 10:41       ` KOSAKI Motohiro
@ 2007-12-27  4:49       ` KOSAKI Motohiro
  1 sibling, 0 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-27  4:49 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

Hi, Marcelo-san

This patch is a small improvement on top of my mem notifications large-system patch.
My original patch was too slow to reduce the load average after free memory increased.
This patch fixes that.



Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>

Index: linux-2.6.23-mem_notify_v3/mm/mem_notify.c
===================================================================
--- linux-2.6.23-mem_notify_v3.orig/mm/mem_notify.c
+++ linux-2.6.23-mem_notify_v3/mm/mem_notify.c
@@ -20,7 +20,8 @@ struct mem_notify_file_info {
         long          last_event;
 };

-atomic_t mem_notify_event = ATOMIC_INIT(0);
+static atomic_t mem_notify_event = ATOMIC_INIT(0);
+static atomic_t mem_notify_event_end = ATOMIC_INIT(0);

 static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
 static atomic_long_t last_mem_notify = ATOMIC_LONG_INIT(INITIAL_JIFFIES);
@@ -76,13 +77,18 @@ static unsigned int mem_notify_poll(stru
        unsigned long bonus;
        unsigned long now;
        unsigned long last;
+       unsigned long event;

        poll_wait(file, &mem_wait, wait);

-        if (file_info->last_event == atomic_read(&mem_notify_event))
+retry:
+       event = atomic_read(&mem_notify_event);
+       if (event == file_info->last_event)
                 goto out;

-retry:
+       if (event == atomic_read(&mem_notify_event_end))
+               goto out;
+
        /* Ugly trick:
           when too many task wakeup,
           control function exit rate for prevent too much freed.
@@ -114,6 +120,8 @@ retry:

        if (pages_free < (pages_high+pages_reserve)*2)
                val = POLLIN;
+       else
+               atomic_set(&mem_notify_event_end, event);

 out:



- kosaki




--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mem notifications v3
  2007-12-25  3:47 ` KOSAKI Motohiro
  2007-12-25  4:56   ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
  2007-12-25  8:31   ` [PATCH] mem notifications v3 KOSAKI Motohiro
@ 2007-12-27 20:13   ` Marcelo Tosatti
  2007-12-28  1:44     ` KOSAKI Motohiro
  2 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2007-12-27 20:13 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Marcelo Tosatti, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

Hi Kosaki,

On Tue, Dec 25, 2007 at 12:47:49PM +0900, KOSAKI Motohiro wrote:
> Hi
> 
> 
> > +/* maximum 5 notifications per second per cpu */
> > +void mem_notify_userspace(void)
> > +{
> > +	unsigned long target;
> > +	unsigned long now = jiffies;
> > +
> > +	target = __get_cpu_var(last_mem_notify) + (HZ/5);
> > +
> > +	if (time_after(now, target)) {
> > +		__get_cpu_var(last_mem_notify) = now;
> > +		mem_notify_status = 1;
> > +		wake_up(&mem_wait);
> > +	}
> > +}
> 
> Hmm,
> unfortunately, wake_up() wakes up all processes.
> because
>  1. poll method use poll_wait().
>  2. poll_wait() not add_wait_queue_exclusive() but add_wait_queue() is used. 
>  3. wake_up() function wakes up 1 task *and* all items queued by add_wait_queue().
> 
> Conclusion:
> this code intention wakeup all process HZ/5 * #cpus times at high memory pressure.
> it is too much.
> 
> 
> BTW: I propose add to poll_wait_exclusive() in kernel ;-p
> 
> 
> > +		/* check if its not a spurious/stale notification */
> > +		pages_high = pages_free = pages_reserve = 0;
> > +		for_each_zone(zone) { 
> > +			if (!populated_zone(zone) || is_highmem(zone))
> > +				continue;
> 
> i think ignoring highmem is a very good improvement over the previous version :-D
> 
> 
> > +			pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
> 
> Hmm...
> may be, don't works well.
> 
> MAX_NR_ZONES determined at compile time and determined by distribution vendor.
> but real highest zone is determined by box total memory.
> 
> ex.
> CONFIG_HIGHMEM config on but total memory < 4GB.
> CONFIG_DMA32 config on but total memory < 4GB.

That is OK because the calculation of lowmem reserves will take into account 
all zones (mm/page_alloc.c::setup_per_zone_lowmem_reserve).

But it might be better to use the precalculated totalreserve_pages instead.

> 
> > +		if (pages_free < (pages_high+pages_reserve)*2) 
> > +			val = POLLIN;
> 
> > why did you choose the formula (pages_high+pages_reserve)*2 ?

Just to make sure its not sending a spurious notification in the case the system
has enough free memory already.

> > -static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > +static bool shrink_active_list(unsigned long nr_pages, struct zone *zone,
> >  				struct scan_control *sc, int priority)
> 
> unnecessary type change.
> if directly call mem_notify_userspace() in shrink_active_list, works well too.
> because notify rate control can implement by mem_notify_userspace() and mem_notify_poll().

Yes, and doing that should also guarantee that the notification is sent
before swapout is performed (right now it sends the notification after
shrink_inactive_list(), which is performing swapout).

> last_mem_notify works better.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][patch 1/2] mem notifications v3 improvement for large system
  2007-12-25 10:31     ` [RFC][patch 1/2] mem notifications v3 improvement for large system KOSAKI Motohiro
@ 2007-12-27 21:04       ` Marcelo Tosatti
  2007-12-28  0:38         ` KOSAKI Motohiro
  0 siblings, 1 reply; 13+ messages in thread
From: Marcelo Tosatti @ 2007-12-27 21:04 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Marcelo Tosatti, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

On Tue, Dec 25, 2007 at 07:31:14PM +0900, KOSAKI Motohiro wrote:
> Hi
> 
> I tried resolve too few notification problem.
> 
> mem_notify_status global variable mean wakeup 1 process.
> it is too few.
> 
> improvement step1:
> - add read method and wake up all process.
> 
> 1. run >10000 process test
>    console1# LANG=C; while [ 1 ] ;do sleep 1; date; vmstat 1 1 -S M -a; done
>    console2# sh m.sh 12500
> 
> result:
>  - wakeup all unoccur neither thundering herd nor soft lock-up.
>  - no swap out occured.
>  - but too much free ;-)
>    in my test-case, over 5GB freed.
> 
> 
> Wed Dec 26 03:19:20 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  7  0      0    605    209  12778    0    0   143    11 1458  183 14 10 76  1  0
> Wed Dec 26 03:19:21 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  6  0      0   2687    209  10769    0    0   142    11 1459  188 14 10 75  1  0
> Wed Dec 26 03:19:22 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  2  0      0   4560    209   8968    0    0   142    11 1459  191 14 10 75  1  0
> Wed Dec 26 03:19:23 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5857    209   7724    0    0   142    11 1457  192 14 10 75  1  0
> Wed Dec 26 03:19:24 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5872    209   7724    0    0   141    11 1454  192 14 10 75  1  0
> Wed Dec 26 03:19:25 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5884    209   7724    0    0   141    11 1451  192 14 10 75  1  0
> Wed Dec 26 03:19:26 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5895    209   7724    0    0   140    11 1448  191 14 10 75  1  0
> Wed Dec 26 03:19:27 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5904    209   7724    0    0   140    11 1445  191 14 10 75  1  0
> Wed Dec 26 03:19:28 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5912    209   7724    0    0   140    11 1442  190 13 10 75  1  0
> Wed Dec 26 03:19:29 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5920    209   7724    0    0   139    11 1439  190 13 10 75  1  0
> Wed Dec 26 03:19:30 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  1      0   5929    209   7724    0    0   139    11 1436  189 13 10 75  1  0
> Wed Dec 26 03:19:32 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5935    209   7724    0    0   139    11 1433  189 13 10 75  1  0
> Wed Dec 26 03:19:33 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  1  0      0   5940    209   7724    0    0   138    11 1430  188 13 10 75  1  0
> Wed Dec 26 03:19:34 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  2  1      0   5948    209   7725    0    0   138    11 1427  188 13 10 75  1  0
> Wed Dec 26 03:19:35 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  0  0      0   5676    209   8005    0    0   138    11 1425  188 13 10 75  1  0
> Wed Dec 26 03:19:36 JST 2007
> procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
>  r  b   swpd   free  inact active   si   so    bi    bo   in   cs us sy id wa st
>  0  1      0   5676    209   8006    0    0   137    11 1422  188 13 10 75  1  0
> 
> 
> Index: linux-2.6.23-mem_notify_v3/mm/mem_notify.c
> ===================================================================
> --- linux-2.6.23-mem_notify_v3.orig/mm/mem_notify.c
> +++ linux-2.6.23-mem_notify_v3/mm/mem_notify.c
> @@ -13,7 +13,11 @@
>  #include <linux/percpu.h>
>  #include <linux/timer.h>
> 
> -static unsigned long mem_notify_status = 0;
> +struct mem_notify_file_info {
> +        long          last_event;
> +};
> +
> +atomic_t mem_notify_event = ATOMIC_INIT(0);
> 
>  static DECLARE_WAIT_QUEUE_HEAD(mem_wait);
>  static DEFINE_PER_CPU(unsigned long, last_mem_notify) = INITIAL_JIFFIES;
> @@ -28,53 +32,81 @@ void mem_notify_userspace(void)
> 
>         if (time_after(now, target)) {
>                 __get_cpu_var(last_mem_notify) = now;
> -               mem_notify_status = 1;
> +               atomic_inc(&mem_notify_event);
>                 wake_up(&mem_wait);
>         }
>  }
> 
>  static int mem_notify_open(struct inode *inode, struct file *file)
>  {
> -       return 0;
> +        struct mem_notify_file_info *ptr;
> +        int    err = 0;
> +
> +        ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
> +        if (!ptr) {
> +                err = -ENOMEM;
> +                goto out;
> +        }
> +
> +        ptr->last_event = atomic_read(&mem_notify_event);
> +        file->private_data = ptr;
> +
> +out:
> +        return err;
>  }
> 
>  static int mem_notify_release(struct inode *inode, struct file *file)
>  {
> +        kfree(file->private_data);
> +
>         return 0;
>  }
> 
>  static unsigned int mem_notify_poll(struct file *file, poll_table *wait)
>  {
>         unsigned int val = 0;
> +       struct zone *zone;
> +       int pages_high, pages_free, pages_reserve;
> +        struct mem_notify_file_info *file_info = file->private_data;
> 
>         poll_wait(file, &mem_wait, wait);
> 
> -       if (mem_notify_status) {
> -               struct zone *zone;
> -               int pages_high, pages_free, pages_reserve;
> -
> -               mem_notify_status = 0;

By clearing mem_notify_status you avoid other processes going
through mem_notify_poll() from having a spurious notification if
memory is not exhausted. So the real point of notification is the
call from shrink_zone() (or shrink_active_list()), not the pages <
(pages_high+pages_reserve)*2 formula.

So something like the following sounds better:

- have your poll_wait_exclusive() patch in place
- pass a "status" parameter to mem_notify_userspace() and have it clear
mem_notify_status in case status is zero, so to stop sending POLLIN to processes.
- call mem_notify_userspace(0) from mm/vmscan.c when ZONE_NORMAL reclaim_mapped 
is false (that seems a good indication that VM is out of trouble).
- test for mem_notify_status in mem_notify_poll(), but do not clear it.
- at mem_notify_userspace(), use wake_up_nr(number of mem_notify users/10) (10
meaning a small percentage of registered users).


> -
> -               /* check if its not a spurious/stale notification */
> -               pages_high = pages_free = pages_reserve = 0;
> -               for_each_zone(zone) {
> -                       if (!populated_zone(zone) || is_highmem(zone))
> -                               continue;
> -                       pages_high += zone->pages_high;
> -                       pages_free += zone_page_state(zone, NR_FREE_PAGES);
> -                       pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
> -               }
> +        if (file_info->last_event == atomic_read(&mem_notify_event))
> +                goto out;

What exactly are you trying to deal with by using last_event?

> 
> -               if (pages_free < (pages_high+pages_reserve)*2)
> -                       val = POLLIN;
> +       /* check if its not a spurious/stale notification */
> +       pages_high = pages_free = pages_reserve = 0;
> +       for_each_zone(zone) {
> +               if (!populated_zone(zone) || is_highmem(zone))
> +                       continue;
> +               pages_high += zone->pages_high;
> +               pages_free += zone_page_state(zone, NR_FREE_PAGES);
> +               pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
>         }
> -
> +
> +       if (pages_free < (pages_high+pages_reserve)*2)
> +               val = POLLIN;
> +
> +out:
>         return val;
>  }
> 
> +static ssize_t mem_notify_read(struct file *file, char __user *buf,
> +                               size_t count, loff_t *ppos)
> +{
> +        struct mem_notify_file_info *file_info = file->private_data;
> +        if (!file_info)
> +                return -EINVAL;
> +
> +        file_info->last_event = atomic_read(&mem_notify_event);
> +
> +        return 0;
> +}

You're then using read() to affect poll() behavior. I don't like it.

> +
>  struct file_operations mem_notify_fops = {
>         .open = mem_notify_open,
>         .release = mem_notify_release,
>         .poll = mem_notify_poll,
> +        .read = mem_notify_read,
>  };
>  EXPORT_SYMBOL(mem_notify_fops);
> 
> 
> 
> 
> 
> 
> /kosaki

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC] add poll_wait_exclusive() API
  2007-12-25  4:56   ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
@ 2007-12-27 21:05     ` Marcelo Tosatti
  0 siblings, 0 replies; 13+ messages in thread
From: Marcelo Tosatti @ 2007-12-27 21:05 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: linux-mm, Marcelo Tosatti, Daniel Spång, Rik van Riel,
	Andrew Morton

On Tue, Dec 25, 2007 at 01:56:24PM +0900, KOSAKI Motohiro wrote:
> Hi
> 
> add item to wait queue exist 2 way, add_wait_queue() and add_wait_queue_exclusive().
> but unfortunately, we only able to use poll_wait in poll method.
> 
> poll_wait_exclusive() works similar as add_wait_queue_exclusive()
> 
> 
> caution:
>   this patch is compile test only.
>   my purpose is discussion only.

Looks good. 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][patch 1/2] mem notifications v3 improvement for large system
  2007-12-27 21:04       ` Marcelo Tosatti
@ 2007-12-28  0:38         ` KOSAKI Motohiro
  0 siblings, 0 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-28  0:38 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

Hi Marcelo-san

thank you for your advice.

> So something like the following sounds better:
> 
> - have your poll_wait_exclusive() patch in place
> - pass a "status" parameter to mem_notify_userspace() and have it clear
> mem_notify_status in case status is zero, so to stop sending POLLIN to processes.
> - call mem_notify_userspace(0) from mm/vmscan.c when ZONE_NORMAL reclaim_mapped 
> is false (that seems a good indication that VM is out of trouble).
> - test for mem_notify_status in mem_notify_poll(), but do not clear it.
> - at mem_notify_userspace(), use wake_up_nr(number of mem_notify users/10) (10
> meaning a small percentage of registered users).

feel nice idea.
OK. I will try it about new year.


> > +        if (file_info->last_event == atomic_read(&mem_notify_event))
> > +                goto out;
> 
> What exactly are you trying to deal with by using last_event?

to be honest, read() and last_event is daniel-san's idea.
it is part of sysfs code in his patch.
my patch intent the same behavior as his.

1. read() method is deletable if you dislike.
   I will delete at next post :)
2. last_event is not deletable, it is important.
   under strong and prolonged memory pressure,
   a process that received a notification calls poll() again after freeing
   its own cache but before the system is out of trouble.
   at that point, the process should not be woken up because it has already freed its memory.
   (in other words, poll should return 0.)



- kosaki


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] mem notifications v3
  2007-12-27 20:13   ` [PATCH] mem notifications v3 Marcelo Tosatti
@ 2007-12-28  1:44     ` KOSAKI Motohiro
  0 siblings, 0 replies; 13+ messages in thread
From: KOSAKI Motohiro @ 2007-12-28  1:44 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kosaki.motohiro, linux-mm, Daniel Spång, Rik van Riel,
	Andrew Morton

Hi Marcelo-san

> > > +			pages_reserve += zone->lowmem_reserve[MAX_NR_ZONES-1];
> > 
> > Hmm...
> > may be, don't works well.
> > 
> > MAX_NR_ZONES determined at compile time and determined by distribution vendor.
> > but real highest zone is determined by box total memory.
> 
> That is OK because the calculation of lowmem reserves will take into account 
> all zones (mm/page_alloc.c::setup_per_zone_lowmem_reserve).

really?
sorry, I will check again.


> But it might be better to use the precalculated totalreserve_pages instead.

Hmm...
unfortunately, accumulate of all zone memory is incompatible to NUMA awareness.
please think again.


> > > +		if (pages_free < (pages_high+pages_reserve)*2) 
> > > +			val = POLLIN;
> > 
> > > why did you choose the formula (pages_high+pages_reserve)*2 ?
> 
> Just to make sure its not sending a spurious notification in the case the system
> has enough free memory already.

Can I think "*2" is your experimental rule? 
if so, I agree your experience.


> > > -static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > > +static bool shrink_active_list(unsigned long nr_pages, struct zone *zone,
> > >  				struct scan_control *sc, int priority)
> > 
> > unnecessary type change.
> > if directly call mem_notify_userspace() in shrink_active_list, works well too.
> > because notify rate control can implement by mem_notify_userspace() and mem_notify_poll().
> 
> Yes, and doing that should also guarantee that the notification is sent
> before swapout is performed (right now it sends the notification after
> shrink_inactive_list(), which is performing swapout).

Agreed.


- kosaki

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2007-12-28  1:44 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-12-24 20:32 [PATCH] mem notifications v3 Marcelo Tosatti
2007-12-25  3:47 ` KOSAKI Motohiro
2007-12-25  4:56   ` [RFC] add poll_wait_exclusive() API KOSAKI Motohiro
2007-12-27 21:05     ` Marcelo Tosatti
2007-12-25  8:31   ` [PATCH] mem notifications v3 KOSAKI Motohiro
2007-12-25 10:31     ` [RFC][patch 1/2] mem notifications v3 improvement for large system KOSAKI Motohiro
2007-12-27 21:04       ` Marcelo Tosatti
2007-12-28  0:38         ` KOSAKI Motohiro
2007-12-25 10:31     ` [RFC][patch 2/2] " KOSAKI Motohiro
2007-12-25 10:41       ` KOSAKI Motohiro
2007-12-27  4:49       ` [RFC][patch] mem_notify more faster reduce load average KOSAKI Motohiro
2007-12-27 20:13   ` [PATCH] mem notifications v3 Marcelo Tosatti
2007-12-28  1:44     ` KOSAKI Motohiro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox