linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] per-zone kswapd process
@ 2002-09-13  3:33 Dave Hansen
  2002-09-13  4:06 ` Andrew Morton
  0 siblings, 1 reply; 14+ messages in thread
From: Dave Hansen @ 2002-09-13  3:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Martin J. Bligh, William Lee Irwin III, linux-kernel, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1119 bytes --]

This patch implements a kswapd process for each memory zone.  The original code 
came from Bill Irwin, but the current VM is quite a bit different from the one 
that he wrote it for, so not much remains.  The current kswapd interface is much 
simpler than before because there is a single waitqueue and there is a 
single place where it is emptied.

kswapd_can_sleep() and kswapd_balance() are simpler now that the extra pgdat 
level of indirection is gone.

Tested on 8-way PIII with highmem off and then 4GB support.  With 4GB support, I 
did 20 parallel greps through a 10GB fileset while some other processes 
allocated and freed 1-2GB chunks of memory.  That gave kswapd a good workout, 
and I observed it running the zone Highmem and zone Normal kswapd threads.  So, 
it survives my torture test.  It also removes more code than it adds.

include/linux/mmzone.h |    2 +
include/linux/swap.h   |    1
mm/page_alloc.c        |   11 +++++-
mm/vmscan.c            |   88 +++++++++++++++++--------------------------------
4 files changed, 42 insertions(+), 60 deletions(-)

-- 
Dave Hansen
haveblue@us.ibm.com




[-- Attachment #2: per-zone-kswapd-2.5.34-mm2-3.patch --]
[-- Type: text/plain, Size: 6092 bytes --]

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	           ChangeSet	1.625   -> 1.628  
#	include/linux/mmzone.h	1.19    -> 1.20   
#	include/linux/swap.h	1.57    -> 1.58   
#	     mm/page_alloc.c	1.98    -> 1.101  
#	         mm/vmscan.c	1.102   -> 1.105  
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/09/12	haveblue@elm3b96.(none)	1.626
# add per-zone kswapd
# --------------------------------------------
# 02/09/12	haveblue@elm3b96.(none)	1.627
# fix some wli-indicated formatting bits
# --------------------------------------------
# 02/09/12	haveblue@elm3b96.(none)	1.628
# move waitqueue init to a more appropriate place 
# --------------------------------------------
#
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h	Thu Sep 12 20:24:39 2002
+++ b/include/linux/mmzone.h	Thu Sep 12 20:24:39 2002
@@ -108,6 +108,8 @@
 	unsigned long		wait_table_size;
 	unsigned long		wait_table_bits;
 
+	wait_queue_head_t       kswapd_wait;	
+	
 	/*
 	 * Discontig memory support fields.
 	 */
diff -Nru a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h	Thu Sep 12 20:24:39 2002
+++ b/include/linux/swap.h	Thu Sep 12 20:24:39 2002
@@ -162,7 +162,6 @@
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern wait_queue_head_t kswapd_wait;
 extern int try_to_free_pages(struct zone *, unsigned int, unsigned int);
 
 /* linux/mm/page_io.c */
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	Thu Sep 12 20:24:39 2002
+++ b/mm/page_alloc.c	Thu Sep 12 20:24:39 2002
@@ -345,8 +345,15 @@
 	classzone->need_balance = 1;
 	mb();
 	/* we're somewhat low on memory, failed to find what we needed */
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *z = zones[i];
+
+		/* We don't want to go swapping on zones that aren't actually
+		 * low.  This accounts for "incremental min" from last loop */
+		if (z->free_pages <= z->pages_low &&
+		    waitqueue_active(&z->kswapd_wait)) 
+			wake_up_interruptible(&z->kswapd_wait);
+	}
 
 	/* Go through the zonelist again, taking __GFP_HIGH into account */
 	min = 1UL << order;
@@ -874,6 +881,8 @@
 		for(i = 0; i < zone->wait_table_size; ++i)
 			init_waitqueue_head(zone->wait_table + i);
 
+		init_waitqueue_head(&zone->kswapd_wait);
+		
 		pgdat->nr_zones = j+1;
 
 		mask = (realsize / zone_balance_ratio[j]);
diff -Nru a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c	Thu Sep 12 20:24:39 2002
+++ b/mm/vmscan.c	Thu Sep 12 20:24:39 2002
@@ -713,8 +713,6 @@
 	return 0;
 }
 
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-
 static int check_classzone_need_balance(struct zone *classzone)
 {
 	struct zone *first_classzone;
@@ -728,71 +726,33 @@
 	return 1;
 }
 
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+static int kswapd_balance_zone(struct zone *zone)
 {
-	int need_more_balance = 0, i;
-	struct zone *zone;
-
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
+	int need_more_balance = 0;
+	
+	do {
 		cond_resched();
 		if (!zone->need_balance)
-			continue;
+			break;
 		if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
 			zone->need_balance = 0;
 			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ);
-			continue;
+			break;
 		}
 		if (check_classzone_need_balance(zone))
 			need_more_balance = 1;
 		else
 			zone->need_balance = 0;
-	}
-
-	return need_more_balance;
-}
-
-static void kswapd_balance(void)
-{
-	int need_more_balance;
-	pg_data_t * pgdat;
-
-	do {
-		need_more_balance = 0;
-		pgdat = pgdat_list;
-		do
-			need_more_balance |= kswapd_balance_pgdat(pgdat);
-		while ((pgdat = pgdat->pgdat_next));
 	} while (need_more_balance);
-}
 
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
-	struct zone *zone;
-	int i;
-
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
-		if (!zone->need_balance)
-			continue;
-		return 0;
-	}
-
-	return 1;
+	return 0;
 }
 
-static int kswapd_can_sleep(void)
+static int kswapd_can_sleep_zone(struct zone *zone)
 {
-	pg_data_t * pgdat;
-
-	pgdat = pgdat_list;
-	do {
-		if (kswapd_can_sleep_pgdat(pgdat))
-			continue;
-		return 0;
-	} while ((pgdat = pgdat->pgdat_next));
-
+	if (zone->need_balance)
+		return 0;	
 	return 1;
 }
 
@@ -809,13 +769,18 @@
  * If there are applications that are active memory-allocators
  * (most normal use), this basically shouldn't matter.
  */
-int kswapd(void *unused)
+int kswapd_zone(void *p)
 {
+	struct zone *zone = (struct zone *)p;
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
+	
+	printk( "kswapd%d starting for %s\n", 
+			zone - zone->zone_pgdat->node_zones, 
+			zone->name);
 
 	daemonize();
-	strcpy(tsk->comm, "kswapd");
+	sprintf(tsk->comm, "kswapd%d", zone - zone->zone_pgdat->node_zones);
 	sigfillset(&tsk->blocked);
 	
 	/*
@@ -839,30 +804,37 @@
 		if (current->flags & PF_FREEZE)
 			refrigerator(PF_IOTHREAD);
 		__set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&kswapd_wait, &wait);
+		add_wait_queue(&zone->kswapd_wait, &wait);
 
 		mb();
-		if (kswapd_can_sleep())
+		if (kswapd_can_sleep_zone(zone))
 			schedule();
 
 		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&kswapd_wait, &wait);
+		remove_wait_queue(&zone->kswapd_wait, &wait);
 
 		/*
 		 * If we actually get into a low-memory situation,
 		 * the processes needing more memory will wake us
 		 * up on a more timely basis.
 		 */
-		kswapd_balance();
+		kswapd_balance_zone(zone);
 		blk_run_queues();
 	}
 }
 
 static int __init kswapd_init(void)
 {
+	struct zone* zone;
+
 	printk("Starting kswapd\n");
 	swap_setup();
-	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	for_each_zone(zone)
+		if (zone->size)
+			kernel_thread(kswapd_zone, 
+				      zone, 
+				      CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	
 	return 0;
 }
 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13  3:33 [PATCH] per-zone kswapd process Dave Hansen
@ 2002-09-13  4:06 ` Andrew Morton
  2002-09-13  4:59   ` William Lee Irwin III
  0 siblings, 1 reply; 14+ messages in thread
From: Andrew Morton @ 2002-09-13  4:06 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Martin J. Bligh, William Lee Irwin III, linux-kernel, linux-mm

Dave Hansen wrote:
> 
> This patch implements a kswapd process for each memory zone.

I still don't see why it's per zone and not per node.  It seems strange
that a wee little laptop would be running two kswapds?

kswapd can get a ton of work done in the development VM and one per
node would, I expect, suffice?

Also, I'm wondering why the individual kernel threads don't have
their affinity masks set to make them run on the CPUs to which the
zone (or zones) are local?

Isn't it the case that with this code you could end up with a kswapd
on node 0 crunching on node 1's pages while a kswapd on node 1 crunches
on node 0's pages?

If I'm not totally out to lunch on this, I'd have thought that a
better approach would be

	int sys_kswapd(int nid)
	{
		return kernel_thread(kswapd, ...);
	}

Userspace could then set up the CPU affinity based on some topology
or config information and would then parent a kswapd instance.  That
kswapd instance would then be bound to the CPUs which were on the
node identified by `nid'.

Or something like that?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13  4:06 ` Andrew Morton
@ 2002-09-13  4:59   ` William Lee Irwin III
  2002-09-13  5:10     ` Martin J. Bligh
                       ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: William Lee Irwin III @ 2002-09-13  4:59 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Dave Hansen, Martin J. Bligh, linux-kernel, linux-mm

On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
> I still don't see why it's per zone and not per node.  It seems strange
> that a wee little laptop would be running two kswapds?
> kswapd can get a ton of work done in the development VM and one per
> node would, I expect, suffice?

Machines without observable NUMA effects can benefit from it if it's
per-zone. It also follows that if there's more than one task doing this,
page replacement is less likely to block entirely. Last, but not least,
when I devised it, "per-zone" was the theme.


On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
> Also, I'm wondering why the individual kernel threads don't have
> their affinity masks set to make them run on the CPUs to which the
> zone (or zones) are local?
> Isn't it the case that with this code you could end up with a kswapd
> on node 0 crunching on node 1's pages while a kswapd on node 1 crunches
> on node 0's pages?

Without some architecture-neutral method of topology detection, there's
no way to do this. A follow-up when it's there should fix it.


On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
> If I'm not totally out to lunch on this, I'd have thought that a
> better approach would be
> 	int sys_kswapd(int nid)
> 	{
> 		return kernel_thread(kswapd, ...);
> 	}
> Userspace could then set up the CPU affinity based on some topology
> or config information and would then parent a kswapd instance.  That
> kswapd instance would then be bound to the CPUs which were on the
> node identified by `nid'.
> Or something like that?

I'm very very scared of handing things like that to userspace, largely
because I don't trust userspace at all.

At this point, we need to enumerate nodes and provide a cpu to node
correspondence to userspace, and the kernel can obey, aside from the
question of "What do we do if we need to scan a node without a kswapd
started yet?". I think mbligh recently got the long-needed arch code in
for cpu to node... But I'm just not able to make the leap of faith that
memory detection is something that can ever comfortably be given to
userspace.


Cheers,
Bill
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13  4:59   ` William Lee Irwin III
@ 2002-09-13  5:10     ` Martin J. Bligh
  2002-09-13  5:46     ` Andrew Morton
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Martin J. Bligh @ 2002-09-13  5:10 UTC (permalink / raw)
  To: William Lee Irwin III, Andrew Morton
  Cc: Dave Hansen, linux-kernel, linux-mm, colpatch

>> Also, I'm wondering why the individual kernel threads don't have
>> their affinity masks set to make them run on the CPUs to which the
>> zone (or zones) are local?
>> Isn't it the case that with this code you could end up with a kswapd
>> on node 0 crunching on node 1's pages while a kswapd on node 1 crunches
>> on node 0's pages?
> 
> Without some architecture-neutral method of topology detection, there's
> no way to do this. A follow-up when it's there should fix it.

Every discontigmem arch should implement cpu_to_node, with a generic
fallback mechanism that returns 0 or something. Not that we do right
now, but that's easy to fix. There should also be a node_to_cpus call
that returns a bitmask of which cpus are in that node.

Matt ... want to sneak in the first bit of the topology patch, or
whatever lump this fell under? Seems like an appropriate juncture.
We have the code already somewhere, just need to fish it out.

>> If I'm not totally out to lunch on this, I'd have thought that a
>> better approach would be
>> 	int sys_kswapd(int nid)
>> 	{
>> 		return kernel_thread(kswapd, ...);
>> 	}
>> Userspace could then set up the CPU affinity based on some topology
>> or config information and would then parent a kswapd instance.  That
>> kswapd instance would then be bound to the CPUs which were on the
>> node identified by `nid'.
>> Or something like that?
> 
> I'm very very scared of handing things like that to userspace, largely
> because I don't trust userspace at all.
> 
> At this point, we need to enumerate nodes and provide a cpu to node
> correspondence to userspace, and the kernel can obey, aside from the
> question of "What do we do if we need to scan a node without a kswapd
> started yet?". I think mbligh recently got the long-needed arch code in
> for cpu to node... But I'm just not able to make the leap of faith that
> memory detection is something that can ever comfortably be given to
> userspace.

I don't think the userspace stuff is necessary - we can do this all 
in the kernel dead easily I think. Just need a couple of definitions,
which are trivially small functions.

M.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13  5:46     ` Andrew Morton
@ 2002-09-13  5:38       ` Martin J. Bligh
  2002-09-13  6:03         ` Andrew Morton
  0 siblings, 1 reply; 14+ messages in thread
From: Martin J. Bligh @ 2002-09-13  5:38 UTC (permalink / raw)
  To: Andrew Morton, William Lee Irwin III; +Cc: Dave Hansen, linux-kernel, linux-mm

> Sorry, I don't buy that.
> 
> a) It does not need to be architecture neutral.  
> 
> b) You surely need a way of communicating the discovered topology
>    to userspace anyway.
> 
> c) $EDITOR /etc/numa-layout.conf
> 
> d) $EDITOR /etc/kswapd.conf

I guess you could do that, but it seems overly complicated to me.
  
>> I think mbligh recently got the long-needed arch code in
>> for cpu to node... But I'm just not able to make the leap of faith that
>> memory detection is something that can ever comfortably be given to
>> userspace.
> 
> A simple syscall which allows you to launch a kswapd instance against
> a group of zones on any group of CPUs provides complete generality 
> and flexibility to userspace.  And it is architecture neutral.
> 
> If it really is incredibly hard to divine the topology from userspace
> then you need to fix that up.  Provide the topology to userspace.
> Which has the added benefit of providing, umm, the topology to userspace ;)

Can we make a simple default of 1 per node, which is what 99% 
of people want, and then make it more complicated later if people 
complain? It's really pretty easy:

for (node = 0; node < numnodes; ++node) {
	kswapd = kick_off_kswapd_for_node(node);
	kswapd->cpus_allowed = node_to_cpus(node);
}

Or whatever the current cpus_allowed method is. All we seem to need
is node_to_cpus ... I can give that to you tomorrow with no problem,
it's trivial.

M.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13  4:59   ` William Lee Irwin III
  2002-09-13  5:10     ` Martin J. Bligh
@ 2002-09-13  5:46     ` Andrew Morton
  2002-09-13  5:38       ` Martin J. Bligh
  2002-09-13 13:05     ` Alan Cox
  2002-09-16  5:44     ` [PATCH] per-zone kswapd process Daniel Phillips
  3 siblings, 1 reply; 14+ messages in thread
From: Andrew Morton @ 2002-09-13  5:46 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Dave Hansen, Martin J. Bligh, linux-kernel, linux-mm

William Lee Irwin III wrote:
> 
> On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
> > I still don't see why it's per zone and not per node.  It seems strange
> > that a wee little laptop would be running two kswapds?
> > kswapd can get a ton of work done in the development VM and one per
> > node would, I expect, suffice?
> 
> Machines without observable NUMA effects can benefit from it if it's
> per-zone. It also follows that if there's more than one task doing this,
> page replacement is less likely to block entirely. Last, but not least,
> when I devised it, "per-zone" was the theme.

Maybe, marginally.  You could pass a gfp mask to sys_kswapd to select
the zones if that's really a benefit.  But if this _is_ a benefit then
it's a VM bug.  

Because if a single kswapd cannot service three zones then it cannot
service one zone. (Maybe.  We need to do per-zone throttling soon to
fix your OOM problems properly, but then, that shouldn't throttle
kswapd).

> On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
> > Also, I'm wondering why the individual kernel threads don't have
> > their affinity masks set to make them run on the CPUs to which the
> > zone (or zones) are local?
> > Isn't it the case that with this code you could end up with a kswapd
> > on node 0 crunching on node 1's pages while a kswapd on node 1 crunches
> > on node 0's pages?
> 
> Without some architecture-neutral method of topology detection, there's
> no way to do this. A follow-up when it's there should fix it.

Sorry, I don't buy that.

a) It does not need to be architecture neutral.  

b) You surely need a way of communicating the discovered topology
   to userspace anyway.

c) $EDITOR /etc/numa-layout.conf

d) $EDITOR /etc/kswapd.conf
 
> On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
> > If I'm not totally out to lunch on this, I'd have thought that a
> > better approach would be
> >       int sys_kswapd(int nid)
> >       {
> >               return kernel_thread(kswapd, ...);
> >       }
> > Userspace could then set up the CPU affinity based on some topology
> > or config information and would then parent a kswapd instance.  That
> > kswapd instance would then be bound to the CPUs which were on the
> > node identified by `nid'.
> > Or something like that?
> 
> I'm very very scared of handing things like that to userspace, largely
> because I don't trust userspace at all.

Me either.  I've seen workloads in which userspace consumes
over 50% of the CPU resources.  It should be banned!

> At this point, we need to enumerate nodes and provide a cpu to node
> correspondence to userspace, and the kernel can obey, aside from the
> question of "What do we do if we need to scan a node without a kswapd
> started yet?".

kswapd is completely optional.  Put a `do_exit(0)' into the current
one and watch.   You'll get crappy dbench numbers, but it stays up.

> I think mbligh recently got the long-needed arch code in
> for cpu to node... But I'm just not able to make the leap of faith that
> memory detection is something that can ever comfortably be given to
> userspace.

A simple syscall which allows you to launch a kswapd instance against
a group of zones on any group of CPUs provides complete generality 
and flexibility to userspace.  And it is architecture neutral.

If it really is incredibly hard to divine the topology from userspace
then you need to fix that up.  Provide the topology to userspace.
Which has the added benefit of providing, umm, the topology to userspace ;)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13  5:38       ` Martin J. Bligh
@ 2002-09-13  6:03         ` Andrew Morton
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Morton @ 2002-09-13  6:03 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: William Lee Irwin III, Dave Hansen, linux-kernel, linux-mm

"Martin J. Bligh" wrote:
> 
> ..
> Can we make a simple default of 1 per node, which is what 99%
> of people want, and then make it more complicated later if people
> complain? It's really pretty easy:
> 
> for (node = 0; node < numnodes; ++node) {
>         kswapd = kick_off_kswapd_for_node(node);
>         kswapd->cpus_allowed = node_to_cpus(node);
> }

Seems sane.
 
> Or whatever the current cpus_allowed method is. All we seem to need
> is node_to_cpus ... I can give that to you tomorrow with no problem,
> it's trivial.

Tomorrow sounds too early - it'd be nice to get some before-n-after
performance testing to go along with that patch ;)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13  4:59   ` William Lee Irwin III
  2002-09-13  5:10     ` Martin J. Bligh
  2002-09-13  5:46     ` Andrew Morton
@ 2002-09-13 13:05     ` Alan Cox
  2002-09-13 21:30       ` William Lee Irwin III
  2002-09-16  5:44     ` [PATCH] per-zone kswapd process Daniel Phillips
  3 siblings, 1 reply; 14+ messages in thread
From: Alan Cox @ 2002-09-13 13:05 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Andrew Morton, Dave Hansen, Martin J. Bligh, linux-kernel, linux-mm

On Fri, 2002-09-13 at 05:59, William Lee Irwin III wrote:
> Machines without observable NUMA effects can benefit from it if it's
> per-zone. It also follows that if there's more than one task doing this,
> page replacement is less likely to block entirely. Last, but not least,
> when I devised it, "per-zone" was the theme.

It will also increase the amount of disk head thrashing surely ?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13 13:05     ` Alan Cox
@ 2002-09-13 21:30       ` William Lee Irwin III
  2002-09-18 16:07         ` [PATCH] recognize MAP_LOCKED in mmap() call Hubertus Franke
  0 siblings, 1 reply; 14+ messages in thread
From: William Lee Irwin III @ 2002-09-13 21:30 UTC (permalink / raw)
  To: Alan Cox
  Cc: Andrew Morton, Dave Hansen, Martin J. Bligh, linux-kernel, linux-mm

On Fri, 2002-09-13 at 05:59, William Lee Irwin III wrote:
>> Machines without observable NUMA effects can benefit from it if it's
>> per-zone. It also follows that if there's more than one task doing this,
>> page replacement is less likely to block entirely. Last, but not least,
>> when I devised it, "per-zone" was the theme.

On Fri, Sep 13, 2002 at 02:05:52PM +0100, Alan Cox wrote:
> It will also increase the amount of disk head thrashing surely ?

I doubt it. Writeout isn't really supposed to happen there in 2.4
either, except under duress. OTOH I've not been doing much with this
directly since rmap10c.


Cheers,
Bill
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-13  4:59   ` William Lee Irwin III
                       ` (2 preceding siblings ...)
  2002-09-13 13:05     ` Alan Cox
@ 2002-09-16  5:44     ` Daniel Phillips
  2002-09-16  7:46       ` William Lee Irwin III
  3 siblings, 1 reply; 14+ messages in thread
From: Daniel Phillips @ 2002-09-16  5:44 UTC (permalink / raw)
  To: William Lee Irwin III, Andrew Morton
  Cc: Dave Hansen, Martin J. Bligh, linux-kernel, linux-mm

On Friday 13 September 2002 06:59, William Lee Irwin III wrote:
> On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
> > I still don't see why it's per zone and not per node.  It seems strange
> > that a wee little laptop would be running two kswapds?
> > kswapd can get a ton of work done in the development VM and one per
> > node would, I expect, suffice?
> 
> Machines without observable NUMA effects can benefit from it if it's
> per-zone.

How?

-- 
Daniel
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-16  5:44     ` [PATCH] per-zone kswapd process Daniel Phillips
@ 2002-09-16  7:46       ` William Lee Irwin III
  2002-09-16 15:12         ` Rik van Riel
  0 siblings, 1 reply; 14+ messages in thread
From: William Lee Irwin III @ 2002-09-16  7:46 UTC (permalink / raw)
  To: Daniel Phillips
  Cc: Andrew Morton, Dave Hansen, Martin J. Bligh, linux-kernel, linux-mm

On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
>>> I still don't see why it's per zone and not per node.  It seems strange
>>> that a wee little laptop would be running two kswapds?
>>> kswapd can get a ton of work done in the development VM and one per
>>> node would, I expect, suffice?

On Friday 13 September 2002 06:59, William Lee Irwin III wrote:
>> Machines without observable NUMA effects can benefit from it if it's
>> per-zone.

On Mon, Sep 16, 2002 at 07:44:30AM +0200, Daniel Phillips wrote:
> How?

The notion was that some level of parallelism would be bestowed on the
single-node case by using separate worker threads on a per-zone basis,
as they won't have more than one node to spawn worker threads for at all.

This notion apparently got shot down somewhere, and I don't care to rise
to its defense. I've lost enough debates this release to know better than
to try.


Bill
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] per-zone kswapd process
  2002-09-16  7:46       ` William Lee Irwin III
@ 2002-09-16 15:12         ` Rik van Riel
  0 siblings, 0 replies; 14+ messages in thread
From: Rik van Riel @ 2002-09-16 15:12 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Daniel Phillips, Andrew Morton, Dave Hansen, Martin J. Bligh,
	linux-kernel, linux-mm

On Mon, 16 Sep 2002, William Lee Irwin III wrote:

> This notion apparently got shot down somewhere, and I don't care to rise
> to its defense. I've lost enough debates this release to know better
> than to try.

Don't worry about this, there are bigger fish around, lower
hanging sea fruit, so to say. ;)

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/

Spamtraps of the month:  september@surriel.com trac@trac.org

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] recognize MAP_LOCKED in mmap() call
  2002-09-13 21:30       ` William Lee Irwin III
@ 2002-09-18 16:07         ` Hubertus Franke
  2002-09-18 16:29           ` Andrew Morton
  0 siblings, 1 reply; 14+ messages in thread
From: Hubertus Franke @ 2002-09-18 16:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1017 bytes --]


Andrew, at the current time an mmap() ignores a MAP_LOCKED passed to it.
The only way we can get VM_LOCKED associated with the newly created VMA
is to have previously called mlockall() on the process, which sets VM_LOCKED in
mm->def_flags, or to subsequently call mlock() on the
newly created VMA.

The attached patch checks for MAP_LOCKED being passed and if so checks
the capabilities of the process. Limit checks were already in place.
-- 
-- Hubertus Franke  (frankeh@watson.ibm.com)

--------------------------------< PATCH >------------------------------
--- linux-2.5.35/mm/mmap.c	Wed Sep 18 11:12:13 2002
+++ linux-2.5.35-fix/mm/mmap.c	Wed Sep 18 11:44:32 2002
@@ -461,6 +461,11 @@
 	 */
 	vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
+	if (flags & MAP_LOCKED) {
+		if (!capable(CAP_IPC_LOCK))
+			return -EPERM;
+		vm_flags |= VM_LOCKED;
+	}
 	/* mlock MCL_FUTURE? */
 	if (vm_flags & VM_LOCKED) {
 		unsigned long locked = mm->locked_vm << PAGE_SHIFT;




[-- Attachment #2: patch.2.5.35.mmap_locked --]
[-- Type: text/x-diff, Size: 452 bytes --]

--- linux-2.5.35/mm/mmap.c	Wed Sep 18 11:12:13 2002
+++ linux-2.5.35-fix/mm/mmap.c	Wed Sep 18 11:44:32 2002
@@ -461,6 +461,11 @@
 	 */
 	vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
+	if (flags & MAP_LOCKED) {
+		if (!capable(CAP_IPC_LOCK))
+			return -EPERM;
+		vm_flags |= VM_LOCKED;
+	}
 	/* mlock MCL_FUTURE? */
 	if (vm_flags & VM_LOCKED) {
 		unsigned long locked = mm->locked_vm << PAGE_SHIFT;

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] recognize MAP_LOCKED in mmap() call
  2002-09-18 16:07         ` [PATCH] recognize MAP_LOCKED in mmap() call Hubertus Franke
@ 2002-09-18 16:29           ` Andrew Morton
  0 siblings, 0 replies; 14+ messages in thread
From: Andrew Morton @ 2002-09-18 16:29 UTC (permalink / raw)
  To: frankeh; +Cc: linux-kernel, linux-mm

Hubertus Franke wrote:
> 
> Andrew, at the current time an mmap() ignores a MAP_LOCKED passed to it.
> The only way we can get VM_LOCKED associated with the newly created VMA
> is to have previously called mlockall() on the process, which sets VM_LOCKED in
> mm->def_flags, or to subsequently call mlock() on the
> newly created VMA.
> 
> The attached patch checks for MAP_LOCKED being passed and if so checks
> the capabilities of the process. Limit checks were already in place.

Looks sane, thanks.

It appears that MAP_LOCKED is a Linux-special, so presumably it
_used_ to work.  I wonder when it broke?

Your patch applies to 2.4 as well; it would be useful to give that
a sanity test and send a copy to Marcelo.

(SuS really only anticipates that mmap needs to look at prior mlocks
in force against the address range.  It also says

     Process memory locking does apply to shared memory regions,

and we don't do that either.  I think we should; can't see why SuS
requires this.)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2002-09-18 16:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-09-13  3:33 [PATCH] per-zone kswapd process Dave Hansen
2002-09-13  4:06 ` Andrew Morton
2002-09-13  4:59   ` William Lee Irwin III
2002-09-13  5:10     ` Martin J. Bligh
2002-09-13  5:46     ` Andrew Morton
2002-09-13  5:38       ` Martin J. Bligh
2002-09-13  6:03         ` Andrew Morton
2002-09-13 13:05     ` Alan Cox
2002-09-13 21:30       ` William Lee Irwin III
2002-09-18 16:07         ` [PATCH] recognize MAP_LOCKED in mmap() call Hubertus Franke
2002-09-18 16:29           ` Andrew Morton
2002-09-16  5:44     ` [PATCH] per-zone kswapd process Daniel Phillips
2002-09-16  7:46       ` William Lee Irwin III
2002-09-16 15:12         ` Rik van Riel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox