linux-mm.kvack.org archive mirror
* [PATCH]: Option to run cache reap in thread mode
@ 2004-06-16 14:24 Dimitri Sivanich
  2004-06-16 15:29 ` Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Dimitri Sivanich @ 2004-06-16 14:24 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, linux-kernel

Hi,

In the process of testing per-CPU interrupt response times and CPU availability,
I've found that running cache_reap() from a timer, as is done currently, results
in some fairly long CPU holdoffs.

I would like to know what others think about running cache_reap() as a
low-priority realtime kthread, at least on certain CPUs that would be configured
that way (probably configured at boot time initially).  I've been doing some
testing running it this way on CPUs whose activity is mostly restricted to
realtime work (requiring rapid response times).

Here's my first cut at an initial patch for this (there will be other changes
later to set the configuration and to optimize locking in cache_reap()).

Dimitri Sivanich <sivanich@sgi.com>


--- linux/mm/slab.c.orig	2004-06-16 09:09:09.000000000 -0500
+++ linux/mm/slab.c	2004-06-16 09:10:03.000000000 -0500
@@ -91,6 +91,9 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
+#include	<linux/kthread.h>
+#include	<linux/stop_machine.h>
+#include	<linux/syscalls.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -530,8 +533,15 @@ enum {
 	FULL
 } g_cpucache_up;
 
+cpumask_t threaded_reap;	/* CPUs configured to run reap in thread mode */
+
+static DEFINE_PER_CPU(task_t *, reap_thread);
 static DEFINE_PER_CPU(struct timer_list, reap_timers);
 
+static void start_reap_thread(unsigned long cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+static void stop_reap_thread(unsigned long cpu);
+#endif
 static void reap_timer_fnc(unsigned long data);
 static void free_block(kmem_cache_t* cachep, void** objpp, int len);
 static void enable_cpucache (kmem_cache_t *cachep);
@@ -661,11 +671,13 @@ static int __devinit cpuup_callback(stru
 		up(&cache_chain_sem);
 		break;
 	case CPU_ONLINE:
+		start_reap_thread(cpu);
 		start_cpu_timer(cpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
 		stop_cpu_timer(cpu);
+		stop_reap_thread(cpu);
 		/* fall thru */
 	case CPU_UP_CANCELED:
 		down(&cache_chain_sem);
@@ -802,6 +814,9 @@ void __init kmem_cache_init(void)
 		up(&cache_chain_sem);
 	}
 
+	/* Initialize to run cache_reap as a timer on all cpus. */
+	cpus_clear(threaded_reap);
+
 	/* Done! */
 	g_cpucache_up = FULL;
 
@@ -2762,6 +2777,63 @@ next:
 	up(&cache_chain_sem);
 }
 
+/**
+ * If cache reap is run in thread mode, we run it from here.
+ */
+static int reap_thread(void * data)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		cache_reap();
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+/**
+ * If the cpu is configured to run in thread mode, start the reap
+ * thread here.
+ */
+static void start_reap_thread(unsigned long cpu)
+{
+	task_t * p;
+	task_t ** rthread = &per_cpu(reap_thread, cpu);
+	struct sched_param param;
+
+	if (!cpu_isset(cpu, threaded_reap)) {
+		*rthread = NULL;
+		return;
+	}
+
+	param.sched_priority = sys_sched_get_priority_min(SCHED_FIFO); 
+
+	p = kthread_create(reap_thread, NULL, "cache_reap/%d",cpu);
+	if (IS_ERR(p))
+		return;
+	*rthread = p;
+	kthread_bind(p, cpu);
+	sys_sched_setscheduler(p->pid, SCHED_FIFO, &param);
+	wake_up_process(p);
+}
+
+/**
+ * If the cpu is running a reap thread, stop it here.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+static void stop_reap_thread(unsigned long cpu)
+{
+	task_t ** rthread = &per_cpu(reap_thread, cpu);
+
+	if (*rthread != NULL) {
+		kthread_stop(*rthread);
+		*rthread = NULL;
+	}
+
+}
+#endif
+
 /*
  * This is a timer handler.  There is one per CPU.  It is called periodially
  * to shrink this CPU's caches.  Otherwise there could be memory tied up
@@ -2770,10 +2842,16 @@ next:
 static void reap_timer_fnc(unsigned long cpu)
 {
 	struct timer_list *rt = &__get_cpu_var(reap_timers);
+	task_t *rthread = __get_cpu_var(reap_thread);
 
 	/* CPU hotplug can drag us off cpu: don't run on wrong CPU */
 	if (!cpu_is_offline(cpu)) {
-		cache_reap();
+		/* If we're not running in thread mode, simply run cache reap */
+		if (likely(rthread == NULL)) {
+			cache_reap();
+		} else {
+			wake_up_process(rthread);
+		}
 		mod_timer(rt, jiffies + REAPTIMEOUT_CPUC + cpu);
 	}
 }
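
(Note that nothing in the patch sets bits in threaded_reap yet; one
hypothetical way to experiment in the meantime is to hard-code a CPU into
the mask, e.g. cpu_set(1, threaded_reap), right after the cpus_clear() in
kmem_cache_init().)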

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 14:24 [PATCH]: Option to run cache reap in thread mode Dimitri Sivanich
@ 2004-06-16 15:29 ` Christoph Hellwig
  2004-06-16 16:03   ` Dimitri Sivanich
  2004-06-16 16:46 ` Lori Gilbertson
  2004-06-16 21:30 ` Andrew Morton
  2 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2004-06-16 15:29 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: Andrew Morton, linux-mm, linux-kernel

On Wed, Jun 16, 2004 at 09:24:13AM -0500, Dimitri Sivanich wrote:
> Hi,
> 
> In the process of testing per-CPU interrupt response times and CPU availability,
> I've found that running cache_reap() from a timer, as is done currently, results
> in some fairly long CPU holdoffs.
> 
> I would like to know what others think about running cache_reap() as a
> low-priority realtime kthread, at least on certain CPUs that would be configured
> that way (probably configured at boot time initially).  I've been doing some
> testing running it this way on CPUs whose activity is mostly restricted to
> realtime work (requiring rapid response times).
> 
> Here's my first cut at an initial patch for this (there will be other changes
> later to set the configuration and to optimize locking in cache_reap()).

YAKT, sigh..  I don't quite understand what you mean by a "holdoff", so
maybe you could explain what problem you see?  You don't like cache_reap
being called from timer context?

As for realtime stuff you're probably better off using something like rtlinux;
getting into the hard-RT or even really strong soft-RT business means messing up
the kernel horribly.  Given your @sgi.com address you probably know what
a freaking mess and maintenance nightmare IRIX has become because of that.


* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 15:29 ` Christoph Hellwig
@ 2004-06-16 16:03   ` Dimitri Sivanich
  2004-06-16 16:07     ` Christoph Hellwig
  0 siblings, 1 reply; 20+ messages in thread
From: Dimitri Sivanich @ 2004-06-16 16:03 UTC (permalink / raw)
  To: Christoph Hellwig, Andrew Morton, linux-mm, linux-kernel

On Wed, Jun 16, 2004 at 04:29:34PM +0100, Christoph Hellwig wrote:
> On Wed, Jun 16, 2004 at 09:24:13AM -0500, Dimitri Sivanich wrote:
> > Hi,
> > 
> > In the process of testing per-CPU interrupt response times and CPU availability,
> > I've found that running cache_reap() from a timer, as is done currently, results
> > in some fairly long CPU holdoffs.
> > 
> > I would like to know what others think about running cache_reap() as a
> > low-priority realtime kthread, at least on certain CPUs that would be configured
> > that way (probably configured at boot time initially).  I've been doing some
> > testing running it this way on CPUs whose activity is mostly restricted to
> > realtime work (requiring rapid response times).
> > 
> > Here's my first cut at an initial patch for this (there will be other changes
> > later to set the configuration and to optimize locking in cache_reap()).
> 
> YAKT, sigh..  I don't quite understand what you mean by a "holdoff", so
> maybe you could explain what problem you see?  You don't like cache_reap
> being called from timer context?

The issue I'm attempting to solve is achieving more deterministic interrupt
response times on CPUs that have been designated for such use.  By running
cache_reap as a kthread, the CPU is only unavailable during the time
that irqs are disabled.  By doing this on a CPU that's been restricted from
running most other processes, I have been able to achieve much more
deterministic interrupt response times.

So yes, I don't want cache_reap to be called from timer context when I've
configured a CPU that way.

> 
> As for realtime stuff you're probably better off using something like rtlinux;
> getting into the hard-RT or even really strong soft-RT business means messing up
> the kernel horribly.  Given your @sgi.com address you probably know what
> a freaking mess and maintenance nightmare IRIX has become because of that.

Keep in mind that it's not like we're trying to achieve fast response times on
all CPUs potentially running any number of processes.

Dimitri Sivanich <sivanich@sgi.com>

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 16:03   ` Dimitri Sivanich
@ 2004-06-16 16:07     ` Christoph Hellwig
  2004-06-16 16:25       ` Jesse Barnes
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2004-06-16 16:07 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: Andrew Morton, linux-mm, linux-kernel

On Wed, Jun 16, 2004 at 11:03:55AM -0500, Dimitri Sivanich wrote:
> On Wed, Jun 16, 2004 at 04:29:34PM +0100, Christoph Hellwig wrote:
> > YAKT, sigh..  I don't quite understand what you mean by a "holdoff", so
> > maybe you could explain what problem you see?  You don't like cache_reap
> > being called from timer context?
> 
> The issue I'm attempting to solve is achieving more deterministic interrupt
> response times on CPUs that have been designated for such use.  By running
> cache_reap as a kthread, the CPU is only unavailable during the time
> that irqs are disabled.  By doing this on a CPU that's been restricted from
> running most other processes, I have been able to achieve much more
> deterministic interrupt response times.
> 
> So yes, I don't want cache_reap to be called from timer context when I've
> configured a CPU that way.

Well, if you want deterministic interrupt latencies you should go for a realtime OS.
I know Linux is the big thing in the industry, but you're really better off looking
for a small Hard RT OS.  From the OpenSource world, eCos or RTEMS come to mind.  Or even
rtlinux/rtai if you want to run a full Linux kernel as the idle task.


* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 16:07     ` Christoph Hellwig
@ 2004-06-16 16:25       ` Jesse Barnes
  2004-06-16 16:51         ` Dimitri Sivanich
  0 siblings, 1 reply; 20+ messages in thread
From: Jesse Barnes @ 2004-06-16 16:25 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Dimitri Sivanich, Andrew Morton, linux-mm, linux-kernel

On Wednesday, June 16, 2004 12:07 pm, Christoph Hellwig wrote:
> Well, if you want deterministic interrupt latencies you should go for a
> realtime OS.

Although I don't want to see another kernel thread added as much as the next 
guy, I think that minimizing the amount of time that irqs are turned off is 
probably a good thing in general.  For example, the patch to allow interrupts 
in spin_lock_irq if the lock is already taken is generally a really good 
thing, because even though reducing lock contention should be a goal, locks 
by their very nature are taken sometimes, and allowing other CPUs to get 
useful work done while they're waiting for it is obviously desirable.
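
To illustrate the idea (a rough sketch, not the actual patch, and
spin_lock_irq_friendly is a made-up name): spin with interrupts enabled
while the lock is contended, and only disable them for the actual
acquisition attempt:

static inline void spin_lock_irq_friendly(spinlock_t *lock)
{
	for (;;) {
		local_irq_disable();
		if (spin_trylock(lock))
			return;			/* lock held, irqs now off */
		local_irq_enable();
		while (spin_is_locked(lock))
			cpu_relax();		/* irqs stay enabled here */
	}
}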

> I know Linux is the big thing in the industry, but you're 
> really better off looking for a small Hard RT OS. 

Sure, for some applications, an RTOS is necessary.  But it seems like keeping 
latencies down in Linux is a good thing to do nonetheless.

Can you think of other ways to reduce the length of time that interrupts are 
disabled during cache reaping?  It seems like the cache_reap loop might be a 
candidate for reorganization (though that would probably imply other 
changes).

Thanks,
Jesse

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 14:24 [PATCH]: Option to run cache reap in thread mode Dimitri Sivanich
  2004-06-16 15:29 ` Christoph Hellwig
@ 2004-06-16 16:46 ` Lori Gilbertson
  2004-06-16 16:53   ` Christoph Hellwig
  2004-06-16 21:30 ` Andrew Morton
  2 siblings, 1 reply; 20+ messages in thread
From: Lori Gilbertson @ 2004-06-16 16:46 UTC (permalink / raw)
  To: Dimitri Sivanich, Christoph Hellwig; +Cc: linux-kernel, linux-mm, Andrew Morton

hch wrote:

> Given your @sgi.com address you probably know what
> a freaking mess and maintenance nightmare IRIX has become because
> of that.

Hi Chris,

I'm very curious about this comment and wondering what you base it
on.  I'm the engineering manager for IRIX real-time - we have
no open bugs against it and have many customers depending on it.
At least for the last 5 years it has had very low maintenance cost,
mostly adding features, fixing a couple of bugs, and producing new
releases.

Perhaps you would be so kind as to let me know what led you to
your statement?

Thanks,

Lori Gilbertson

----
  Lori A. Gilbertson             
  Mgr: CoreOS Development Group 
       Silicon Graphics, Inc. (SGI)
Email: loriann@sgi.com
Phone: 651-683-3433
 Page: wwwmn.americas.sgi.com/~irixdev/os_core_sched/



* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 16:25       ` Jesse Barnes
@ 2004-06-16 16:51         ` Dimitri Sivanich
  0 siblings, 0 replies; 20+ messages in thread
From: Dimitri Sivanich @ 2004-06-16 16:51 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: Christoph Hellwig, Andrew Morton, linux-mm, linux-kernel

On Wed, Jun 16, 2004 at 12:25:11PM -0400, Jesse Barnes wrote:
> On Wednesday, June 16, 2004 12:07 pm, Christoph Hellwig wrote:
> > Well, if you want deterministic interrupt latencies you should go for a
> > realtime OS.
> 
> Although I don't want to see another kernel thread added as much as the next 
> guy, I think that minimizing the amount of time that irqs are turned off is 
> probably a good thing in general.  For example, the patch to allow interrupts 
> in spin_lock_irq if the lock is already taken is generally a really good 
> thing, because even though reducing lock contention should be a goal, locks 
> by their very nature are taken sometimes, and allowing other CPUs to get 
> useful work done while they're waiting for it is obviously desirable.
> 
> > I know Linux is the big thing in the industry, but you're 
> > really better off looking for a small Hard RT OS. 
> 
> Sure, for some applications, an RTOS is necessary.  But it seems like keeping 
> latencies down in Linux is a good thing to do nonetheless.
> 
> Can you think of other ways to reduce the length of time that interrupts are 
> disabled during cache reaping?  It seems like the cache_reap loop might be a 
> candidate for reorganization (though that would probably imply other 
> changes).

I have another patch forthcoming that does some reorganizing of the locking.
With the two patches I see substantial improvement.

> 
> Thanks,
> Jesse

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 16:46 ` Lori Gilbertson
@ 2004-06-16 16:53   ` Christoph Hellwig
  0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2004-06-16 16:53 UTC (permalink / raw)
  To: Lori Gilbertson
  Cc: Dimitri Sivanich, Christoph Hellwig, linux-kernel, linux-mm,
	Andrew Morton, lm

On Wed, Jun 16, 2004 at 11:46:24AM -0500, Lori Gilbertson wrote:
> hch wrote:
> 
> > Given your @sgi.com address you probably know what
> > a freaking mess and maintenance nightmare IRIX has become because
> > of that.
> 
> Hi Chris,
> 
> I'm very curious about this comment and wondering what you base it
> on.  I'm the engineering manager for IRIX real-time - we have
> no open bugs against it and have many customers depending on it.
> At least for the last 5 years it has had very low maintenance cost,
> mostly adding features, fixing a couple of bugs, and producing new
> releases.
> 
> Perhaps you would be so kind as to let me know what led you to
> your statement?

Look at the overhead of the normal IRIX sleeping locks vs. Linux spinlocks
(and the priority inversion and sleeping locks arguments are the next ones
I'll get from you, I bet :)), talk to Jeremy about how the HBA performance went
down when he had to switch the drivers to the sleeping locks, look at the
complexity of the IRIX scheduler with its gazillions of special cases
(and yes, I think the current Linux scheduler is already too complex), or
the big mess with interrupt threads.

I've added Larry to the Cc list because he knows the IRIX internals much
better than I do (or at least did once) and has been warning for at least
five years that this move adds complexity to no end for all the special
cases.  He also had some nice IRIX vs. Linux benchmarks when Linux on Indys
was new.

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 14:24 [PATCH]: Option to run cache reap in thread mode Dimitri Sivanich
  2004-06-16 15:29 ` Christoph Hellwig
  2004-06-16 16:46 ` Lori Gilbertson
@ 2004-06-16 21:30 ` Andrew Morton
  2 siblings, 0 replies; 20+ messages in thread
From: Andrew Morton @ 2004-06-16 21:30 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: linux-mm, linux-kernel

Dimitri Sivanich <sivanich@sgi.com> wrote:
>
> In the process of testing per-CPU interrupt response times and CPU availability,
> I've found that running cache_reap() from a timer, as is done currently, results
> in some fairly long CPU holdoffs.

Before patching anything I want to understand what's going on in there. 
Please share your analysis.


How long?

How many objects?

Which slab?

Why?

Thanks.

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-18 21:04             ` Manfred Spraul
  2004-06-18 21:44               ` Dimitri Sivanich
@ 2004-06-18 22:03               ` Andrew Morton
  1 sibling, 0 replies; 20+ messages in thread
From: Andrew Morton @ 2004-06-18 22:03 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: sivanich, linux-kernel, lse-tech, linux-mm

Manfred Spraul <manfred@colorfullife.com> wrote:
>
> I'll write something:
> - allow disabling the DMA kmalloc caches for archs that do not need them.
> - increase the timer frequency and scan only a few caches in each timer tick.
> - perhaps a quicker test for cache_reap to notice that nothing needs to
> be done.  Right now four tests are done (!(flags & _NO_REAP),
> ac->touched==0, ac->avail != 0, global timer not yet expired).  It's
> possible to skip some tests, e.g. move the _NO_REAP caches onto a separate
> list, and replace the time_after(.next_reap, jiffies) check with a separate timer.

Come to think of it, replacing the timer with schedule_delayed_work() and
doing it all via keventd should work OK.  Doing everything in a single pass
is the most CPU-efficient way of doing it, and as long as we're preemptible
and interruptible the latency issues will be solved.
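
Something like this rough sketch, say (untested; reap_work and
cache_reap_work are made-up names, and the re-arm delay is just the old
timer interval):

static DEFINE_PER_CPU(struct work_struct, reap_work);

static void cache_reap_work(void *unused)
{
	cache_reap();
	/* requeue ourselves: keventd runs this in process context */
	schedule_delayed_work(&__get_cpu_var(reap_work),
				REAPTIMEOUT_CPUC + smp_processor_id());
}

with each CPU's work_struct set up via INIT_WORK() and kicked off by an
initial schedule_delayed_work() from the CPU_ONLINE callback.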

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-18 21:04             ` Manfred Spraul
@ 2004-06-18 21:44               ` Dimitri Sivanich
  2004-06-18 22:03               ` Andrew Morton
  1 sibling, 0 replies; 20+ messages in thread
From: Dimitri Sivanich @ 2004-06-18 21:44 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: Andrew Morton, Dimitri Sivanich, linux-kernel, lse-tech, linux-mm

> 
> I'll write something:
> - allow disabling the DMA kmalloc caches for archs that do not need them.
> - increase the timer frequency and scan only a few caches in each timer tick.
> - perhaps a quicker test for cache_reap to notice that nothing needs to
> be done.  Right now four tests are done (!(flags & _NO_REAP),
> ac->touched==0, ac->avail != 0, global timer not yet expired).  It's
> possible to skip some tests, e.g. move the _NO_REAP caches onto a separate
> list, and replace the time_after(.next_reap, jiffies) check with a separate timer.
> 
> --
>     Manfred
>
Thanks for addressing this.  Sounds like some good improvements overall.

One question though:  What about possible spinlock contention issues in the
cache_reap timer processing, or is that unlikely here (even on a heavily loaded
system with a large number of CPUs)?

Dimitri Sivanich <sivanich@sgi.com>

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-18 20:40           ` Andrew Morton
@ 2004-06-18 21:04             ` Manfred Spraul
  2004-06-18 21:44               ` Dimitri Sivanich
  2004-06-18 22:03               ` Andrew Morton
  0 siblings, 2 replies; 20+ messages in thread
From: Manfred Spraul @ 2004-06-18 21:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Dimitri Sivanich, linux-kernel, lse-tech, linux-mm

Andrew Morton wrote:

>Dimitri Sivanich <sivanich@sgi.com> wrote:
>
>>At the time of the holdoff (the point where we've spent a total of 30 usec in
>>the timer_interrupt), we've looped through more than 100 of the 131 caches,
>>usually closer to 120.
>
>ahh, ooh, ow, of course.
>
>Manfred, we need a separate list of "slabs which might need reaping".
>
A cache that might need reaping is a cache that has seen at least one
kmem_cache_free().  The list would trade less time in the timer context
for slower kmem_cache_free() calls.  I'm fairly certain that
this would end up as a big net loss.

>That'll help the average case.  To help the worst case we should change
>cache_reap() to only reap (say) ten caches from the head of the new list
>and then return.
>
I'll write something:
- allow disabling the DMA kmalloc caches for archs that do not need them.
- increase the timer frequency and scan only a few caches in each timer
tick (a rough sketch of such a partial scan is below).
- perhaps a quicker test for cache_reap to notice that nothing needs to
be done.  Right now four tests are done (!(flags & _NO_REAP),
ac->touched==0, ac->avail != 0, global timer not yet expired).  It's
possible to skip some tests, e.g. move the _NO_REAP caches onto a separate
list, and replace the time_after(.next_reap, jiffies) check with a separate
timer.
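
The partial scan could look roughly like this (untested; cache_reap_partial
and reap_cursor are made-up names, and the existing per-cache drain/reap
logic is elided):

static struct list_head *reap_cursor = &cache_chain;

static void cache_reap_partial(void)
{
	int budget = 10;		/* caches to examine per pass */

	if (down_trylock(&cache_chain_sem))
		return;			/* contended: just retry next pass */
	while (budget-- > 0) {
		kmem_cache_t *searchp;

		reap_cursor = reap_cursor->next;
		if (reap_cursor == &cache_chain)
			continue;	/* skip the list head */
		searchp = list_entry(reap_cursor, kmem_cache_t, next);
		/* ... existing drain/reap of searchp goes here ... */
	}
	up(&cache_chain_sem);
}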

--
    Manfred

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-18 14:33         ` Dimitri Sivanich
@ 2004-06-18 20:40           ` Andrew Morton
  2004-06-18 21:04             ` Manfred Spraul
  0 siblings, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2004-06-18 20:40 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: manfred, linux-kernel, lse-tech, linux-mm

Dimitri Sivanich <sivanich@sgi.com> wrote:
>
> At the time of the holdoff (the point where we've spent a total of 30 usec in
> the timer_interrupt), we've looped through more than 100 of the 131 caches,
> usually closer to 120.

ahh, ooh, ow, of course.

Manfred, we need a separate list of "slabs which might need reaping".

That'll help the average case.  To help the worst case we should change
cache_reap() to only reap (say) ten caches from the head of the new list
and then return.  Maybe increase the timer frequency too.

something like:

/*
 * FIFO list of caches (via kmem_cache_t.reap_list) which need consideration in
 * cache_reap().  Protected by cache_chain_sem.
 */
static LIST_HEAD(global_reap_list);

cache_reap()
{
	int i;

	for (i = 0; i < 10; i++) {
		kmem_cache_t *cachep;

		if (list_empty(&global_reap_list))
			break;
		cachep = list_entry(global_reap_list.next,
					kmem_cache_t, reap_list);
		list_del_init(&cachep->reap_list);
		<prune it>
	}
}

mark_cache_for_reaping(kmem_cache_t *cachep)
{
	if (list_empty(&cachep->reap_list)) {
		if (!down_trylock(&cache_chain_sem)) {
			/* add at the tail: the list is FIFO */
			list_add_tail(&cachep->reap_list, &global_reap_list);
			up(&cache_chain_sem);
		}
	}
}

Maybe cache_chain_sem should become a spinlock.

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-18  4:40       ` Andrew Morton
@ 2004-06-18 14:33         ` Dimitri Sivanich
  2004-06-18 20:40           ` Andrew Morton
  0 siblings, 1 reply; 20+ messages in thread
From: Dimitri Sivanich @ 2004-06-18 14:33 UTC (permalink / raw)
  To: Andrew Morton, manfred; +Cc: linux-kernel, lse-tech, linux-mm

On Thu, Jun 17, 2004 at 09:40:35PM -0700, Andrew Morton wrote:
> Against which slab cache?  How many objects are being reaped in a single
> timer tick?

Against all of them.  See my analysis below.

> 
> It's very simple:
> 
> --- 25/mm/slab.c~a	2004-06-17 21:38:57.728796976 -0700
> +++ 25-akpm/mm/slab.c	2004-06-17 21:40:06.294373424 -0700
> @@ -2690,6 +2690,7 @@ static void drain_array_locked(kmem_cach
>  static inline void cache_reap (void)
>  {
>  	struct list_head *walk;
> +	static int max;
>  
>  #if DEBUG
>  	BUG_ON(!in_interrupt());
> @@ -2731,6 +2732,11 @@ static inline void cache_reap (void)
>  		}
>  
>  		tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
> +		if (tofree > max) {
> +			max = tofree;
> +			printk("%s: reap %d\n", searchp->name, tofree);
> +		}
> +
>  		do {
>  			p = list3_data(searchp)->slabs_free.next;
>  			if (p == &(list3_data(searchp)->slabs_free))
> _

Andrew & Manfred, here's what I think you've been looking for.  Keep in mind
that this analysis is done on an essentially unloaded system:

At the time of the holdoff (the point where we've spent a total of 30 usec in
the timer_interrupt), we've looped through more than 100 of the 131 caches,
usually closer to 120.

The maximum number of slabs to free (what Andrew indicated he wanted) is
almost always 1 or 2 at the time of the holdoff, and the same for the number
of per-CPU blocks.

The total number of slabs we've freed is usually on the order of 1-3.

The total number of per-CPU blocks freed is usually on the order of 1-5, but
can be as high as 15.

The total number of times we've acquired the spin_lock is usually on the
order of 5-10.

Below I've included the slabinfo for your reading pleasure.

slabinfo - version: 2.0
# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <batchcount> <limit> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
ip_fib_hash           11    452     32  452    1 : tunables  120   60    8 : slabdata      1      1      0
rpc_buffers            8      8   2048    8    1 : tunables   24   12    8 : slabdata      1      1      0
rpc_tasks              8     42    384   42    1 : tunables   54   27    8 : slabdata      1      1      0
rpc_inode_cache        0      0    896   18    1 : tunables   54   27    8 : slabdata      0      0      0
unix_sock             29     84    768   21    1 : tunables   54   27    8 : slabdata      4      4      0
ip_mrt_cache           0      0    128  123    1 : tunables  120   60    8 : slabdata      0      0      0
tcp_tw_bucket          7     62    256   62    1 : tunables  120   60    8 : slabdata      1      1      0
tcp_bind_bucket       15    452     32  452    1 : tunables  120   60    8 : slabdata      1      1      0
tcp_open_request       0      0    128  123    1 : tunables  120   60    8 : slabdata      0      0      0
inet_peer_cache        3    123    128  123    1 : tunables  120   60    8 : slabdata      1      1      0
secpath_cache          0      0    256   62    1 : tunables  120   60    8 : slabdata      0      0      0
xfrm_dst_cache         0      0    384   42    1 : tunables   54   27    8 : slabdata      0      0      0
ip_dst_cache          30     84    384   42    1 : tunables   54   27    8 : slabdata      2      2      0
arp_cache              4     62    256   62    1 : tunables  120   60    8 : slabdata      1      1      0
raw4_sock              0      0    896   18    1 : tunables   54   27    8 : slabdata      0      0      0
udp_sock               3     18    896   18    1 : tunables   54   27    8 : slabdata      1      1      0
tcp_sock              28     44   1408   11    1 : tunables   24   12    8 : slabdata      4      4      0
flow_cache             0      0    128  123    1 : tunables  120   60    8 : slabdata      0      0      0
kcopyd-jobs          512    528    360   44    1 : tunables   54   27    8 : slabdata     12     12      0
dm_tio                 0      0     24  581    1 : tunables  120   60    8 : slabdata      0      0      0
dm_io                  0      0     32  452    1 : tunables  120   60    8 : slabdata      0      0      0
scsi_cmd_cache        34    124    512   31    1 : tunables   54   27    8 : slabdata      4      4      1
qla2xxx_srbs           0      0    256   62    1 : tunables  120   60    8 : slabdata      0      0      0
dm_session             0      0    416   38    1 : tunables   54   27    8 : slabdata      0      0      0
dm_fsreg               0      0    336   48    1 : tunables   54   27    8 : slabdata      0      0      0
dm_tokdata             0      0     72  214    1 : tunables  120   60    8 : slabdata      0      0      0
xfs_acl                0      0    304   53    1 : tunables   54   27    8 : slabdata      0      0      0
xfs_chashlist        944   1356     32  452    1 : tunables  120   60    8 : slabdata      3      3      0
xfs_ili              137    332    192   83    1 : tunables  120   60    8 : slabdata      4      4      0
xfs_ifork              0      0     64  240    1 : tunables  120   60    8 : slabdata      0      0      0
xfs_efi_item           0      0    352   45    1 : tunables   54   27    8 : slabdata      0      0      0
xfs_efd_item           0      0    360   44    1 : tunables   54   27    8 : slabdata      0      0      0
xfs_buf_item           6    172    184   86    1 : tunables  120   60    8 : slabdata      2      2      0
xfs_dabuf             16    581     24  581    1 : tunables  120   60    8 : slabdata      1      1      0
xfs_da_state          16     33    488   33    1 : tunables   54   27    8 : slabdata      1      1      0
xfs_trans              1     72    872   18    1 : tunables   54   27    8 : slabdata      1      4      0
xfs_inode          19458  19560    528   30    1 : tunables   54   27    8 : slabdata    652    652      0
xfs_btree_cur         16     83    192   83    1 : tunables  120   60    8 : slabdata      1      1      0
xfs_bmap_free_item      0      0     24  581    1 : tunables  120   60    8 : slabdata      0      0      0
xfs_buf_t             36    217    512   31    1 : tunables   54   27    8 : slabdata      7      7      2
linvfs_icache      19472  19488    768   21    1 : tunables   54   27    8 : slabdata    928    928      0
nfs_write_data        36     42    768   21    1 : tunables   54   27    8 : slabdata      2      2      0
nfs_read_data         32     42    768   21    1 : tunables   54   27    8 : slabdata      2      2      0
nfs_inode_cache        0      0   1024   15    1 : tunables   54   27    8 : slabdata      0      0      0
nfs_page               0      0    256   62    1 : tunables  120   60    8 : slabdata      0      0      0
isofs_inode_cache      0      0    640   25    1 : tunables   54   27    8 : slabdata      0      0      0
fat_inode_cache        0      0    768   21    1 : tunables   54   27    8 : slabdata      0      0      0
hugetlbfs_inode_cache      1     21    768   21    1 : tunables   54   27    8 : slabdata      1      1      0
ext2_inode_cache       0      0    768   21    1 : tunables   54   27    8 : slabdata      0      0      0
ext2_xattr             0      0     88  177    1 : tunables  120   60    8 : slabdata      0      0      0
journal_handle         0      0     48  312    1 : tunables  120   60    8 : slabdata      0      0      0
journal_head           0      0     88  177    1 : tunables  120   60    8 : slabdata      0      0      0
revoke_table           0      0     16  816    1 : tunables  120   60    8 : slabdata      0      0      0
revoke_record          0      0     32  452    1 : tunables  120   60    8 : slabdata      0      0      0
ext3_inode_cache       0      0    896   18    1 : tunables   54   27    8 : slabdata      0      0      0
ext3_xattr             0      0     88  177    1 : tunables  120   60    8 : slabdata      0      0      0
dquot                  0      0    256   62    1 : tunables  120   60    8 : slabdata      0      0      0
eventpoll_pwq          0      0     72  214    1 : tunables  120   60    8 : slabdata      0      0      0
eventpoll_epi          0      0    256   62    1 : tunables  120   60    8 : slabdata      0      0      0
kioctx                 0      0    384   42    1 : tunables   54   27    8 : slabdata      0      0      0
kiocb                  0      0    512   31    1 : tunables   54   27    8 : slabdata      0      0      0
dnotify_cache          0      0     40  371    1 : tunables  120   60    8 : slabdata      0      0      0
file_lock_cache        2    188    168   94    1 : tunables  120   60    8 : slabdata      2      2      0
fasync_cache           0      0     24  581    1 : tunables  120   60    8 : slabdata      0      0      0
shared_policy_node      0      0     56  272    1 : tunables  120   60    8 : slabdata      0      0      0
numa_policy            0      0     40  371    1 : tunables  120   60    8 : slabdata      0      0      0
shmem_inode_cache      4     18    896   18    1 : tunables   54   27    8 : slabdata      1      1      0
posix_timers_cache      0      0    144  110    1 : tunables  120   60    8 : slabdata      0      0      0
uid_cache              4    240     64  240    1 : tunables  120   60    8 : slabdata      1      1      0
sgpool-128            32     32   4096    4    1 : tunables   24   12    8 : slabdata      8      8      0
sgpool-64             32     32   2048    8    1 : tunables   24   12    8 : slabdata      4      4      0
sgpool-32             32     45   1024   15    1 : tunables   54   27    8 : slabdata      3      3      0
sgpool-16             32     62    512   31    1 : tunables   54   27    8 : slabdata      2      2      0
sgpool-8             141    310    256   62    1 : tunables  120   60    8 : slabdata      5      5      1
cfq_pool             145    272     56  272    1 : tunables  120   60    8 : slabdata      1      1      0
crq_pool             129    428     72  214    1 : tunables  120   60    8 : slabdata      2      2      7
deadline_drq           0      0     96  162    1 : tunables  120   60    8 : slabdata      0      0      0
as_arq                 0      0    112  140    1 : tunables  120   60    8 : slabdata      0      0      0
blkdev_requests       65    180    264   60    1 : tunables   54   27    8 : slabdata      3      3      4
biovec-BIO_MAX_PAGES    256    256   4096    4    1 : tunables   24   12    8 : slabdata     64     64      0
biovec-128           256    256   2048    8    1 : tunables   24   12    8 : slabdata     32     32      0
biovec-64            256    270   1024   15    1 : tunables   54   27    8 : slabdata     18     18      0
biovec-16            256    310    256   62    1 : tunables  120   60    8 : slabdata      5      5      0
biovec-4             256    480     64  240    1 : tunables  120   60    8 : slabdata      2      2      0
biovec-1             334    816     16  816    1 : tunables  120   60    8 : slabdata      1      1      1
bio                  340    615    128  123    1 : tunables  120   60    8 : slabdata      5      5      3
sock_inode_cache      69     84    768   21    1 : tunables   54   27    8 : slabdata      4      4      0
skbuff_head_cache    414    462    384   42    1 : tunables   54   27    8 : slabdata     11     11      0
sock                   3     50    640   25    1 : tunables   54   27    8 : slabdata      2      2      0
proc_inode_cache    1112   1200    640   25    1 : tunables   54   27    8 : slabdata     48     48      0
sigqueue             170    297    160   99    1 : tunables  120   60    8 : slabdata      3      3      0
radix_tree_node     1063   1110    536   30    1 : tunables   54   27    8 : slabdata     37     37      0
bdev_cache             4     18    896   18    1 : tunables   54   27    8 : slabdata      1      1      0
mnt_cache             35    123    128  123    1 : tunables  120   60    8 : slabdata      1      1      0
inode_cache         2347   2400    640   25    1 : tunables   54   27    8 : slabdata     96     96      0
dentry_cache       24571  24614    256   62    1 : tunables  120   60    8 : slabdata    397    397      0
filp                 760    868    256   62    1 : tunables  120   60    8 : slabdata     14     14      0
names_cache           51     68   4096    4    1 : tunables   24   12    8 : slabdata     17     17     18
idr_layer_cache       26     30    528   30    1 : tunables   54   27    8 : slabdata      1      1      0
buffer_head          464    648     96  162    1 : tunables  120   60    8 : slabdata      4      4      0
mm_struct            136    195   1024   15    1 : tunables   54   27    8 : slabdata     13     13      0
vm_area_struct      1933   2070    176   90    1 : tunables  120   60    8 : slabdata     23     23    224
fs_cache             111    480     64  240    1 : tunables  120   60    8 : slabdata      2      2      0
files_cache           75    180    896   18    1 : tunables   54   27    8 : slabdata     10     10      0
signal_cache         171    369    128  123    1 : tunables  120   60    8 : slabdata      3      3      0
sighand_cache        125    135   1664    9    1 : tunables   24   12    8 : slabdata     15     15      0
anon_vma             944   1162     24  581    1 : tunables  120   60    8 : slabdata      2      2      0
size-131072(DMA)       0      0 131072    1    8 : tunables    8    4    0 : slabdata      0      0      0
size-131072            0      0 131072    1    8 : tunables    8    4    0 : slabdata      0      0      0
size-65536(DMA)        0      0  65536    1    4 : tunables    8    4    0 : slabdata      0      0      0
size-65536             0      0  65536    1    4 : tunables    8    4    0 : slabdata      0      0      0
size-32768(DMA)        0      0  32768    1    2 : tunables    8    4    0 : slabdata      0      0      0
size-32768             9      9  32768    1    2 : tunables    8    4    0 : slabdata      9      9      0
size-16384(DMA)        0      0  16384    1    1 : tunables   24   12    8 : slabdata      0      0      0
size-16384            12     12  16384    1    1 : tunables   24   12    8 : slabdata     12     12      0
size-8192(DMA)         0      0   8192    2    1 : tunables   24   12    8 : slabdata      0      0      0
size-8192             33     34   8192    2    1 : tunables   24   12    8 : slabdata     17     17      0
size-4096(DMA)         0      0   4096    4    1 : tunables   24   12    8 : slabdata      0      0      0
size-4096            197    236   4096    4    1 : tunables   24   12    8 : slabdata     59     59      0
size-2048(DMA)         0      0   2048    8    1 : tunables   24   12    8 : slabdata      0      0      0
size-2048            339    368   2048    8    1 : tunables   24   12    8 : slabdata     46     46      0
size-1024(DMA)         0      0   1024   15    1 : tunables   54   27    8 : slabdata      0      0      0
size-1024            418    465   1024   15    1 : tunables   54   27    8 : slabdata     31     31      0
size-512(DMA)          0      0    512   31    1 : tunables   54   27    8 : slabdata      0      0      0
size-512             863    868    512   31    1 : tunables   54   27    8 : slabdata     28     28      0
size-256(DMA)          0      0    256   62    1 : tunables  120   60    8 : slabdata      0      0      0
size-256            1672   1674    256   62    1 : tunables  120   60    8 : slabdata     27     27      0
size-128(DMA)          0      0    128  123    1 : tunables  120   60    8 : slabdata      0      0      0
size-128             585    738    128  123    1 : tunables  120   60    8 : slabdata      6      6      0
size-64(DMA)           0      0     64  240    1 : tunables  120   60    8 : slabdata      0      0      0
size-64             4001   4080     64  240    1 : tunables  120   60    8 : slabdata     17     17      0
kmem_cache           148    154    704   22    1 : tunables   54   27    8 : slabdata      7      7      0

* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-17 13:10     ` Dimitri Sivanich
@ 2004-06-18  4:40       ` Andrew Morton
  2004-06-18 14:33         ` Dimitri Sivanich
  0 siblings, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2004-06-18  4:40 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: manfred, linux-kernel, lse-tech, linux-mm

Dimitri Sivanich <sivanich@sgi.com> wrote:
>
> On Wed, Jun 16, 2004 at 08:58:58PM +0200, Manfred Spraul wrote:
>  > Could you try to reduce them? Something like (as root)
>  > 
>  > # cd /proc
>  > # cat slabinfo | gawk '{printf("echo \"%s %d %d %d\" > 
>  > /proc/slabinfo\n", $1,$9,4,2);}' | bash
>  > 
>  > If this doesn't help then perhaps the timer should run more frequently 
>  > and scan only a part of the list of slab caches.
> 
>  I tried the modification you suggested and it had little effect.  On a 4-CPU
>  (otherwise idle) system I saw the characteristic 30+ usec interruptions
>  (holdoffs) every 2 seconds.

Against which slab cache?  How many objects are being reaped in a single
timer tick?

It's very simple:

--- 25/mm/slab.c~a	2004-06-17 21:38:57.728796976 -0700
+++ 25-akpm/mm/slab.c	2004-06-17 21:40:06.294373424 -0700
@@ -2690,6 +2690,7 @@ static void drain_array_locked(kmem_cach
 static inline void cache_reap (void)
 {
 	struct list_head *walk;
+	static int max;
 
 #if DEBUG
 	BUG_ON(!in_interrupt());
@@ -2731,6 +2732,11 @@ static inline void cache_reap (void)
 		}
 
 		tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
+		if (tofree > max) {
+			max = tofree;
+			printk("%s: reap %d\n", searchp->name, tofree);
+		}
+
 		do {
 			p = list3_data(searchp)->slabs_free.next;
 			if (p == &(list3_data(searchp)->slabs_free))
_


* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 18:58   ` Manfred Spraul
@ 2004-06-17 13:10     ` Dimitri Sivanich
  2004-06-18  4:40       ` Andrew Morton
  0 siblings, 1 reply; 20+ messages in thread
From: Dimitri Sivanich @ 2004-06-17 13:10 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: linux-kernel, lse-tech, linux-mm

Manfred,

On Wed, Jun 16, 2004 at 08:58:58PM +0200, Manfred Spraul wrote:
> Could you try to reduce them? Something like (as root)
> 
> # cd /proc
> # cat slabinfo | gawk '{printf("echo \"%s %d %d %d\" > 
> /proc/slabinfo\n", $1,$9,4,2);}' | bash
> 
> If this doesn't help then perhaps the timer should run more frequently 
> and scan only a part of the list of slab caches.

I tried the modification you suggested and it had little effect.  On a 4-CPU
(otherwise idle) system I saw the characteristic 30+ usec interruptions
(holdoffs) every 2 seconds.

Since it's running in timer context, this of course includes all of the
timer_interrupt logic, but I've verified that no other timers are running
during those times (and I see only very short holdoffs during other timer
interrupts).


* Re: [PATCH]: Option to run cache reap in thread mode
  2004-06-16 18:02 ` Dimitri Sivanich
@ 2004-06-16 18:58   ` Manfred Spraul
  2004-06-17 13:10     ` Dimitri Sivanich
  0 siblings, 1 reply; 20+ messages in thread
From: Manfred Spraul @ 2004-06-16 18:58 UTC (permalink / raw)
  To: Dimitri Sivanich; +Cc: linux-kernel, lse-tech, linux-mm

Dimitri Sivanich wrote:

>>Do you use the default batchcount values or have you increased the values?
>
>Default.
Could you try to reduce them? Something like (as root)

# cd /proc
# cat slabinfo | gawk '{printf("echo \"%s %d %d %d\" > 
/proc/slabinfo\n", $1,$9,4,2);}' | bash

If this doesn't help then perhaps the timer should run more frequently 
and scan only a part of the list of slab caches.

--
    Manfred

* Re: [PATCH]: Option to run cache reap in thread mode
       [not found] ` <m3r7sfmq0r.fsf@averell.firstfloor.org>
@ 2004-06-16 18:16   ` Dimitri Sivanich
  0 siblings, 0 replies; 20+ messages in thread
From: Dimitri Sivanich @ 2004-06-16 18:16 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Manfred Spraul, linux-kernel, linux-mm, lse-tech

On Wed, Jun 16, 2004 at 06:22:28PM +0200, Andi Kleen wrote:
> Dimitri Sivanich <sivanich@sgi.com> writes:
> 
> > I would like to know what others think about running cache_reap() as a
> > low-priority realtime kthread, at least on certain CPUs that would be configured
> > that way (probably configured at boot time initially).  I've been doing some
> > testing running it this way on CPUs whose activity is mostly restricted to
> > realtime work (requiring rapid response times).
> >
> > Here's my first cut at an initial patch for this (there will be other changes
> > later to set the configuration and to optimize locking in cache_reap()).
> 
> I would run it in the standard work queue threads. We already have 
> too many kernel threads, no need to add more.
> 
> Also is there really a need for it to be real time? 

Not especially.  Normal time sharing would be OK with me, but I'd like to
emphasize that if it is real time, it would need to be the lowest priority.

> Note that we don't make any attempt at all in the Linux kernel to handle
> lock priority inversion, so this isn't an argument.
> 
> -Andi

* Re: [PATCH]: Option to run cache reap in thread mode
       [not found] <40D08225.6060900@colorfullife.com>
@ 2004-06-16 18:02 ` Dimitri Sivanich
  2004-06-16 18:58   ` Manfred Spraul
  0 siblings, 1 reply; 20+ messages in thread
From: Dimitri Sivanich @ 2004-06-16 18:02 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: linux-kernel, lse-tech, linux-mm

On Wed, Jun 16, 2004 at 07:23:49PM +0200, Manfred Spraul wrote:
> Dimitri wrote:
> 
> >In the process of testing per-CPU interrupt response times and CPU
> >availability, I've found that running cache_reap() from a timer, as is
> >done currently, results in some fairly long CPU holdoffs.
> >
> What is fairly long?

Into the 100s of usec.  I consider anything over 30 usec too long;
I've seen it take longer than 30 usec on a small (8p) system.

> If cache_reap() is slow then the caches are too large.
> Could you limit cachep->free_limit and check if that helps? It's right 
> now scaled by num_online_cpus() - that's probably too much. It's 
> unlikely that all 500 cpus will try to refill their cpu arrays at the 
> same time. Something like a logarithmic increase should be sufficient.

I haven't tried this yet, but I'm even seeing this on 4-CPU systems.
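
(For reference, the scaling Manfred means is, if I read the code right, the
free_limit calculation; a logarithmic variant might look like the following
- untested, and using fls() as a cheap log2 is my own assumption:

	cachep->free_limit = (1 + fls(num_online_cpus())) * cachep->batchcount
				+ cachep->num;

instead of scaling linearly with (1 + num_online_cpus()).)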

> Do you use the default batchcount values or have you increased the values?

Default.

> I think the SGI ia64 systems do not work with slab debugging, but please
> check that debugging is off.  Debug enabled is slow.

# CONFIG_DEBUG_SLAB is not set

> 
> --
>    Manfred

Dimitri Sivanich <sivanich@sgi.com>

* Re: [PATCH]: Option to run cache reap in thread mode
@ 2004-06-16 16:43 Mark_H_Johnson
  0 siblings, 0 replies; 20+ messages in thread
From: Mark_H_Johnson @ 2004-06-16 16:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Andrew Morton, linux-kernel, linux-mm, Dimitri Sivanich

>Well, if you want deterministic interrupt latencies you should go for a
>realtime OS.  I know Linux is the big thing in the industry, but you're
>really better off looking for a small Hard RT OS.  From the OpenSource
>world, eCos or RTEMS come to mind.  Or even rtlinux/rtai if you want to
>run a full Linux kernel as the idle task.

I don't think the OP himself wants to run RT on Linux; he has customers who
want to do so.  Customers, of course, are a pretty stubborn bunch and will
demand to use the system in ways you consider inappropriate.  You may be
arguing with the wrong guy.

Getting back to the previous comment as well:
>YAKT, sigh..  I don't quite understand what you mean by a "holdoff", so
>maybe you could explain what problem you see?  You don't like cache_reap
>being called from timer context?

Are you concerned more about the proliferation of kernel threads, or about
the fact that this function is getting moved from the timer context to a
thread?

In the first case, one could argue that we don't need separate threads
titled
 - migration
 - ksoftirq
 - events
 - kblockd
 - aio
and so on [and now cache_reap], one per CPU, if there were a mechanism to
schedule work to be done on a regular basis on each CPU.  Perhaps this patch
should be modified to work with one of these existing kernel threads
instead (or collapse a few of these into a "janitor thread" per CPU).

In the second case, can you explain the rationale for the concern more
fully?  I would expect moving stuff out of the timer context to be a
"good thing" for most if not all systems - not just those wanting good
real-time response.

--Mark H Johnson
  <mailto:Mark_H_Johnson@raytheon.com>


Thread overview: 20+ messages
2004-06-16 14:24 [PATCH]: Option to run cache reap in thread mode Dimitri Sivanich
2004-06-16 15:29 ` Christoph Hellwig
2004-06-16 16:03   ` Dimitri Sivanich
2004-06-16 16:07     ` Christoph Hellwig
2004-06-16 16:25       ` Jesse Barnes
2004-06-16 16:51         ` Dimitri Sivanich
2004-06-16 16:46 ` Lori Gilbertson
2004-06-16 16:53   ` Christoph Hellwig
2004-06-16 21:30 ` Andrew Morton
2004-06-16 16:43 Mark_H_Johnson
     [not found] <40D08225.6060900@colorfullife.com>
2004-06-16 18:02 ` Dimitri Sivanich
2004-06-16 18:58   ` Manfred Spraul
2004-06-17 13:10     ` Dimitri Sivanich
2004-06-18  4:40       ` Andrew Morton
2004-06-18 14:33         ` Dimitri Sivanich
2004-06-18 20:40           ` Andrew Morton
2004-06-18 21:04             ` Manfred Spraul
2004-06-18 21:44               ` Dimitri Sivanich
2004-06-18 22:03               ` Andrew Morton
     [not found] <27JKg-4ht-11@gated-at.bofh.it>
     [not found] ` <m3r7sfmq0r.fsf@averell.firstfloor.org>
2004-06-16 18:16   ` Dimitri Sivanich
