linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Ravikiran G Thirumalai <kiran@scalex86.org>
To: Andrew Morton <akpm@osdl.org>
Cc: nickpiggin@yahoo.com.au, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, ak@suse.de, shai@scalex86.org,
	pravin.shelar@calsoftinc.com
Subject: Re: High lock spin time for zone->lru_lock under extreme conditions
Date: Mon, 15 Jan 2007 18:56:44 -0800	[thread overview]
Message-ID: <20070116025644.GA3954@localhost.localdomain> (raw)
In-Reply-To: <20070113132023.0f8d2da8.akpm@osdl.org>

On Sat, Jan 13, 2007 at 01:20:23PM -0800, Andrew Morton wrote:
> 
> Seeing the code helps.

But there was a subtle problem with hold time instrumentation here.
The code assumed the critical section exiting through 
spin_unlock_irq entered critical section with spin_lock_irq, but that
might not be the case always, and the instrumentation for hold time goes bad
when that happens (as in shrink_inactive_list)

> 
> >  The
> > instrumentation goes like this:
> > 
> > void __lockfunc _spin_lock_irq(spinlock_t *lock)
> > {
> >         unsigned long long t1,t2;
> >         local_irq_disable();
> >         t1 = get_cycles_sync();
> >         preempt_disable();
> >         spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
> >         _raw_spin_lock(lock);
> >         t2 = get_cycles_sync();
> >         lock->raw_lock.htsc = t2;
> >         if (lock->spin_time < (t2 - t1))
> >                 lock->spin_time = t2 - t1;
> > }
> > ...
> > 
> > void __lockfunc _spin_unlock_irq(spinlock_t *lock)
> > {
> >         unsigned long long t1 ;
> >         spin_release(&lock->dep_map, 1, _RET_IP_);
> >         t1 = get_cycles_sync();
> >         if (lock->cs_time < (t1 -  lock->raw_lock.htsc))
> >                 lock->cs_time = t1 -  lock->raw_lock.htsc;
> >         _raw_spin_unlock(lock);
> >         local_irq_enable();
> >         preempt_enable();
> > }
> > 
...
> 
> OK, now we need to do a dump_stack() each time we discover a new max hold
> time.  That might a bit tricky: the printk code does spinlocking too so
> things could go recursively deadlocky.  Maybe make spin_unlock_irq() return
> the hold time then do:

What I found now after fixing the above is that hold time is not bad --
249461 cycles on the 2.6 GHZ opteron with powernow disabled in the BIOS.
The spin time is still in orders of seconds.

Hence this looks like a hardware fairness issue.

Attaching the instrumentation patch with this email FR.


Index: linux-2.6.20-rc4.spin_instru/include/asm-x86_64/spinlock.h
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/include/asm-x86_64/spinlock.h	2007-01-14 22:36:46.694248000 -0800
+++ linux-2.6.20-rc4.spin_instru/include/asm-x86_64/spinlock.h	2007-01-15 15:40:36.554248000 -0800
@@ -6,6 +6,18 @@
 #include <asm/page.h>
 #include <asm/processor.h>
 
+/* Like get_cycles, but make sure the CPU is synchronized. */
+static inline unsigned long long get_cycles_sync2(void)
+{
+	unsigned long long ret;
+	unsigned eax;
+	/* Don't do an additional sync on CPUs where we know
+	   RDTSC is already synchronous. */
+	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
+			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
+	rdtscll(ret);
+	return ret;
+}
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  *
@@ -34,6 +46,7 @@ static inline void __raw_spin_lock(raw_s
 		"jle 3b\n\t"
 		"jmp 1b\n"
 		"2:\t" : "=m" (lock->slock) : : "memory");
+	lock->htsc = get_cycles_sync2();
 }
 
 /*
@@ -62,6 +75,7 @@ static inline void __raw_spin_lock_flags
 		"jmp 4b\n"
 		"5:\n\t"
 		: "+m" (lock->slock) : "r" ((unsigned)flags) : "memory");
+		lock->htsc = get_cycles_sync2();
 }
 #endif
 
@@ -74,11 +88,16 @@ static inline int __raw_spin_trylock(raw
 		:"=q" (oldval), "=m" (lock->slock)
 		:"0" (0) : "memory");
 
+	if (oldval)
+		lock->htsc = get_cycles_sync2();
 	return oldval > 0;
 }
 
 static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
+	unsigned long long t = get_cycles_sync2();
+	if (lock->hold_time <  t - lock->htsc)
+		lock->hold_time = t - lock->htsc;
 	asm volatile("movl $1,%0" :"=m" (lock->slock) :: "memory");
 }
 
Index: linux-2.6.20-rc4.spin_instru/include/asm-x86_64/spinlock_types.h
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/include/asm-x86_64/spinlock_types.h	2007-01-14 22:36:46.714248000 -0800
+++ linux-2.6.20-rc4.spin_instru/include/asm-x86_64/spinlock_types.h	2007-01-15 14:23:37.204248000 -0800
@@ -7,9 +7,11 @@
 
 typedef struct {
 	unsigned int slock;
+	unsigned long long hold_time;
+	unsigned long long htsc;
 } raw_spinlock_t;
 
-#define __RAW_SPIN_LOCK_UNLOCKED	{ 1 }
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 1,0,0 }
 
 typedef struct {
 	unsigned int lock;
Index: linux-2.6.20-rc4.spin_instru/include/linux/spinlock.h
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/include/linux/spinlock.h	2007-01-14 22:36:48.464248000 -0800
+++ linux-2.6.20-rc4.spin_instru/include/linux/spinlock.h	2007-01-14 22:41:30.964248000 -0800
@@ -231,8 +231,8 @@ do {								\
 # define spin_unlock(lock)		__raw_spin_unlock(&(lock)->raw_lock)
 # define read_unlock(lock)		__raw_read_unlock(&(lock)->raw_lock)
 # define write_unlock(lock)		__raw_write_unlock(&(lock)->raw_lock)
-# define spin_unlock_irq(lock) \
-    do { __raw_spin_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
+# define spin_unlock_irq(lock) _spin_unlock_irq(lock)
+/*  do { __raw_spin_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)*/
 # define read_unlock_irq(lock) \
     do { __raw_read_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
 # define write_unlock_irq(lock) \
Index: linux-2.6.20-rc4.spin_instru/include/linux/spinlock_types.h
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/include/linux/spinlock_types.h	2006-11-29 13:57:37.000000000 -0800
+++ linux-2.6.20-rc4.spin_instru/include/linux/spinlock_types.h	2007-01-15 14:27:50.664248000 -0800
@@ -19,6 +19,7 @@
 
 typedef struct {
 	raw_spinlock_t raw_lock;
+	unsigned long long spin_time;
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
 	unsigned int break_lock;
 #endif
@@ -78,7 +79,8 @@ typedef struct {
 				RW_DEP_MAP_INIT(lockname) }
 #else
 # define __SPIN_LOCK_UNLOCKED(lockname) \
-	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
+	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,\
+				.spin_time = 0	\
 				SPIN_DEP_MAP_INIT(lockname) }
 #define __RW_LOCK_UNLOCKED(lockname) \
 	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
Index: linux-2.6.20-rc4.spin_instru/kernel/spinlock.c
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/kernel/spinlock.c	2006-11-29 13:57:37.000000000 -0800
+++ linux-2.6.20-rc4.spin_instru/kernel/spinlock.c	2007-01-15 14:29:47.374248000 -0800
@@ -99,10 +99,15 @@ EXPORT_SYMBOL(_spin_lock_irqsave);
 
 void __lockfunc _spin_lock_irq(spinlock_t *lock)
 {
+	unsigned long long t1,t2;
 	local_irq_disable();
+	t1 = get_cycles_sync();
 	preempt_disable();
 	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	_raw_spin_lock(lock);
+ 	t2 = get_cycles_sync();
+ 	if (lock->spin_time < (t2 - t1))
+ 		lock->spin_time = t2 - t1;
 }
 EXPORT_SYMBOL(_spin_lock_irq);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

      reply	other threads:[~2007-01-16  2:56 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-01-12 16:01 Ravikiran G Thirumalai
2007-01-12 17:03 ` Peter Zijlstra
2007-01-12 19:46 ` Christoph Lameter
2007-01-12 21:25   ` Andrew Morton
2007-01-12 21:40   ` Ravikiran G Thirumalai
2007-01-12 21:45     ` Christoph Lameter
2007-01-13  1:00       ` Ravikiran G Thirumalai
2007-01-13  1:11         ` Andrew Morton
2007-01-13  7:42           ` Ravikiran G Thirumalai
2007-01-13  4:39 ` Nick Piggin
2007-01-13  7:36   ` Ravikiran G Thirumalai
2007-01-13  7:53     ` Nick Piggin
2007-01-13  8:00     ` Andrew Morton
2007-01-13 19:53       ` Ravikiran G Thirumalai
2007-01-13 21:20         ` Andrew Morton
2007-01-16  2:56           ` Ravikiran G Thirumalai [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070116025644.GA3954@localhost.localdomain \
    --to=kiran@scalex86.org \
    --cc=ak@suse.de \
    --cc=akpm@osdl.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=nickpiggin@yahoo.com.au \
    --cc=pravin.shelar@calsoftinc.com \
    --cc=shai@scalex86.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox