linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Andy Lutomirski <luto@kernel.org>
To: x86@kernel.org, linux-kernel@vger.kernel.org
Cc: Borislav Petkov <bp@alien8.de>, Brian Gerst <brgerst@gmail.com>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Oleg Nesterov <oleg@redhat.com>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	Andy Lutomirski <luto@kernel.org>,
	stable@vger.kernel.org
Subject: [RFC 05/13] x86/mm: Add barriers and document switch_mm-vs-flush synchronization
Date: Fri,  8 Jan 2016 15:15:23 -0800	[thread overview]
Message-ID: <95a853538da28c64dfc877c60549ec79ed7a5d69.1452294700.git.luto@kernel.org> (raw)
In-Reply-To: <cover.1452294700.git.luto@kernel.org>
In-Reply-To: <cover.1452294700.git.luto@kernel.org>

When switch_mm activates a new pgd, it also sets a bit that tells
other CPUs that the pgd is in use so that tlb flush IPIs will be
sent.  In order for that to work correctly, the bit needs to be
visible prior to loading the pgd and therefore starting to fill the
local TLB.

Document all the barriers that make this work correctly and add a
couple that were missing.

Cc: stable@vger.kernel.org
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/include/asm/mmu_context.h | 33 ++++++++++++++++++++++++++++++++-
 arch/x86/mm/tlb.c                  | 29 ++++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd3658799..1edc9cd198b8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,34 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #endif
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		/* Re-load page tables */
+		/*
+		 * Re-load page tables.
+		 *
+		 * This logic has an ordering constraint:
+		 *
+		 *  CPU 0: Write to a PTE for 'next'
+		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+		 *  CPU 1: set bit 1 in next's mm_cpumask
+		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+		 *
+		 * We need to prevent an outcome in which CPU 1 observes
+		 * the new PTE value and CPU 0 observes bit 1 clear in
+		 * mm_cpumask.  (If that occurs, then the IPI will never
+		 * be sent, and CPU 0's TLB will contain a stale entry.)
+		 *
+		 * The bad outcome can occur if either CPU's load is
+		 * reordered before that CPU's store, so both CPUs much
+		 * execute full barriers to prevent this from happening.
+		 *
+		 * Thus, switch_mm needs a full barrier between the
+		 * store to mm_cpumask and any operation that could load
+		 * from next->pgd.  This barrier synchronizes with
+		 * remote TLB flushers.  Fortunately, load_cr3 is
+		 * serializing and thus acts as a full barrier.
+		 *
+		 */
 		load_cr3(next->pgd);
+
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
 		/* Stop flush ipis for the previous mm */
@@ -156,10 +182,15 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			 * schedule, protecting us from simultaneous changes.
 			 */
 			cpumask_set_cpu(cpu, mm_cpumask(next));
+
 			/*
 			 * We were in lazy tlb mode and leave_mm disabled
 			 * tlb flush IPI delivery. We must reload CR3
 			 * to make sure to use no freed page tables.
+			 *
+			 * As above, this is a barrier that forces
+			 * TLB repopulation to be ordered after the
+			 * store to mm_cpumask.
 			 */
 			load_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0d66fb..8f4cc3dfac32 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
 	preempt_disable();
 
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+	/* This is an implicit full barrier that synchronizes with switch_mm. */
 	local_flush_tlb();
+
 	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
 
 	preempt_disable();
-	if (current->active_mm != mm)
+	if (current->active_mm != mm) {
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
+	}
 
 	if (!current->mm) {
 		leave_mm(smp_processor_id());
+
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
 	}
 
 	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
 		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
 
+	/*
+	 * Both branches below are implicit full barriers (MOV to CR or
+	 * INVLPG) that synchronize with switch_mm.
+	 */
 	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
 		base_pages_to_flush = TLB_FLUSH_ALL;
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 	preempt_disable();
 
 	if (current->active_mm == mm) {
-		if (current->mm)
+		if (current->mm) {
+			/*
+			 * Implicit full barrier (INVLPG) that synchronizes
+			 * with switch_mm.
+			 */
 			__flush_tlb_one(start);
-		else
+		} else {
 			leave_mm(smp_processor_id());
+
+			/* Synchronize with switch_mm. */
+			smp_mb();
+		}
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-- 
2.5.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2016-01-08 23:15 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-01-08 23:15 [RFC 00/13] x86/mm: PCID and INVPCID Andy Lutomirski
2016-01-08 23:15 ` [RFC 01/13] x86/paravirt: Turn KASAN off for parvirt.o Andy Lutomirski
2016-01-10 18:59   ` Borislav Petkov
2016-01-11 12:51     ` Andrey Ryabinin
2016-01-11 12:51       ` [PATCH 1/2] x86/kasan: clear kasan_zero_page after TLB flush Andrey Ryabinin
2016-01-18 22:24         ` Andy Lutomirski
2016-01-11 12:51       ` [PATCH 2/2] x86/kasan: write protect kasan zero shadow Andrey Ryabinin
2016-01-18 22:24         ` Andy Lutomirski
2016-01-29 10:35       ` [RFC 01/13] x86/paravirt: Turn KASAN off for parvirt.o Borislav Petkov
2016-01-08 23:15 ` [RFC 02/13] x86/mm: Add INVPCID helpers Andy Lutomirski
2016-01-08 23:15 ` [RFC 03/13] x86/mm: Add a noinvpcid option to turn off INVPCID Andy Lutomirski
2016-01-08 23:15 ` [RFC 04/13] x86/mm: If INVPCID is available, use it to flush global mappings Andy Lutomirski
2016-01-08 23:15 ` Andy Lutomirski [this message]
2016-06-03 17:42   ` [RFC 05/13] x86/mm: Add barriers and document switch_mm-vs-flush synchronization Nadav Amit
2016-06-09 17:24     ` Andy Lutomirski
2016-06-09 19:45       ` Nadav Amit
2016-09-06  1:22   ` Wanpeng Li
2016-01-08 23:15 ` [RFC 06/13] x86/mm: Disable PCID on 32-bit kernels Andy Lutomirski
2016-01-08 23:15 ` [RFC 07/13] x86/mm: Add nopcid to turn off PCID Andy Lutomirski
2016-01-08 23:15 ` [RFC 08/13] x86/mm: Teach CR3 readers about PCID Andy Lutomirski
2016-01-08 23:15 ` [RFC 09/13] x86/mm: Disable interrupts when flushing the TLB using CR3 Andy Lutomirski
2016-01-08 23:41   ` Linus Torvalds
2016-01-09  0:18     ` Andy Lutomirski
2016-01-09  2:20       ` Linus Torvalds
2016-01-11 10:51         ` Ingo Molnar
2016-01-13 23:32           ` Andy Lutomirski
2016-01-13 23:35         ` Andy Lutomirski
2016-01-13 23:43           ` Dave Hansen
2016-01-13 23:51             ` Andy Lutomirski
2016-01-13 23:56               ` Dave Hansen
2016-01-14  0:34                 ` Andy Lutomirski
2016-01-08 23:15 ` [RFC 10/13] x86/mm: Factor out remote TLB flushing Andy Lutomirski
2016-01-08 23:15 ` [RFC 11/13] x86/mm: Build arch/x86/mm/tlb.c even on !SMP Andy Lutomirski
2016-01-08 23:55   ` Dave Hansen
2016-01-08 23:15 ` [RFC 12/13] x86/mm: Uninline switch_mm Andy Lutomirski
2016-01-08 23:15 ` [RFC 13/13] x86/mm: Try to preserve old TLB entries using PCID Andy Lutomirski
2016-01-09  0:27   ` Dave Hansen
2016-01-09  2:19     ` Andy Lutomirski
2016-01-08 23:31 ` [RFC 00/13] x86/mm: PCID and INVPCID Linus Torvalds
2016-01-08 23:36   ` Andy Lutomirski
2016-01-08 23:42     ` Linus Torvalds

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=95a853538da28c64dfc877c60549ec79ed7a5d69.1452294700.git.luto@kernel.org \
    --to=luto@kernel.org \
    --cc=bp@alien8.de \
    --cc=brgerst@gmail.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=oleg@redhat.com \
    --cc=stable@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox