[rfc] lockless get_user_pages for dio (and more)

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Nick Piggin <nickpiggin@yahoo.com.au>
To: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: Ken Chen <kenchen@google.com>,
	Badari Pulavarty <pbadari@gmail.com>,
	linux-mm <linux-mm@kvack.org>,
	tony.luck@intel.com
Subject: [rfc] lockless get_user_pages for dio (and more)
Date: Sun, 14 Oct 2007 11:01:02 +1000	[thread overview]
Message-ID: <200710141101.02649.nickpiggin@yahoo.com.au> (raw)
In-Reply-To: <200710140927.46478.nickpiggin@yahoo.com.au>

[-- Attachment #1: Type: text/plain, Size: 629 bytes --]

On Sunday 14 October 2007 09:27, Nick Piggin wrote:
> On Saturday 13 October 2007 06:34, Siddha, Suresh B wrote:

> > sounds like two birds in one shot, I think.
>
> OK, I'll flesh it out a bit more and see if I can actually get
> something working (and working with hugepages too).

This is just a really quick hack, untested ATM, but one that
has at least a chance of working (on x86).

I don't know if I've got the hugepage walk exactly right,
because I've never really done much practical work on that
side of things.

Hmm, I guess we also want some instrumentation to ensure that
we aren't often dropping into the slowpath.

[-- Attachment #2: mm-get_user_pages-fast.patch --]
[-- Type: text/x-diff, Size: 9917 bytes --]

Introduce a new "fast_gup" (for want of a better name right now) which
is basically a get_user_pages with a less general API that is more suited
to the common case.

- task and mm are always current and current->mm
- force is always 0
- pages is always non-NULL
- don't pass back vmas

This allows (at least on x86), an optimistic lockless pagetable walk,
without taking any page table locks or even mmap_sem. Page table existence
is guaranteed by turning interrupts off (combined with the fact that we're
always looking up the current mm, which would need an IPI before its
pagetables could be shot down from another CPU).

Many other architectures could do the same thing. Those that don't IPI
could potentially RCU free the page tables and do speculative references
on the pages (a la lockless pagecache) to achieve a lockless fast_gup.


---
Index: linux-2.6/arch/x86/lib/Makefile_64
===================================================================
--- linux-2.6.orig/arch/x86/lib/Makefile_64
+++ linux-2.6/arch/x86/lib/Makefile_64
@@ -10,4 +10,4 @@ obj-$(CONFIG_SMP)	+= msr-on-cpu.o
 lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
 	usercopy_64.o getuser_64.o putuser_64.o  \
 	thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
-lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o gup.o
Index: linux-2.6/arch/x86/lib/gup.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/x86/lib/gup.c
@@ -0,0 +1,144 @@
+/*
+ * Lockless fast_gup for x86
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	pte_t *ptep;
+
+	ptep = (pte_t *)pmd_page_vaddr(pmd) + pte_index(addr);
+	do {
+		pte_t pte = *ptep;
+		struct page *page;
+
+		if (pte_none(pte) || !pte_present(pte))
+			return 0;
+
+		if (write && !pte_write(pte))
+			return 0;
+
+		page = pte_page(pte);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep);
+
+	return 1;
+}
+
+static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	pte_t pte = *(pte_t *)&pmd;
+
+	if (write && !pte_write(pte))
+		return 0;
+
+	do {
+		unsigned long pfn_offset;
+		struct page *page;
+
+		pfn_offset = (addr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		page = pte_page(pte) + pfn_offset;
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (addr += PAGE_SIZE, addr != end);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = (pmd_t *)pud_page_vaddr(pud) + pmd_index(addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (pmd_large(pmd))
+			gup_huge_pmd(pmd, addr, next, write, pages, nr);
+		else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = (pud_t *)pgd_page_vaddr(pgd) + pud_index(addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int fast_gup(unsigned long start, unsigned long end, int write,
+		struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	/* XXX: batch / limit 'nr', to avoid huge latency */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables from being freed on x86-64. XXX: hugepages!
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86-64), we can follow the address down to the
+	 * the page.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+slow:
+	{
+		int ret;
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start, (end - start) >> PAGE_SHIFT,
+					write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+		return ret;
+	}
+}
Index: linux-2.6/include/asm-x86/uaccess_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/uaccess_64.h
+++ linux-2.6/include/asm-x86/uaccess_64.h
@@ -381,4 +381,8 @@ static inline int __copy_from_user_inato
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
+struct page;
+int fast_gup(unsigned long start, unsigned long end, int write,
+		struct page **pages);
+
 #endif /* __X86_64_UACCESS_H */
Index: linux-2.6/fs/bio.c
===================================================================
--- linux-2.6.orig/fs/bio.c
+++ linux-2.6/fs/bio.c
@@ -646,12 +646,8 @@ static struct bio *__bio_map_user_iov(st
 		const int local_nr_pages = end - start;
 		const int page_limit = cur_page + local_nr_pages;
 		
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, uaddr,
-				     local_nr_pages,
-				     write_to_vm, 0, &pages[cur_page], NULL);
-		up_read(&current->mm->mmap_sem);
-
+		ret = fast_gup(uaddr, local_nr_pages << PAGE_SHIFT, write_to_vm,
+				&pages[cur_page]);
 		if (ret < local_nr_pages) {
 			ret = -EFAULT;
 			goto out_unmap;
Index: linux-2.6/fs/block_dev.c
===================================================================
--- linux-2.6.orig/fs/block_dev.c
+++ linux-2.6/fs/block_dev.c
@@ -221,10 +221,8 @@ static struct page *blk_get_page(unsigne
 	if (pvec->idx == pvec->nr) {
 		nr_pages = PAGES_SPANNED(addr, count);
 		nr_pages = min(nr_pages, VEC_SIZE);
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, addr, nr_pages,
-				     rw == READ, 0, pvec->page, NULL);
-		up_read(&current->mm->mmap_sem);
+		ret = fast_gup(addr, nr_pages << PAGE_SHIFT, rw == READ,
+				pvec->page);
 		if (ret < 0)
 			return ERR_PTR(ret);
 		pvec->nr = ret;
Index: linux-2.6/fs/direct-io.c
===================================================================
--- linux-2.6.orig/fs/direct-io.c
+++ linux-2.6/fs/direct-io.c
@@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *
 	int nr_pages;
 
 	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(
-		current,			/* Task for fault acounting */
-		current->mm,			/* whose pages? */
+	ret = fast_gup(
 		dio->curr_user_address,		/* Where from? */
-		nr_pages,			/* How many pages? */
+		nr_pages << PAGE_SHIFT,		/* Where to? */
 		dio->rw == READ,		/* Write to memory? */
-		0,				/* force (?) */
-		&dio->pages[0],
-		NULL);				/* vmas */
-	up_read(&current->mm->mmap_sem);
+		&dio->pages[0]);		/* Put results here */
 
 	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
Index: linux-2.6/fs/splice.c
===================================================================
--- linux-2.6.orig/fs/splice.c
+++ linux-2.6/fs/splice.c
@@ -1224,33 +1224,6 @@ static long do_splice(struct file *in, l
 }
 
 /*
- * Do a copy-from-user while holding the mmap_semaphore for reading, in a
- * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
- * for writing) and page faulting on the user memory pointed to by src.
- * This assumes that we will very rarely hit the partial != 0 path, or this
- * will not be a win.
- */
-static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
-{
-	int partial;
-
-	pagefault_disable();
-	partial = __copy_from_user_inatomic(dst, src, n);
-	pagefault_enable();
-
-	/*
-	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
-	 */
-	if (unlikely(partial)) {
-		up_read(&current->mm->mmap_sem);
-		partial = copy_from_user(dst, src, n);
-		down_read(&current->mm->mmap_sem);
-	}
-
-	return partial;
-}
-
-/*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
  * our ones pages[] map instead of splitting that operation into pieces.
@@ -1263,8 +1236,6 @@ static int get_iovec_page_array(const st
 {
 	int buffers = 0, error = 0;
 
-	down_read(&current->mm->mmap_sem);
-
 	while (nr_vecs) {
 		unsigned long off, npages;
 		struct iovec entry;
@@ -1273,7 +1244,7 @@ static int get_iovec_page_array(const st
 		int i;
 
 		error = -EFAULT;
-		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
+		if (copy_from_user(&entry, iov, sizeof(entry)))
 			break;
 
 		base = entry.iov_base;
@@ -1307,9 +1278,8 @@ static int get_iovec_page_array(const st
 		if (npages > PIPE_BUFFERS - buffers)
 			npages = PIPE_BUFFERS - buffers;
 
-		error = get_user_pages(current, current->mm,
-				       (unsigned long) base, npages, 0, 0,
-				       &pages[buffers], NULL);
+		error = fast_gup((unsigned long) base, npages << PAGE_SHIFT, 0,
+				       &pages[buffers]);
 
 		if (unlikely(error <= 0))
 			break;
@@ -1348,8 +1318,6 @@ static int get_iovec_page_array(const st
 		iov++;
 	}
 
-	up_read(&current->mm->mmap_sem);
-
 	if (buffers)
 		return buffers;

next prev parent reply	other threads:[~2007-10-14  1:01 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-10-08 22:52 [rfc] more granular page table lock for hugepages Siddha, Suresh B
2007-10-09 20:23 ` Ken Chen
2007-10-09 21:05   ` Badari Pulavarty
2007-10-10  0:15     ` Siddha, Suresh B
2007-10-10  6:10       ` Ken Chen
2007-10-10  7:50         ` Ken Chen
2007-10-11 11:39           ` Nick Piggin
2007-10-12 20:34             ` Siddha, Suresh B
2007-10-13 23:27               ` Nick Piggin
2007-10-14  1:01                 ` Nick Piggin [this message]
2007-10-14 18:19                   ` [rfc] lockless get_user_pages for dio (and more) Siddha, Suresh B
2007-10-15  4:15                     ` Nick Piggin
2007-10-15 12:25                     ` Nick Piggin
2007-10-15 17:03                       ` Badari Pulavarty
2007-10-15 17:49                         ` Siddha, Suresh B
2007-10-15 17:54                       ` Siddha, Suresh B
2007-10-15 20:21                       ` Ken Chen
2007-10-16  2:15                         ` Nick Piggin
2007-10-16  0:14                           ` Dave Hansen
2007-10-16  3:26                             ` Nick Piggin
2007-10-16  3:32                           ` Nick Piggin
2007-12-10 21:30                       ` Dave Kleikamp
2007-12-12  4:57                         ` Nick Piggin
2007-12-12  5:11                           ` Dave Kleikamp
2007-12-12  5:40                             ` Nick Piggin
2008-01-16 19:58                               ` Dave Kleikamp
2008-01-17  6:34                                 ` Nick Piggin
2008-01-24  7:06                                 ` Nick Piggin
2007-10-14 15:42                 ` [rfc] more granular page table lock for hugepages Siddha, Suresh B
2007-10-15  4:17                   ` Nick Piggin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200710141101.02649.nickpiggin@yahoo.com.au \
    --to=nickpiggin@yahoo.com.au \
    --cc=kenchen@google.com \
    --cc=linux-mm@kvack.org \
    --cc=pbadari@gmail.com \
    --cc=suresh.b.siddha@intel.com \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox