From: Dan Williams <dan.j.williams@intel.com>
To: akpm@linux-foundation.org
Cc: Andrea Arcangeli <aarcange@redhat.com>,
Dave Hansen <dave@sr71.net>,
linux-nvdimm@lists.01.org, Peter Zijlstra <peterz@infradead.org>,
x86@kernel.org, linux-mm@kvack.org,
Ingo Molnar <mingo@redhat.com>, Mel Gorman <mgorman@suse.de>,
"H. Peter Anvin" <hpa@zytor.com>,
Thomas Gleixner <tglx@linutronix.de>,
Logan Gunthorpe <logang@deltatee.com>
Subject: [-mm PATCH v2 23/25] mm, x86: get_user_pages() for dax mappings
Date: Wed, 09 Dec 2015 18:39:16 -0800
Message-ID: <20151210023916.30368.94401.stgit@dwillia2-desk3.jf.intel.com>
In-Reply-To: <20151210023708.30368.92962.stgit@dwillia2-desk3.jf.intel.com>
A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver
has established a devm_memremap_pages() mapping, i.e. when the pfn_t
returned from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when
encountering _PAGE_DEVMAP during a page table walk, we look up and pin a
struct dev_pagemap instance to keep the result of pfn_to_page() valid
until put_page().
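
For context, the user-visible capability is that memory mmap'd from a dax
file can now be handed to kernel paths that pin pages via
get_user_pages(), for example as the buffer of an O_DIRECT block device
read. Below is a minimal userspace sketch of that scenario (illustrative
only, not part of the patch; the file and device paths are hypothetical
and error handling is abbreviated):

/*
 * Hypothetical example: use a dax mapping as an O_DIRECT buffer so that
 * the block layer exercises get_user_pages() against _PAGE_DEVMAP ptes.
 * Assumes /mnt/dax is a dax-capable filesystem mount and /dev/sdb is a
 * block device that accepts O_DIRECT reads; adjust paths as needed.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2UL << 20;	/* 2MB, enough to cover a pmd mapping */
	int dax_fd = open("/mnt/dax/file", O_RDWR | O_CREAT, 0600);
	int blk_fd = open("/dev/sdb", O_RDONLY | O_DIRECT);
	void *buf;

	if (dax_fd < 0 || blk_fd < 0 || ftruncate(dax_fd, len))
		return EXIT_FAILURE;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, dax_fd, 0);
	if (buf == MAP_FAILED)
		return EXIT_FAILURE;

	/*
	 * read() on an O_DIRECT fd pins 'buf' with get_user_pages(); with
	 * this patch the dax ptes/pmds backing 'buf' resolve to pinned
	 * ZONE_DEVICE pages instead of failing the gup path.
	 */
	if (read(blk_fd, buf, len) < 0)
		perror("read");

	munmap(buf, len);
	close(blk_fd);
	close(dax_fd);
	return EXIT_SUCCESS;
}
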
Cc: Dave Hansen <dave@sr71.net>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: x86@kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Tested-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
arch/x86/mm/gup.c | 56 ++++++++++++++++++++++++++++++++++--
include/linux/huge_mm.h | 10 ++++++
include/linux/mm.h | 35 ++++++++++++++++------
mm/gup.c | 18 +++++++++++
mm/huge_memory.c | 74 +++++++++++++++++++++++++++++++++++++----------
mm/swap.c | 15 ++++++++++
6 files changed, 178 insertions(+), 30 deletions(-)
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index f8cb3e8ac250..26602434c33a 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -63,6 +63,16 @@ retry:
#endif
}
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+ while ((*nr) - nr_start) {
+ struct page *page = pages[--(*nr)];
+
+ ClearPageReferenced(page);
+ put_page(page);
+ }
+}
+
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
@@ -71,7 +81,9 @@ retry:
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
+ struct dev_pagemap *pgmap = NULL;
unsigned long mask;
+ int nr_start = *nr;
pte_t *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +101,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
return 0;
}
- if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ page = pte_page(pte);
+ if (pte_devmap(pte)) {
+ pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ pte_unmap(ptep);
+ return 0;
+ }
+ } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
get_page(page);
+ put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -114,6 +134,32 @@ static inline void get_head_page_multiple(struct page *page, int nr)
SetPageReferenced(page);
}
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ int nr_start = *nr;
+ unsigned long pfn = pmd_pfn(pmd);
+ struct dev_pagemap *pgmap = NULL;
+
+ pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ do {
+ struct page *page = pfn_to_page(pfn);
+
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ get_page(page);
+ put_dev_pagemap(pgmap);
+ (*nr)++;
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 1;
+}
+
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
@@ -126,9 +172,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
mask |= _PAGE_RW;
if ((pmd_flags(pmd) & mask) != mask)
return 0;
+
+ VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
+ if (pmd_devmap(pmd))
+ return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
+
/* hugepages are never "special" */
VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
- VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
refs = 0;
head = pmd_page(pmd);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 40c4db60c9e0..bc141a65b736 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -38,7 +38,6 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
int prot_numa);
int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
pfn_t pfn, bool write);
-
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -55,6 +54,9 @@ enum transparent_hugepage_flag {
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, int flags);
+
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
@@ -205,6 +207,12 @@ static inline bool is_huge_zero_page(struct page *page)
return false;
}
+
+static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd, int flags)
+{
+ return NULL;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d8d4c9ffa51f..26ce7954bad0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -460,16 +460,7 @@ static inline int page_count(struct page *page)
return atomic_read(&compound_head(page)->_count);
}
-static inline void get_page(struct page *page)
-{
- page = compound_head(page);
- /*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
- */
- VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
- atomic_inc(&page->_count);
-}
+extern bool __get_page_tail(struct page *page);
static inline struct page *virt_to_head_page(const void *x)
{
@@ -762,6 +753,11 @@ struct dev_pagemap {
void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap);
struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
+
+static inline bool is_zone_device_page(const struct page *page)
+{
+ return page_zonenum(page) == ZONE_DEVICE;
+}
#else
static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *ref,
@@ -780,6 +776,11 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
{
return NULL;
}
+
+static inline bool is_zone_device_page(const struct page *page)
+{
+ return false;
+}
#endif
#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
@@ -830,6 +831,20 @@ static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
percpu_ref_put(pgmap->ref);
}
+static inline void get_page(struct page *page)
+{
+ if (is_zone_device_page(page))
+ percpu_ref_get(page->pgmap->ref);
+
+ page = compound_head(page);
+ /*
+ * Getting a normal page or the head of a compound page
+ * requires to already have an elevated page->_count.
+ */
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
+ atomic_inc(&page->_count);
+}
+
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
diff --git a/mm/gup.c b/mm/gup.c
index e95b0cb6ed81..60b86f2fbe95 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -98,7 +98,16 @@ retry:
}
page = vm_normal_page(vma, address, pte);
- if (unlikely(!page)) {
+ if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+ /*
+ * Only return device mapping pages in the FOLL_GET case since
+ * they are only valid while holding the pgmap reference.
+ */
+ if (get_dev_pagemap(pte_pfn(pte), NULL))
+ page = pte_page(pte);
+ else
+ goto no_page;
+ } else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
@@ -237,6 +246,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
+ if (pmd_devmap(*pmd)) {
+ ptl = pmd_lock(mm, pmd);
+ page = follow_devmap_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
if (likely(!pmd_trans_huge(*pmd)))
return follow_page_pte(vma, address, pmd, flags);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c9239e17a81c..76ccead6cf2c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1002,6 +1002,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
return VM_FAULT_NOPAGE;
}
+static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ pmd_t _pmd;
+
+ /*
+ * We should set the dirty bit only for FOLL_WRITE but for now
+ * the dirty bit in the pmd is meaningless. And if the dirty
+ * bit will become meaningful and we'll only set it with
+ * FOLL_WRITE, an atomic set_bit will be required on the pmd to
+ * set the young bit, instead of the current set_pmd_at.
+ */
+ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+ pmd, _pmd, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
+}
+
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, int flags)
+{
+ unsigned long pfn = pmd_pfn(*pmd);
+ struct mm_struct *mm = vma->vm_mm;
+ struct dev_pagemap *pgmap;
+ struct page *page;
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ return NULL;
+
+ if (pmd_present(*pmd) && pmd_devmap(*pmd))
+ /* pass */;
+ else
+ return NULL;
+
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd);
+
+ /*
+ * device mapped pages can only be returned if the
+ * caller will manage the page reference count.
+ */
+ if (!(flags & FOLL_GET))
+ return ERR_PTR(-EEXIST);
+
+ pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ERR_PTR(-EFAULT);
+ page = pfn_to_page(pfn);
+ get_page(page);
+ put_dev_pagemap(pgmap);
+
+ return page;
+}
+
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
@@ -1359,21 +1416,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
- if (flags & FOLL_TOUCH) {
- pmd_t _pmd;
- /*
- * We should set the dirty bit only for FOLL_WRITE but
- * for now the dirty bit in the pmd is meaningless.
- * And if the dirty bit will become meaningful and
- * we'll only set it with FOLL_WRITE, an atomic
- * set_bit will be required on the pmd to set the
- * young bit, instead of the current set_pmd_at.
- */
- _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
- if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, 1))
- update_mmu_cache_pmd(vma, addr, pmd);
- }
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd);
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* We don't mlock() pte-mapped THPs. This way we can avoid
diff --git a/mm/swap.c b/mm/swap.c
index 674e2c93da4e..34b0c34e231f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -90,10 +90,25 @@ static void __put_compound_page(struct page *page)
(*dtor)(page);
}
+static bool put_device_page(struct page *page)
+{
+ /*
+ * ZONE_DEVICE pages are never "onlined" so their reference
+ * counts never reach zero. They are always owned by a device
+ * driver, not the mm core. I.e. the page is 'idle' when the
+ * count is 1.
+ */
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) == 1, page);
+ put_dev_pagemap(page->pgmap);
+ return atomic_dec_return(&page->_count) == 1;
+}
+
void __put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
__put_compound_page(page);
+ else if (is_zone_device_page(page))
+ put_device_page(page);
else
__put_single_page(page);
}