[PATCH v8 04/15] mm: handling Non-LRU pages returned by vm_normal_pages

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Alex Sierra <alex.sierra@amd.com>
To: <jgg@nvidia.com>
Cc: <david@redhat.com>, <Felix.Kuehling@amd.com>,
	<linux-mm@kvack.org>, <rcampbell@nvidia.com>,
	<linux-ext4@vger.kernel.org>, <linux-xfs@vger.kernel.org>,
	<amd-gfx@lists.freedesktop.org>,
	<dri-devel@lists.freedesktop.org>, <hch@lst.de>,
	<jglisse@redhat.com>, <apopple@nvidia.com>, <willy@infradead.org>,
	<akpm@linux-foundation.org>
Subject: [PATCH v8 04/15] mm: handling Non-LRU pages returned by vm_normal_pages
Date: Thu, 7 Jul 2022 14:03:38 -0500	[thread overview]
Message-ID: <20220707190349.9778-5-alex.sierra@amd.com> (raw)
In-Reply-To: <20220707190349.9778-1-alex.sierra@amd.com>

With DEVICE_COHERENT, we'll soon have vm_normal_page() return
device-managed anonymous pages that are not LRU pages. Although they
behave like normal pages for the purposes of mapping in CPU page tables
and for COW, they do not support LRU lists, NUMA migration or THP.

Callers of follow_page() currently don't expect ZONE_DEVICE pages;
however, with DEVICE_COHERENT it might now return ZONE_DEVICE pages.
Check for ZONE_DEVICE pages in applicable users of follow_page() as
well.

Signed-off-by: Alex Sierra <alex.sierra@amd.com>
Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> (v2)
Reviewed-by: Alistair Popple <apopple@nvidia.com> (v6)
---
 fs/proc/task_mmu.c |  2 +-
 mm/huge_memory.c   |  2 +-
 mm/khugepaged.c    |  9 ++++++---
 mm/ksm.c           |  6 +++---
 mm/madvise.c       |  4 ++--
 mm/memory.c        | 10 +++++++++-
 mm/mempolicy.c     |  2 +-
 mm/migrate.c       |  4 ++--
 mm/mlock.c         |  2 +-
 mm/mprotect.c      |  2 +-
 10 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2d04e3470d4c..2dd8c8a66924 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1792,7 +1792,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 		return NULL;
 
 	page = vm_normal_page(vma, addr, pte);
-	if (!page)
+	if (!page || is_zone_device_page(page))
 		return NULL;
 
 	if (PageReserved(page))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 834f288b3769..c47e95b02244 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2910,7 +2910,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 
 		if (IS_ERR(page))
 			continue;
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 
 		if (!is_transparent_hugepage(page))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 16be62d493cd..671ac7800e53 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -618,7 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 		page = vm_normal_page(vma, address, pteval);
-		if (unlikely(!page)) {
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
 			result = SCAN_PAGE_NULL;
 			goto out;
 		}
@@ -1267,7 +1267,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			writable = true;
 
 		page = vm_normal_page(vma, _address, pteval);
-		if (unlikely(!page)) {
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
 			result = SCAN_PAGE_NULL;
 			goto out_unmap;
 		}
@@ -1479,7 +1479,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 			goto abort;
 
 		page = vm_normal_page(vma, addr, *pte);
-
+		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+			page = NULL;
 		/*
 		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
 		 * page table, but the new page will not be a subpage of hpage.
@@ -1497,6 +1498,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 		if (pte_none(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
+		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+			goto abort;
 		page_remove_rmap(page, vma, false);
 	}
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 54f78c9eecae..831b18a7a50b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 		cond_resched();
 		page = follow_page(vma, addr,
 				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-		if (IS_ERR_OR_NULL(page))
+		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
 			break;
 		if (PageKsm(page))
 			ret = handle_mm_fault(vma, addr,
@@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 		goto out;
 
 	page = follow_page(vma, addr, FOLL_GET);
-	if (IS_ERR_OR_NULL(page))
+	if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
 		goto out;
 	if (PageAnon(page)) {
 		flush_anon_page(vma, page, addr);
@@ -2308,7 +2308,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 			if (ksm_test_exit(mm))
 				break;
 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
-			if (IS_ERR_OR_NULL(*page)) {
+			if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
 				ksm_scan.address += PAGE_SIZE;
 				cond_resched();
 				continue;
diff --git a/mm/madvise.c b/mm/madvise.c
index 0316bbc6441b..e252635fe935 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -421,7 +421,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			continue;
 
 		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 
 		/*
@@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		}
 
 		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 
 		/*
diff --git a/mm/memory.c b/mm/memory.c
index 7a089145cad4..4affde09cdd1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -624,6 +624,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		if (is_zero_pfn(pfn))
 			return NULL;
 		if (pte_devmap(pte))
+		/*
+		 * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
+		 * and will have refcounts incremented on their struct pages
+		 * when they are inserted into PTEs, thus they are safe to
+		 * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
+		 * do not have refcounts. Example of legacy ZONE_DEVICE is
+		 * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+		 */
 			return NULL;
 
 		print_bad_pte(vma, addr, pte, NULL);
@@ -4693,7 +4701,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	pte = pte_modify(old_pte, vma->vm_page_prot);
 
 	page = vm_normal_page(vma, vmf->address, pte);
-	if (!page)
+	if (!page || is_zone_device_page(page))
 		goto out_map;
 
 	/* TODO: handle PTE-mapped THP */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d39b01fd52fe..abc26890fc95 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -523,7 +523,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 		/*
 		 * vm_normal_page() filters out zero pages, but there might
diff --git a/mm/migrate.c b/mm/migrate.c
index 6c1ea61f39d8..a98a219d12ab 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1620,7 +1620,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 		goto out;
 
 	err = -ENOENT;
-	if (!page)
+	if (!page || is_zone_device_page(page))
 		goto out;
 
 	err = 0;
@@ -1810,7 +1810,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 		if (IS_ERR(page))
 			goto set_status;
 
-		if (page) {
+		if (page && !is_zone_device_page(page)) {
 			err = page_to_nid(page);
 			put_page(page);
 		} else {
diff --git a/mm/mlock.c b/mm/mlock.c
index 716caf851043..b14e929084cc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 		if (PageTransCompound(page))
 			continue;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ba5592655ee3..e034aae2a98b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -95,7 +95,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 					continue;
 
 				page = vm_normal_page(vma, addr, oldpte);
-				if (!page || PageKsm(page))
+				if (!page || is_zone_device_page(page) || PageKsm(page))
 					continue;
 
 				/* Also skip shared copy-on-write pages */
-- 
2.32.0

next prev parent reply	other threads:[~2022-07-07 19:04 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-07 19:03 [PATCH v8 00/15] Add MEMORY_DEVICE_COHERENT for coherent device memory mapping Alex Sierra
2022-07-07 19:03 ` [PATCH v8 01/15] mm: rename is_pinnable_pages to is_longterm_pinnable_pages Alex Sierra
2022-07-08 11:26   ` David Hildenbrand
2022-07-07 19:03 ` [PATCH v8 02/15] mm: move page zone helpers into new header-specific file Alex Sierra
2022-07-08 11:28   ` David Hildenbrand
2022-07-08 21:25     ` Felix Kuehling
2022-07-11 13:56       ` David Hildenbrand
2022-07-14 16:15       ` [PATCH] mm: move page zone helpers from mm.h to mmzone.h Alex Sierra
2022-07-07 19:03 ` [PATCH v8 03/15] mm: add zone device coherent type memory support Alex Sierra
2022-07-07 19:03 ` Alex Sierra [this message]
2022-07-07 19:03 ` [PATCH v8 05/15] mm: add device coherent vma selection for memory migration Alex Sierra
2022-07-07 19:03 ` [PATCH v8 06/15] mm: remove the vma check in migrate_vma_setup() Alex Sierra
2022-07-11 13:52   ` David Hildenbrand
2022-07-14  5:31     ` Alistair Popple
2022-07-07 19:03 ` [PATCH v8 07/15] mm/gup: migrate device coherent pages when pinning instead of failing Alex Sierra
2022-07-11 13:35   ` David Hildenbrand
2022-07-11 14:00     ` Matthew Wilcox
2022-07-11 14:00       ` David Hildenbrand
2022-07-15  2:11         ` [PATCH] " Alistair Popple
2022-07-15 14:12           ` Sierra Guiza, Alejandro (Alex)
2022-07-14  5:39     ` [PATCH v8 07/15] " Alistair Popple
2022-07-07 19:03 ` [PATCH v8 08/15] drm/amdkfd: add SPM support for SVM Alex Sierra
2022-07-07 19:03 ` [PATCH v8 09/15] lib: test_hmm add ioctl to get zone device type Alex Sierra
2022-07-07 19:03 ` [PATCH v8 10/15] lib: test_hmm add module param for " Alex Sierra
2022-07-07 19:03 ` [PATCH v8 11/15] lib: add support for device coherent type in test_hmm Alex Sierra
2022-07-07 19:03 ` [PATCH v8 12/15] tools: update hmm-test to support device coherent type Alex Sierra
2022-07-07 19:03 ` [PATCH v8 13/15] tools: update test_hmm script to support SP config Alex Sierra
2022-07-07 19:03 ` [PATCH v8 14/15] tools: add hmm gup tests for device coherent type Alex Sierra
2022-07-07 19:03 ` [PATCH v8 15/15] tools: add selftests to hmm for COW in device memory Alex Sierra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220707190349.9778-5-alex.sierra@amd.com \
    --to=alex.sierra@amd.com \
    --cc=Felix.Kuehling@amd.com \
    --cc=akpm@linux-foundation.org \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=apopple@nvidia.com \
    --cc=david@redhat.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=hch@lst.de \
    --cc=jgg@nvidia.com \
    --cc=jglisse@redhat.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=rcampbell@nvidia.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox