From: "Adalbert Lazăr" <alazar@bitdefender.com>
To: linux-mm@kvack.org
Cc: linux-api@vger.kernel.org,
"Andrew Morton" <akpm@linux-foundation.org>,
"Alexander Graf" <graf@amazon.com>,
"Stefan Hajnoczi" <stefanha@redhat.com>,
"Jerome Glisse" <jglisse@redhat.com>,
"Paolo Bonzini" <pbonzini@redhat.com>,
"Mihai Donțu" <mdontu@bitdefender.com>,
"Mircea Cirjaliu" <mcirjaliu@bitdefender.com>,
"Andy Lutomirski" <luto@kernel.org>,
"Arnd Bergmann" <arnd@arndb.de>,
"Sargun Dhillon" <sargun@sargun.me>,
"Aleksa Sarai" <cyphar@cyphar.com>,
"Oleg Nesterov" <oleg@redhat.com>, "Jann Horn" <jannh@google.com>,
"Kees Cook" <keescook@chromium.org>,
"Matthew Wilcox" <willy@infradead.org>,
"Christian Brauner" <christian.brauner@ubuntu.com>,
"Adalbert Lazăr" <alazar@bitdefender.com>
Subject: [RESEND RFC PATCH 2/5] mm: let the VMA decide how zap_pte_range() acts on mapped pages
Date: Fri, 4 Sep 2020 14:31:13 +0300
Message-ID: <20200904113116.20648-3-alazar@bitdefender.com>
In-Reply-To: <20200904113116.20648-1-alazar@bitdefender.com>
From: Mircea Cirjaliu <mcirjaliu@bitdefender.com>
Instead of having one big function that handles every case of page unmapping,
provide implementation-defined callbacks, one per VMA type: zap_pte_range()
now delegates to zap_pte_common() by default, and a special VMA can override
it through the new vm_ops->zap_pte callback. In the future, exotic VMA
implementations won't have to bloat the common zapping function with yet
another kind of mapping.
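As a rough illustration (not part of this patch), a special mapping could
hook the new callback as sketched below. The names my_vma_zap_pte and
my_vm_ops are hypothetical, and the rss[]/rmap accounting that
zap_pte_common() performs for ordinary pages is omitted for brevity:

    #include <linux/mm.h>
    #include <linux/pgtable.h>
    #include <asm/tlb.h>

    /* Hypothetical per-VMA zap handler for a special mapping. */
    static int my_vma_zap_pte(struct vm_area_struct *vma, unsigned long addr,
                              pte_t *pte, int rss[], struct mmu_gather *tlb,
                              struct zap_details *details)
    {
            pte_t ptent = *pte;

            if (pte_none(ptent))
                    return ZAP_PTE_CONTINUE;        /* nothing mapped here */

            /* Clear the entry and queue a TLB invalidation for it. */
            ptent = ptep_get_and_clear_full(vma->vm_mm, addr, pte, tlb->fullmm);
            tlb_remove_tlb_entry(tlb, pte, addr);

            /*
             * A full implementation would also update rss[] and the rmap,
             * as zap_pte_common() does. Returning ZAP_PTE_FLUSH asks
             * zap_pte_range() to flush the TLB before dropping the ptl.
             */
            return pte_dirty(ptent) ? ZAP_PTE_FLUSH : ZAP_PTE_CONTINUE;
    }

    static const struct vm_operations_struct my_vm_ops = {
            .zap_pte = my_vma_zap_pte,
    };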
Signed-off-by: Mircea Cirjaliu <mcirjaliu@bitdefender.com>
Signed-off-by: Adalbert Lazăr <alazar@bitdefender.com>
---
include/linux/mm.h | 16 ++++
mm/memory.c | 182 +++++++++++++++++++++++++--------------------
2 files changed, 116 insertions(+), 82 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1be4482a7b81..39e55467aa49 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -36,6 +36,7 @@ struct file_ra_state;
struct user_struct;
struct writeback_control;
struct bdi_writeback;
+struct zap_details;
void init_mm_internals(void);
@@ -601,6 +602,14 @@ struct vm_operations_struct {
*/
struct page *(*find_special_page)(struct vm_area_struct *vma,
unsigned long addr);
+
+ /*
+ * Called by zap_pte_range() for use by special VMAs that implement
+ * custom zapping behavior.
+ */
+ int (*zap_pte)(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *pte, int rss[], struct mmu_gather *tlb,
+ struct zap_details *details);
};
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
@@ -1594,6 +1603,13 @@ static inline bool can_do_mlock(void) { return false; }
extern int user_shm_lock(size_t, struct user_struct *);
extern void user_shm_unlock(size_t, struct user_struct *);
+/*
+ * Flags returned by zap_pte implementations
+ */
+#define ZAP_PTE_CONTINUE 0
+#define ZAP_PTE_FLUSH (1 << 0) /* Ask for TLB flush. */
+#define ZAP_PTE_BREAK (1 << 1) /* Break PTE iteration. */
+
/*
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
diff --git a/mm/memory.c b/mm/memory.c
index 8e78fb151f8f..a225bfd01417 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1031,18 +1031,109 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return ret;
}
+static int zap_pte_common(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *pte, int rss[], struct mmu_gather *tlb,
+ struct zap_details *details)
+{
+ struct mm_struct *mm = tlb->mm;
+ pte_t ptent = *pte;
+ swp_entry_t entry;
+ int flags = 0;
+
+ if (pte_present(ptent)) {
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (unlikely(details) && page) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping &&
+ details->check_mapping != page_rmapping(page))
+ return 0;
+ }
+ ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ if (unlikely(!page))
+ return 0;
+
+ if (!PageAnon(page)) {
+ if (pte_dirty(ptent)) {
+ flags |= ZAP_PTE_FLUSH;
+ set_page_dirty(page);
+ }
+ if (pte_young(ptent) &&
+ likely(!(vma->vm_flags & VM_SEQ_READ)))
+ mark_page_accessed(page);
+ }
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ if (unlikely(__tlb_remove_page(tlb, page)))
+ flags |= ZAP_PTE_FLUSH | ZAP_PTE_BREAK;
+ return flags;
+ }
+
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ struct page *page = device_private_entry_to_page(entry);
+
+ if (unlikely(details && details->check_mapping)) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping != page_rmapping(page))
+ return 0;
+ }
+
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
+ put_page(page);
+ return 0;
+ }
+
+ /* If details->check_mapping, we leave swap entries. */
+ if (unlikely(details))
+ return 0;
+
+ if (!non_swap_entry(entry))
+ rss[MM_SWAPENTS]--;
+ else if (is_migration_entry(entry)) {
+ struct page *page;
+
+ page = migration_entry_to_page(entry);
+ rss[mm_counter(page)]--;
+ }
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+
+ return flags;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
- int force_flush = 0;
+ int flags = 0;
int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
- swp_entry_t entry;
+
+ int (*zap_pte)(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *pte, int rss[], struct mmu_gather *tlb,
+ struct zap_details *details) = zap_pte_common;
+ if (vma->vm_ops && vma->vm_ops->zap_pte)
+ zap_pte = vma->vm_ops->zap_pte;
tlb_change_page_size(tlb, PAGE_SIZE);
again:
@@ -1058,92 +1149,19 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (!zap_is_atomic(details) && need_resched())
break;
-
- if (pte_present(ptent)) {
- struct page *page;
-
- page = vm_normal_page(vma, addr, ptent);
- if (unlikely(details) && page) {
- /*
- * unmap_shared_mapping_pages() wants to
- * invalidate cache without truncating:
- * unmap shared but keep private pages.
- */
- if (details->check_mapping &&
- details->check_mapping != page_rmapping(page))
- continue;
- }
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- tlb_remove_tlb_entry(tlb, pte, addr);
- if (unlikely(!page))
- continue;
-
- if (!PageAnon(page)) {
- if (pte_dirty(ptent)) {
- force_flush = 1;
- set_page_dirty(page);
- }
- if (pte_young(ptent) &&
- likely(!(vma->vm_flags & VM_SEQ_READ)))
- mark_page_accessed(page);
- }
- rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
- if (unlikely(page_mapcount(page) < 0))
- print_bad_pte(vma, addr, ptent, page);
- if (unlikely(__tlb_remove_page(tlb, page))) {
- force_flush = 1;
- addr += PAGE_SIZE;
- break;
- }
- continue;
- }
-
- entry = pte_to_swp_entry(ptent);
- if (non_swap_entry(entry) && is_device_private_entry(entry)) {
- struct page *page = device_private_entry_to_page(entry);
-
- if (unlikely(details && details->check_mapping)) {
- /*
- * unmap_shared_mapping_pages() wants to
- * invalidate cache without truncating:
- * unmap shared but keep private pages.
- */
- if (details->check_mapping !=
- page_rmapping(page))
- continue;
- }
-
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
- put_page(page);
- continue;
+ if (flags & ZAP_PTE_BREAK) {
+ flags &= ~ZAP_PTE_BREAK;
+ break;
}
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
- continue;
-
- if (!non_swap_entry(entry))
- rss[MM_SWAPENTS]--;
- else if (is_migration_entry(entry)) {
- struct page *page;
-
- page = migration_entry_to_page(entry);
- rss[mm_counter(page)]--;
- }
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ flags |= zap_pte(vma, addr, pte, rss, tlb, details);
} while (pte++, addr += PAGE_SIZE, addr != end);
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
/* Do the actual TLB flush before dropping ptl */
- if (force_flush)
+ if (flags & ZAP_PTE_FLUSH)
tlb_flush_mmu_tlbonly(tlb);
pte_unmap_unlock(start_pte, ptl);
@@ -1153,8 +1171,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
* entries before releasing the ptl), free the batched
* memory too. Restart if we didn't do everything.
*/
- if (force_flush) {
- force_flush = 0;
+ if (flags & ZAP_PTE_FLUSH) {
+ flags &= ~ZAP_PTE_FLUSH;
tlb_flush_mmu(tlb);
}