linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Avi Kivity <avi@qumranet.com>
To: lkml@qumranet.com, linux-mm@kvack.org
Cc: shaohua.li@intel.com, kvm@qumranet.com,
	general@lists.openfabrics.org, Avi Kivity <avi@qumranet.com>
Subject: [PATCH][RFC]: pte notifiers -- support for external page tables
Date: Wed,  5 Sep 2007 19:38:48 +0300	[thread overview]
Message-ID: <11890103283456-git-send-email-avi@qumranet.com> (raw)

Some hardware and software systems maintain page tables outside the normal
Linux page tables, which reference userspace memory.  This includes
Infiniband, other RDMA-capable devices, and kvm (with a pending patch).

Because these systems maintain external page tables (and external tlbs),
Linux cannot demand page this memory and it must be locked.  For kvm at
least, this is a significant reduction in functionality.

This sample patch adds a new mechanism, pte notifiers, that allows drivers
to register an interest in a changes to ptes. Whenever Linux changes a
pte, it will call a notifier to allow the driver to adjust the external
page table and flush its tlb.

Note that only one notifier is implemented, ->clear(), but others should be
similar.

pte notifiers are different from paravirt_ops: they extend the normal
page tables rather than replace them; and they provide high-level information
such as the vma and the virtual address for the driver to use.

Signed-off-by: Avi Kivity <avi@qumranet.com>

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 655094d..5d2bbee 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -14,6 +14,7 @@
 #include <linux/debug_locks.h>
 #include <linux/backing-dev.h>
 #include <linux/mm_types.h>
+#include <linux/pte_notifier.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -108,6 +109,9 @@ struct vm_area_struct {
 #ifndef CONFIG_MMU
 	atomic_t vm_usage;		/* refcount (VMAs shared if !MMU) */
 #endif
+#ifdef CONFIG_PTE_NOTIFIERS
+	struct list_head pte_notifier_list;
+#endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
diff --git a/include/linux/pte_notifier.h b/include/linux/pte_notifier.h
new file mode 100644
index 0000000..d28832b
--- /dev/null
+++ b/include/linux/pte_notifier.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_PTE_NOTIFIER_H
+#define _LINUX_PTE_NOTIFIER_H
+
+#include <linux/list.h>
+
+struct vm_area_struct;
+
+#ifdef CONFIG_PTE_NOTIFIERS
+
+struct pte_notifier;
+
+struct pte_notifier_ops {
+	void (*close)(struct pte_notifier *pn, struct vm_area_struct *vma);
+	void (*clear)(struct pte_notifier *pn, struct vm_area_struct *vma,
+		      unsigned long address);
+};
+
+struct pte_notifier {
+	struct list_head link;
+	const struct pte_notifier_ops *ops;
+};
+
+
+void vma_init_pte_notifiers(struct vm_area_struct *vma);
+void vma_close_pte_notifiers(struct vm_area_struct *vma);
+void pte_notifier_register(struct pte_notifier *pn,
+			   struct vm_area_struct *vma);
+void pte_notifier_unregister(struct pte_notifier *pn);
+
+#define pte_notifier_call(vma, function, args...)			\
+	do {								\
+		struct pte_notifier *__pn;				\
+									\
+		list_for_each_entry(__pn, &vma->pte_notifier_list, link) \
+			__pn->ops->function(__pn, vma, args);		\
+	} while (0)
+
+#else
+
+static inline void vma_init_pte_notifiers(struct vm_area_struct *vma) {}
+static inline void vma_close_pte_notifiers(struct vm_area_struct *vma) {}
+static inline void pte_notifier_register(struct pte_notifier *pn,
+					 struct vm_area_struct *vma) {}
+static inline void pte_notifier_unregister(struct pte_notifier *pn) {}
+
+#define pte_notifier_call(vma, function, args...) \
+	do { } while (0)
+
+#endif
+
+
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index e24d348..7b10151 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -176,3 +176,6 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config PTE_NOTIFIERS
+       bool
diff --git a/mm/Makefile b/mm/Makefile
index 245e33a..59f6a03 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,4 +29,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_PTE_NOTIFIERS) += pte_notifiers.o
 
diff --git a/mm/mmap.c b/mm/mmap.c
index b653721..cc6c4fe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1134,6 +1134,7 @@ munmap_back:
 	vma->vm_page_prot = protection_map[vm_flags &
 				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma->vm_pgoff = pgoff;
+	vma_init_pte_notifiers(vma);
 
 	if (file) {
 		error = -EINVAL;
diff --git a/mm/pte_notifier.c b/mm/pte_notifier.c
new file mode 100644
index 0000000..0b9076c
--- /dev/null
+++ b/mm/pte_notifier.c
@@ -0,0 +1,32 @@
+
+#include <linux/pte_notifier.h>
+
+void vma_init_pte_notifiers(struct vm_area_struct *vma)
+{
+	INIT_LIST_HEAD(&vma->pte_notifier_list);
+}
+EXPORT_SYMBOL_GPL(vma_init_pte_notifiers);
+
+void vma_destroy_pte_notifiers(struct vm_area_struct *vma)
+{
+	struct pte_notifier *pn;
+	struct list_head *n;
+
+	list_for_each_entry_safe(pn, n, &vma->pte_notifier_list, link) {
+		pn->ops->close(__pn, vma);
+		__list_del(n);
+	}
+}
+
+void pte_notifier_register(struct pte_notifier *pn, struct vm_area_struct *vma)
+{
+	list_add(&pn->link, &vma->pte_notifier_list);
+}
+EXPORT_SYMBOL_GPL(pte_notifier_register);
+
+void pte_notifier_unregister(struct pte_notifier *pn)
+{
+	list_del(&pn->link);
+}
+EXPORT_SYMBOL_GPL(pte_notifier_unregister);
+
diff --git a/mm/rmap.c b/mm/rmap.c
index 41ac397..3f61d38 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -682,6 +682,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	}
 
 	/* Nuke the page table entry. */
+	pte_notifier_call(vma, clear, address);
 	flush_cache_page(vma, address, page_to_pfn(page));
 	pteval = ptep_clear_flush(vma, address, pte);
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

             reply	other threads:[~2007-09-05 16:38 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-09-05 16:38 Avi Kivity [this message]
2007-09-05 19:05 ` Rik van Riel
2007-09-05 19:14   ` Avi Kivity
2007-09-05 19:23     ` Rik van Riel
2007-09-05 19:32       ` Avi Kivity
2007-09-06 11:28         ` Jeremy Fitzhardinge
2007-09-05 20:40 ` Jack Steiner
2007-09-05 20:40   ` Avi Kivity
2007-09-05 20:42   ` Avi Kivity
2007-09-06  6:24 ` [ofa-general] " Gleb Natapov
2007-09-06  8:35   ` Avi Kivity
2007-09-06  8:41     ` Gleb Natapov
2007-09-10 18:17     ` Andrew Hastings
2007-09-11 10:37       ` Daniel J Blueman
2007-09-11 11:19         ` Gleb Natapov
2007-09-05 19:32 [PATCH][RFC] " Avi Kivity
2007-09-06  4:28 ` Shaohua Li
2007-09-06  8:38   ` Avi Kivity
2007-09-06 13:28 ` Andi Kleen
     [not found] ` <p73myw09g5w.fsf@bingen.suse.de>
2007-09-06 15:17   ` Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=11890103283456-git-send-email-avi@qumranet.com \
    --to=avi@qumranet.com \
    --cc=general@lists.openfabrics.org \
    --cc=kvm@qumranet.com \
    --cc=linux-mm@kvack.org \
    --cc=lkml@qumranet.com \
    --cc=shaohua.li@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox