* [PATCH][RFC]: pte notifiers -- support for external page tables
@ 2007-09-05 16:38 Avi Kivity
2007-09-05 19:05 ` Rik van Riel
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Avi Kivity @ 2007-09-05 16:38 UTC (permalink / raw)
To: lkml, linux-mm; +Cc: shaohua.li, kvm, general, Avi Kivity
Some hardware and software systems maintain page tables outside the normal
Linux page tables, which reference userspace memory. This includes
Infiniband, other RDMA-capable devices, and kvm (with a pending patch).
Because these systems maintain external page tables (and external tlbs),
Linux cannot demand-page this memory, so it must be locked. For kvm at
least, this is a significant reduction in functionality.
This sample patch adds a new mechanism, pte notifiers, that allows drivers
to register an interest in changes to ptes. Whenever Linux changes a
pte, it will call a notifier to allow the driver to adjust the external
page table and flush its tlb.
Note that only one notifier is implemented, ->clear(), but others should be
similar.
pte notifiers are different from paravirt_ops: they extend the normal
page tables rather than replace them; and they provide high-level information
such as the vma and the virtual address for the driver to use.
Signed-off-by: Avi Kivity <avi@qumranet.com>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 655094d..5d2bbee 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -14,6 +14,7 @@
#include <linux/debug_locks.h>
#include <linux/backing-dev.h>
#include <linux/mm_types.h>
+#include <linux/pte_notifier.h>
struct mempolicy;
struct anon_vma;
@@ -108,6 +109,9 @@ struct vm_area_struct {
#ifndef CONFIG_MMU
atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */
#endif
+#ifdef CONFIG_PTE_NOTIFIERS
+ struct list_head pte_notifier_list;
+#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
diff --git a/include/linux/pte_notifier.h b/include/linux/pte_notifier.h
new file mode 100644
index 0000000..d28832b
--- /dev/null
+++ b/include/linux/pte_notifier.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_PTE_NOTIFIER_H
+#define _LINUX_PTE_NOTIFIER_H
+
+#include <linux/list.h>
+
+struct vm_area_struct;
+
+#ifdef CONFIG_PTE_NOTIFIERS
+
+struct pte_notifier;
+
+struct pte_notifier_ops {
+ void (*close)(struct pte_notifier *pn, struct vm_area_struct *vma);
+ void (*clear)(struct pte_notifier *pn, struct vm_area_struct *vma,
+ unsigned long address);
+};
+
+struct pte_notifier {
+ struct list_head link;
+ const struct pte_notifier_ops *ops;
+};
+
+
+void vma_init_pte_notifiers(struct vm_area_struct *vma);
+void vma_close_pte_notifiers(struct vm_area_struct *vma);
+void pte_notifier_register(struct pte_notifier *pn,
+ struct vm_area_struct *vma);
+void pte_notifier_unregister(struct pte_notifier *pn);
+
+#define pte_notifier_call(vma, function, args...) \
+ do { \
+ struct pte_notifier *__pn; \
+ \
+ list_for_each_entry(__pn, &vma->pte_notifier_list, link) \
+ __pn->ops->function(__pn, vma, args); \
+ } while (0)
+
+#else
+
+static inline void vma_init_pte_notifiers(struct vm_area_struct *vma) {}
+static inline void vma_close_pte_notifiers(struct vm_area_struct *vma) {}
+static inline void pte_notifier_register(struct pte_notifier *pn,
+ struct vm_area_struct *vma) {}
+static inline void pte_notifier_unregister(struct pte_notifier *pn) {}
+
+#define pte_notifier_call(vma, function, args...) \
+ do { } while (0)
+
+#endif
+
+
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index e24d348..7b10151 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -176,3 +176,6 @@ config NR_QUICK
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config PTE_NOTIFIERS
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index 245e33a..59f6a03 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,4 +29,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_PTE_NOTIFIERS) += pte_notifiers.o
diff --git a/mm/mmap.c b/mm/mmap.c
index b653721..cc6c4fe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1134,6 +1134,7 @@ munmap_back:
vma->vm_page_prot = protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
vma->vm_pgoff = pgoff;
+ vma_init_pte_notifiers(vma);
if (file) {
error = -EINVAL;
diff --git a/mm/pte_notifier.c b/mm/pte_notifier.c
new file mode 100644
index 0000000..0b9076c
--- /dev/null
+++ b/mm/pte_notifier.c
@@ -0,0 +1,32 @@
+
+#include <linux/pte_notifier.h>
+
+void vma_init_pte_notifiers(struct vm_area_struct *vma)
+{
+ INIT_LIST_HEAD(&vma->pte_notifier_list);
+}
+EXPORT_SYMBOL_GPL(vma_init_pte_notifiers);
+
+void vma_destroy_pte_notifiers(struct vm_area_struct *vma)
+{
+ struct pte_notifier *pn;
+ struct list_head *n;
+
+ list_for_each_entry_safe(pn, n, &vma->pte_notifier_list, link) {
+ pn->ops->close(__pn, vma);
+ __list_del(n);
+ }
+}
+
+void pte_notifier_register(struct pte_notifier *pn, struct vm_area_struct *vma)
+{
+ list_add(&pn->link, &vma->pte_notifier_list);
+}
+EXPORT_SYMBOL_GPL(pte_notifier_register);
+
+void pte_notifier_unregister(struct pte_notifier *pn)
+{
+ list_del(&pn->link);
+}
+EXPORT_SYMBOL_GPL(pte_notifier_unregister);
+
diff --git a/mm/rmap.c b/mm/rmap.c
index 41ac397..3f61d38 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -682,6 +682,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/* Nuke the page table entry. */
+ pte_notifier_call(vma, clear, address);
flush_cache_page(vma, address, page_to_pfn(page));
pteval = ptep_clear_flush(vma, address, pte);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 16:38 [PATCH][RFC]: pte notifiers -- support for external page tables Avi Kivity
@ 2007-09-05 19:05 ` Rik van Riel
2007-09-05 19:14 ` Avi Kivity
2007-09-05 20:40 ` Jack Steiner
2007-09-06 6:24 ` [ofa-general] " Gleb Natapov
2 siblings, 1 reply; 20+ messages in thread
From: Rik van Riel @ 2007-09-05 19:05 UTC (permalink / raw)
To: Avi Kivity; +Cc: lkml, linux-mm, shaohua.li, kvm, general
Avi Kivity wrote:
> This sample patch adds a new mechanism, pte notifiers, that allows drivers
> to register an interest in a changes to ptes. Whenever Linux changes a
> pte, it will call a notifier to allow the driver to adjust the external
> page table and flush its tlb.
>
> Note that only one notifier is implemented, ->clear(), but others should be
> similar.
This approach makes a lot of sense.
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 41ac397..3f61d38 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -682,6 +682,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
> }
>
> /* Nuke the page table entry. */
> + pte_notifier_call(vma, clear, address);
> flush_cache_page(vma, address, page_to_pfn(page));
> pteval = ptep_clear_flush(vma, address, pte);
If you want this to be useful to Infiniband, you should probably
also hook up do_wp_page() in mm/memory.c, where a page table can
be pointed to another page.
Probably the code in mm/mremap.c will need to be hooked up too.
--
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is. Each group
calls the other unpatriotic.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 19:05 ` Rik van Riel
@ 2007-09-05 19:14 ` Avi Kivity
2007-09-05 19:23 ` Rik van Riel
0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2007-09-05 19:14 UTC (permalink / raw)
To: Rik van Riel; +Cc: linux-mm, shaohua.li, kvm, general
Rik van Riel wrote:
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 41ac397..3f61d38 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -682,6 +682,7 @@ static int try_to_unmap_one(struct page *page,
>> struct vm_area_struct *vma,
>> }
>>
>> /* Nuke the page table entry. */
>> + pte_notifier_call(vma, clear, address);
>> flush_cache_page(vma, address, page_to_pfn(page));
>> pteval = ptep_clear_flush(vma, address, pte);
>
> If you want this to be useful to Infiniband, you should probably
> also hook up do_wp_page() in mm/memory.c, where a page table can
> be pointed to another page.
>
> Probably the code in mm/mremap.c will need to be hooked up too.
>
I imagine that many of the paravirt_ops mmu hooks will need to be
exposed as pte notifiers. This can't be done as part of the
paravirt_ops code due to the need to pass high level data structures,
though.
--
Any sufficiently difficult bug is indistinguishable from a feature.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 19:14 ` Avi Kivity
@ 2007-09-05 19:23 ` Rik van Riel
2007-09-05 19:32 ` Avi Kivity
0 siblings, 1 reply; 20+ messages in thread
From: Rik van Riel @ 2007-09-05 19:23 UTC (permalink / raw)
To: Avi Kivity; +Cc: linux-mm, shaohua.li, kvm, general
Avi Kivity wrote:
> Rik van Riel wrote:
>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 41ac397..3f61d38 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -682,6 +682,7 @@ static int try_to_unmap_one(struct page *page,
>>> struct vm_area_struct *vma,
>>> }
>>>
>>> /* Nuke the page table entry. */
>>> + pte_notifier_call(vma, clear, address);
>>> flush_cache_page(vma, address, page_to_pfn(page));
>>> pteval = ptep_clear_flush(vma, address, pte);
>>
>> If you want this to be useful to Infiniband, you should probably
>> also hook up do_wp_page() in mm/memory.c, where a page table can
>> be pointed to another page.
>>
>> Probably the code in mm/mremap.c will need to be hooked up too.
>>
>
> I imagine that many of the paravirt_ops mmu hooks will need to be
> exposed as pte notifiers. This can't be done as part of the
> paravirt_ops code due to the need to pass high level data structures,
> though.
Wait, I thought that paravirt_ops was all on the side of the
guest kernel, where these host kernel operations are invisible?
--
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is. Each group
calls the other unpatriotic.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 19:23 ` Rik van Riel
@ 2007-09-05 19:32 ` Avi Kivity
2007-09-06 11:28 ` Jeremy Fitzhardinge
0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2007-09-05 19:32 UTC (permalink / raw)
To: Rik van Riel; +Cc: linux-mm, shaohua.li, kvm-devel, general, linux-kernel
Rik van Riel wrote:
>>
>> I imagine that many of the paravirt_ops mmu hooks will need to be
>> exposed as pte notifiers. This can't be done as part of the
>> paravirt_ops code due to the need to pass high level data structures,
>> though.
>
> Wait, I thought that paravirt_ops was all on the side of the
> guest kernel, where these host kernel operations are invisible?
>
It is, but the hooks are in much the same places. It could be argued
that you'd embed pte notifiers in paravirt_ops for a host kernel, but
that's not doable because pte notifiers use higher-level data structures
(like vmas).
--
Any sufficiently difficult bug is indistinguishable from a feature.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 16:38 [PATCH][RFC]: pte notifiers -- support for external page tables Avi Kivity
2007-09-05 19:05 ` Rik van Riel
@ 2007-09-05 20:40 ` Jack Steiner
2007-09-05 20:40 ` Avi Kivity
2007-09-05 20:42 ` Avi Kivity
2007-09-06 6:24 ` [ofa-general] " Gleb Natapov
2 siblings, 2 replies; 20+ messages in thread
From: Jack Steiner @ 2007-09-05 20:40 UTC (permalink / raw)
To: Avi Kivity; +Cc: lkml, linux-mm, shaohua.li, kvm, general
On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
> Some hardware and software systems maintain page tables outside the normal
> Linux page tables, which reference userspace memory. This includes
> Infiniband, other RDMA-capable devices, and kvm (with a pending patch).
>
I like it.
We have 2 special devices with external TLBs that can
take advantage of this.
One suggestion - at least for what we need. Can the notifier be
registered against the mm_struct instead of (or in addition to) the
vma?
---jack
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 20:40 ` Jack Steiner
@ 2007-09-05 20:40 ` Avi Kivity
2007-09-05 20:42 ` Avi Kivity
1 sibling, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2007-09-05 20:40 UTC (permalink / raw)
To: Jack Steiner; +Cc: lkml, linux-mm, shaohua.li, kvm, general
Jack Steiner wrote:
> On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
>
>> Some hardware and software systems maintain page tables outside the normal
>> Linux page tables, which reference userspace memory. This includes
>> Infiniband, other RDMA-capable devices, and kvm (with a pending patch).
>>
>>
>
> I like it.
>
> We have 2 special devices with external TLBs that can
> take advantage of this.
>
> One suggestion - at least for what we need. Can the notifier be
> registered against the mm_struct instead of (or in addition to) the
> vma?
>
Yes. It's a lot simpler since this way we don't have to support vma
creation/splitting/merging/destruction. There's a tiny performance hit
for kvm, but it isn't worth the bother.
Will implement for v2 of this patch.
--
Any sufficiently difficult bug is indistinguishable from a feature.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 20:40 ` Jack Steiner
2007-09-05 20:40 ` Avi Kivity
@ 2007-09-05 20:42 ` Avi Kivity
1 sibling, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2007-09-05 20:42 UTC (permalink / raw)
To: Jack Steiner; +Cc: linux-kernel, linux-mm, shaohua.li, kvm-devel, general
[resend due to broken cc list in my original post]
Jack Steiner wrote:
> On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
>
>> Some hardware and software systems maintain page tables outside the normal
>> Linux page tables, which reference userspace memory. This includes
>> Infiniband, other RDMA-capable devices, and kvm (with a pending patch).
>>
>>
>
> I like it.
>
> We have 2 special devices with external TLBs that can
> take advantage of this.
>
> One suggestion - at least for what we need. Can the notifier be
> registered against the mm_struct instead of (or in addition to) the
> vma?
>
Yes. It's a lot simpler since this way we don't have to support vma
creation/splitting/merging/destruction. There's a tiny performance hit
for kvm, but it isn't worth the bother.
Will implement for v2 of this patch.
--
Any sufficiently difficult bug is indistinguishable from a feature.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [ofa-general] [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 16:38 [PATCH][RFC]: pte notifiers -- support for external page tables Avi Kivity
2007-09-05 19:05 ` Rik van Riel
2007-09-05 20:40 ` Jack Steiner
@ 2007-09-06 6:24 ` Gleb Natapov
2007-09-06 8:35 ` Avi Kivity
2 siblings, 1 reply; 20+ messages in thread
From: Gleb Natapov @ 2007-09-06 6:24 UTC (permalink / raw)
To: Avi Kivity; +Cc: lkml, linux-mm, kvm, shaohua.li, general, addy
On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
> This sample patch adds a new mechanism, pte notifiers, that allows drivers
> to register an interest in a changes to ptes. Whenever Linux changes a
> pte, it will call a notifier to allow the driver to adjust the external
> page table and flush its tlb.
How is this different from http://lwn.net/Articles/133627/? AFAIR the
patch was rejected because there was only one user for it and it was
decided that it would be better to maintain it out of tree for a while.
--
Gleb.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [ofa-general] [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-06 6:24 ` [ofa-general] " Gleb Natapov
@ 2007-09-06 8:35 ` Avi Kivity
2007-09-06 8:41 ` Gleb Natapov
2007-09-10 18:17 ` Andrew Hastings
0 siblings, 2 replies; 20+ messages in thread
From: Avi Kivity @ 2007-09-06 8:35 UTC (permalink / raw)
To: Gleb Natapov; +Cc: lkml, linux-mm, kvm, shaohua.li, general, addy
Gleb Natapov wrote:
> On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
>
>> This sample patch adds a new mechanism, pte notifiers, that allows drivers
>> to register an interest in a changes to ptes. Whenever Linux changes a
>> pte, it will call a notifier to allow the driver to adjust the external
>> page table and flush its tlb.
>>
> How is this different from http://lwn.net/Articles/133627/? AFAIR the
> patch was rejected because there was only one user for it and it was
> decided that it would be better to maintain it out of tree for a while.
>
Your patch is more complete.
There are now at least three users: you, kvm, and newer Infiniband
HCAs. Care to resurrect the patch?
--
Any sufficiently difficult bug is indistinguishable from a feature.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [ofa-general] [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-06 8:35 ` Avi Kivity
@ 2007-09-06 8:41 ` Gleb Natapov
2007-09-10 18:17 ` Andrew Hastings
1 sibling, 0 replies; 20+ messages in thread
From: Gleb Natapov @ 2007-09-06 8:41 UTC (permalink / raw)
To: Avi Kivity; +Cc: lkml, linux-mm, kvm, shaohua.li, general, addy
On Thu, Sep 06, 2007 at 11:35:24AM +0300, Avi Kivity wrote:
> Gleb Natapov wrote:
>> On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
>>
>>> This sample patch adds a new mechanism, pte notifiers, that allows
>>> drivers
>>> to register an interest in a changes to ptes. Whenever Linux changes a
>>> pte, it will call a notifier to allow the driver to adjust the external
>>> page table and flush its tlb.
>>>
>> How is this different from http://lwn.net/Articles/133627/? AFAIR the
>> patch was rejected because there was only one user for it and it was
>> decided that it would be better to maintain it out of tree for a while.
>>
>
> Your patch is more complete.
>
> There are now at least three users: you, kvm, and newer Infiniband HCAs.
> Care to resurrect the patch?
>
This is not my patch :) This is a patch written by David Addison from
Quadrics. I CCed him on my previous email. I just saw that you are
trying to do something similar.
--
Gleb.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-05 19:32 ` Avi Kivity
@ 2007-09-06 11:28 ` Jeremy Fitzhardinge
0 siblings, 0 replies; 20+ messages in thread
From: Jeremy Fitzhardinge @ 2007-09-06 11:28 UTC (permalink / raw)
To: Avi Kivity
Cc: Rik van Riel, linux-mm, shaohua.li, kvm-devel, general, linux-kernel
Avi Kivity wrote:
> It is, but the hooks are in much the same places. It could be argued
> that you'd embed pte notifiers in paravirt_ops for a host kernel, but
> that's not doable because pte notifiers use higher-level data
> strutures (like vmas).
Also, I wouldn't like to preclude the possibility of having a kernel
that's both a guest and a host (ie, nested vmms).
J
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [ofa-general] [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-06 8:35 ` Avi Kivity
2007-09-06 8:41 ` Gleb Natapov
@ 2007-09-10 18:17 ` Andrew Hastings
2007-09-11 10:37 ` Daniel J Blueman
1 sibling, 1 reply; 20+ messages in thread
From: Andrew Hastings @ 2007-09-10 18:17 UTC (permalink / raw)
To: Avi Kivity; +Cc: Daniel Blueman, linux-mm
Avi Kivity wrote:
> Gleb Natapov wrote:
>> On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
>>
>>> This sample patch adds a new mechanism, pte notifiers, that allows
>>> drivers
>>> to register an interest in a changes to ptes. Whenever Linux changes a
>>> pte, it will call a notifier to allow the driver to adjust the external
>>> page table and flush its tlb.
>>>
>> How is this different from http://lwn.net/Articles/133627/? AFAIR the
>> patch was rejected because there was only one user for it and it was
>> decided that it would be better to maintain it out of tree for a while.
>>
>
> Your patch is more complete.
>
> There are now at least three users: you, kvm, and newer Infiniband
> HCAs. Care to resurrect the patch?
We (Cray) also use the ioproc patch. AFAIK the current maintainer is
Dan Blueman at Quadrics.
-Andrew Hastings
Cray Inc.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [ofa-general] [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-10 18:17 ` Andrew Hastings
@ 2007-09-11 10:37 ` Daniel J Blueman
2007-09-11 11:19 ` Gleb Natapov
0 siblings, 1 reply; 20+ messages in thread
From: Daniel J Blueman @ 2007-09-11 10:37 UTC (permalink / raw)
To: Avi Kivity; +Cc: Andrew Hastings, linux-mm
Andrew Hastings wrote:
> Avi Kivity wrote:
>> Gleb Natapov wrote:
>>> On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
>>>
>>>> This sample patch adds a new mechanism, pte notifiers, that allows
>>>> drivers
>>>> to register an interest in a changes to ptes. Whenever Linux changes a
>>>> pte, it will call a notifier to allow the driver to adjust the external
>>>> page table and flush its tlb.
>>>>
>>> How is this different from http://lwn.net/Articles/133627/? AFAIR the
>>> patch was rejected because there was only one user for it and it was
>>> decided that it would be better to maintain it out of tree for a while.
>>>
>>
>> Your patch is more complete.
>>
>> There are now at least three users: you, kvm, and newer Infiniband
>> HCAs. Care to resurrect the patch?
>
> We (Cray) also use the ioproc patch. AFAIK the current maintainer is
> Dan Blueman at Quadrics.
I should add that the IOPROC patches are maintained internally to
loosely track mainline kernels; however, we do not generally release [1]
these until they've passed quite a lot of validation (driven by customer
demand mostly) on various configurations.
Quite a few large users/groups would benefit from this; the IOPROC
patches have been stable for quite a while now, so are a good option.
If you have any feedback/suggestions that would help forward progress,
I'm happy to hear and address them.
Thanks,
Daniel
--- [1]
http://www.quadrics.com/patches
--
Daniel J Blueman
Software Engineer, Quadrics Ltd
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [ofa-general] [PATCH][RFC]: pte notifiers -- support for external page tables
2007-09-11 10:37 ` Daniel J Blueman
@ 2007-09-11 11:19 ` Gleb Natapov
0 siblings, 0 replies; 20+ messages in thread
From: Gleb Natapov @ 2007-09-11 11:19 UTC (permalink / raw)
To: Daniel J Blueman; +Cc: Avi Kivity, Andrew Hastings, linux-mm
On Tue, Sep 11, 2007 at 11:37:50AM +0100, Daniel J Blueman wrote:
> Andrew Hastings wrote:
>> Avi Kivity wrote:
>>> Gleb Natapov wrote:
>>>> On Wed, Sep 05, 2007 at 07:38:48PM +0300, Avi Kivity wrote:
>>>>
>>>>> This sample patch adds a new mechanism, pte notifiers, that allows
>>>>> drivers
>>>>> to register an interest in a changes to ptes. Whenever Linux changes a
>>>>> pte, it will call a notifier to allow the driver to adjust the external
>>>>> page table and flush its tlb.
>>>>>
>>>> How is this different from http://lwn.net/Articles/133627/? AFAIR the
>>>> patch was rejected because there was only one user for it and it was
>>>> decided that it would be better to maintain it out of tree for a while.
>>>>
>>>
>>> Your patch is more complete.
>>>
>>> There are now at least three users: you, kvm, and newer Infiniband HCAs.
>>> Care to resurrect the patch?
>> We (Cray) also use the ioproc patch. AFAIK the current maintainer is Dan
>> Blueman at Quadrics.
>
> I should add that the IOPROC patches are maintained internally to loosely
> track mainline kernels; however, we do not generally release [1] these
> until they've passed quite a lot of validation (driven by customer demand
> mostly) on various configurations.
>
> Quite a few large users/groups would benefit from this; the IOPROC patches
> have been stable for quite a while now, so are a good option.
>
> If you have any feedback/suggestions that would help forward progress, I'm
> happy to hear and address them.
>
Posting the patch against the current kernel (-mm or mainline) here would
certainly be helpful.
Thanks,
--
Gleb.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC] pte notifiers -- support for external page tables
[not found] ` <p73myw09g5w.fsf@bingen.suse.de>
@ 2007-09-06 15:17 ` Avi Kivity
0 siblings, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2007-09-06 15:17 UTC (permalink / raw)
To: Andi Kleen; +Cc: linux-kernel, linux-mm, kvm-devel, general
Andi Kleen wrote:
> Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> writes:
>
>> pte notifiers are different from paravirt_ops: they extend the normal
>> page tables rather than replace them; and they provide high-level information
>> such as the vma and the virtual address for the driver to use.
>>
>
> Sounds like a locking horror to me. To do anything with page tables
> you need locks. Both for the kernel page tables and for your new tables.
>
> What happens when people add all
> things of complicated operations in these notifiers? That will likely
> happen and then everytime you change something in VM code they
> will break. This has the potential to increase the cost of maintaining
> VM code considerably, which would be a bad thing.
>
> This is quite different from paravirt ops because low level pvops
> can typically run lockless by just doing some kind of hypercall directly.
> But that won't work for maintaining your custom page tables.
>
Okay, here's a possible fix: add ->lock() and ->unlock() callbacks, to
be called when mmap_sem is taken either for read or write. Also add a
->release() for when the mm goes away to avoid the need to care about
the entire data structure going away.
The notifier list would need to be kept sorted to avoid deadlocks.
--
Any sufficiently difficult bug is indistinguishable from a feature.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC] pte notifiers -- support for external page tables
2007-09-05 19:32 [PATCH][RFC] " Avi Kivity
2007-09-06 4:28 ` Shaohua Li
@ 2007-09-06 13:28 ` Andi Kleen
[not found] ` <p73myw09g5w.fsf@bingen.suse.de>
2 siblings, 0 replies; 20+ messages in thread
From: Andi Kleen @ 2007-09-06 13:28 UTC (permalink / raw)
To: Avi Kivity, linux-kernel, linux-mm
Avi Kivity <avi-atKUWr5tajBWk0Htik3J/w@public.gmane.org> writes:
>
> pte notifiers are different from paravirt_ops: they extend the normal
> page tables rather than replace them; and they provide high-level information
> such as the vma and the virtual address for the driver to use.
Sounds like a locking horror to me. To do anything with page tables
you need locks. Both for the kernel page tables and for your new tables.
What happens when people add all
kinds of complicated operations in these notifiers? That will likely
happen and then every time you change something in VM code they
will break. This has the potential to increase the cost of maintaining
VM code considerably, which would be a bad thing.
This is quite different from paravirt ops because low level pvops
can typically run lockless by just doing some kind of hypercall directly.
But that won't work for maintaining your custom page tables.
-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC] pte notifiers -- support for external page tables
2007-09-06 4:28 ` Shaohua Li
@ 2007-09-06 8:38 ` Avi Kivity
0 siblings, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2007-09-06 8:38 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-kernel, linux-mm, kvm-devel, general
Shaohua Li wrote:
> On Wed, 2007-09-05 at 22:32 +0300, Avi Kivity wrote:
>
>> [resend due to bad alias expansion resulting in some recipients
>> being bogus]
>>
>> Some hardware and software systems maintain page tables outside the normal
>> Linux page tables, which reference userspace memory. This includes
>> Infiniband, other RDMA-capable devices, and kvm (with a pending patch).
>>
>> Because these systems maintain external page tables (and external tlbs),
>> Linux cannot demand page this memory and it must be locked. For kvm at
>> least, this is a significant reduction in functionality.
>>
>> This sample patch adds a new mechanism, pte notifiers, that allows drivers
>> to register an interest in a changes to ptes. Whenever Linux changes a
>> pte, it will call a notifier to allow the driver to adjust the external
>> page table and flush its tlb.
>>
>> Note that only one notifier is implemented, ->clear(), but others should be
>> similar.
>>
>> pte notifiers are different from paravirt_ops: they extend the normal
>> page tables rather than replace them; and they provide high-level
>> information
>> such as the vma and the virtual address for the driver to use.
>>
> Looks great. So for kvm, all guest pages will be vma mapped?
> There are lock issues in kvm between kvm lock and page lock.
>
Yes, locking will be a headache.
> Will shadow page table be still stored in page->private? If yes, the
> page->private must be cleaned before add_to_swap.
>
page->private can be in use by filesystems, so we will need to move rmap
somewhere else.
--
Any sufficiently difficult bug is indistinguishable from a feature.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH][RFC] pte notifiers -- support for external page tables
2007-09-05 19:32 [PATCH][RFC] " Avi Kivity
@ 2007-09-06 4:28 ` Shaohua Li
2007-09-06 8:38 ` Avi Kivity
2007-09-06 13:28 ` Andi Kleen
[not found] ` <p73myw09g5w.fsf@bingen.suse.de>
2 siblings, 1 reply; 20+ messages in thread
From: Shaohua Li @ 2007-09-06 4:28 UTC (permalink / raw)
To: Avi Kivity; +Cc: linux-kernel, linux-mm, kvm-devel, general
On Wed, 2007-09-05 at 22:32 +0300, Avi Kivity wrote:
> [resend due to bad alias expansion resulting in some recipients
> being bogus]
>
> Some hardware and software systems maintain page tables outside the normal
> Linux page tables, which reference userspace memory. This includes
> Infiniband, other RDMA-capable devices, and kvm (with a pending patch).
>
> Because these systems maintain external page tables (and external tlbs),
> Linux cannot demand page this memory and it must be locked. For kvm at
> least, this is a significant reduction in functionality.
>
> This sample patch adds a new mechanism, pte notifiers, that allows drivers
> to register an interest in a changes to ptes. Whenever Linux changes a
> pte, it will call a notifier to allow the driver to adjust the external
> page table and flush its tlb.
>
> Note that only one notifier is implemented, ->clear(), but others should be
> similar.
>
> pte notifiers are different from paravirt_ops: they extend the normal
> page tables rather than replace them; and they provide high-level
> information
> such as the vma and the virtual address for the driver to use.
Looks great. So for kvm, all guest pages will be vma mapped?
There are lock issues in kvm between kvm lock and page lock.
Will shadow page table be still stored in page->private? If yes, the
page->private must be cleaned before add_to_swap.
Thanks,
Shaohua
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH][RFC] pte notifiers -- support for external page tables
@ 2007-09-05 19:32 Avi Kivity
2007-09-06 4:28 ` Shaohua Li
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Avi Kivity @ 2007-09-05 19:32 UTC (permalink / raw)
To: linux-kernel, linux-mm; +Cc: kvm-devel, general, shaohua.li, Avi Kivity
[resend due to bad alias expansion resulting in some recipients
being bogus]
Some hardware and software systems maintain page tables outside the normal
Linux page tables, which reference userspace memory. This includes
Infiniband, other RDMA-capable devices, and kvm (with a pending patch).
Because these systems maintain external page tables (and external tlbs),
Linux cannot demand page this memory and it must be locked. For kvm at
least, this is a significant reduction in functionality.
This sample patch adds a new mechanism, pte notifiers, that allows drivers
to register an interest in changes to ptes. Whenever Linux changes a
pte, it will call a notifier to allow the driver to adjust the external
page table and flush its tlb.
Note that only one notifier is implemented, ->clear(), but others should be
similar.
pte notifiers are different from paravirt_ops: they extend the normal
page tables rather than replace them; and they provide high-level information
such as the vma and the virtual address for the driver to use.
Signed-off-by: Avi Kivity <avi@qumranet.com>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 655094d..5d2bbee 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -14,6 +14,7 @@
#include <linux/debug_locks.h>
#include <linux/backing-dev.h>
#include <linux/mm_types.h>
+#include <linux/pte_notifier.h>
struct mempolicy;
struct anon_vma;
@@ -108,6 +109,9 @@ struct vm_area_struct {
#ifndef CONFIG_MMU
atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */
#endif
+#ifdef CONFIG_PTE_NOTIFIERS
+ struct list_head pte_notifier_list;
+#endif
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
diff --git a/include/linux/pte_notifier.h b/include/linux/pte_notifier.h
new file mode 100644
index 0000000..d28832b
--- /dev/null
+++ b/include/linux/pte_notifier.h
@@ -0,0 +1,63 @@
+#ifndef _LINUX_PTE_NOTIFIER_H
+#define _LINUX_PTE_NOTIFIER_H
+
+#include <linux/list.h>
+
+struct vm_area_struct;
+/* Declared unconditionally: the !CONFIG_PTE_NOTIFIERS stubs name it too. */
+struct pte_notifier;
+
+#ifdef CONFIG_PTE_NOTIFIERS
+
+/*
+ * Callbacks supplied by a driver that maintains external page tables.
+ *
+ * close: called for each registered notifier when the vma's list is torn down.
+ * clear: called through pte_notifier_call() for the pte at @address.
+ */
+struct pte_notifier_ops {
+	void (*close)(struct pte_notifier *pn, struct vm_area_struct *vma);
+	void (*clear)(struct pte_notifier *pn, struct vm_area_struct *vma,
+		      unsigned long address);
+};
+
+/* One registration; linked on vma->pte_notifier_list through @link. */
+struct pte_notifier {
+	struct list_head link;
+	const struct pte_notifier_ops *ops;
+};
+
+void vma_init_pte_notifiers(struct vm_area_struct *vma);
+void vma_close_pte_notifiers(struct vm_area_struct *vma);
+void pte_notifier_register(struct pte_notifier *pn,
+			   struct vm_area_struct *vma);
+void pte_notifier_unregister(struct pte_notifier *pn);
+
+/*
+ * Invoke ops->function(pn, vma, args...) on every notifier of @vma.
+ * @vma is evaluated exactly once; ##args drops the trailing comma so
+ * callbacks that take no extra arguments (such as ->close) can be called.
+ */
+#define pte_notifier_call(vma, function, args...) \
+	do { \
+		struct vm_area_struct *__vma = (vma); \
+		struct pte_notifier *__pn; \
+ \
+		list_for_each_entry(__pn, &__vma->pte_notifier_list, link) \
+			__pn->ops->function(__pn, __vma, ##args); \
+	} while (0)
+
+#else
+
+static inline void vma_init_pte_notifiers(struct vm_area_struct *vma) {}
+static inline void vma_close_pte_notifiers(struct vm_area_struct *vma) {}
+static inline void pte_notifier_register(struct pte_notifier *pn,
+					 struct vm_area_struct *vma) {}
+static inline void pte_notifier_unregister(struct pte_notifier *pn) {}
+
+#define pte_notifier_call(vma, function, args...) \
+	do { } while (0)
+
+#endif /* CONFIG_PTE_NOTIFIERS */
+
+#endif /* _LINUX_PTE_NOTIFIER_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index e24d348..7b10151 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -176,3 +176,6 @@ config NR_QUICK
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config PTE_NOTIFIERS
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index 245e33a..59f6a03 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -29,4 +29,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_PTE_NOTIFIERS) += pte_notifier.o
diff --git a/mm/mmap.c b/mm/mmap.c
index b653721..cc6c4fe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1134,6 +1134,7 @@ munmap_back:
vma->vm_page_prot = protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
vma->vm_pgoff = pgoff;
+ vma_init_pte_notifiers(vma);
if (file) {
error = -EINVAL;
diff --git a/mm/pte_notifier.c b/mm/pte_notifier.c
new file mode 100644
index 0000000..0b9076c
--- /dev/null
+++ b/mm/pte_notifier.c
@@ -0,0 +1,44 @@
+/*
+ * pte notifiers: per-vma callbacks for drivers that keep external page
+ * tables referencing userspace memory.
+ */
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/pte_notifier.h>
+
+/* Start @vma with an empty notifier list. */
+void vma_init_pte_notifiers(struct vm_area_struct *vma)
+{
+	INIT_LIST_HEAD(&vma->pte_notifier_list);
+}
+EXPORT_SYMBOL_GPL(vma_init_pte_notifiers);
+
+/*
+ * Detach every notifier from @vma, calling its ->close() callback.
+ * Named to match the prototype and stub in <linux/pte_notifier.h>.
+ */
+void vma_close_pte_notifiers(struct vm_area_struct *vma)
+{
+	struct pte_notifier *pn, *next;
+
+	list_for_each_entry_safe(pn, next, &vma->pte_notifier_list, link) {
+		/* Unlink before the callback so ->close() may free @pn. */
+		list_del_init(&pn->link);
+		pn->ops->close(pn, vma);
+	}
+}
+
+/* Add @pn to @vma's notifier list. */
+void pte_notifier_register(struct pte_notifier *pn, struct vm_area_struct *vma)
+{
+	list_add(&pn->link, &vma->pte_notifier_list);
+}
+EXPORT_SYMBOL_GPL(pte_notifier_register);
+
+/* Remove @pn from the notifier list it is currently linked on. */
+void pte_notifier_unregister(struct pte_notifier *pn)
+{
+	list_del(&pn->link);
+}
+EXPORT_SYMBOL_GPL(pte_notifier_unregister);
diff --git a/mm/rmap.c b/mm/rmap.c
index 41ac397..3f61d38 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -682,6 +682,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/* Nuke the page table entry. */
+ pte_notifier_call(vma, clear, address);
flush_cache_page(vma, address, page_to_pfn(page));
pteval = ptep_clear_flush(vma, address, pte);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2007-09-11 11:19 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-09-05 16:38 [PATCH][RFC]: pte notifiers -- support for external page tables Avi Kivity
2007-09-05 19:05 ` Rik van Riel
2007-09-05 19:14 ` Avi Kivity
2007-09-05 19:23 ` Rik van Riel
2007-09-05 19:32 ` Avi Kivity
2007-09-06 11:28 ` Jeremy Fitzhardinge
2007-09-05 20:40 ` Jack Steiner
2007-09-05 20:40 ` Avi Kivity
2007-09-05 20:42 ` Avi Kivity
2007-09-06 6:24 ` [ofa-general] " Gleb Natapov
2007-09-06 8:35 ` Avi Kivity
2007-09-06 8:41 ` Gleb Natapov
2007-09-10 18:17 ` Andrew Hastings
2007-09-11 10:37 ` Daniel J Blueman
2007-09-11 11:19 ` Gleb Natapov
2007-09-05 19:32 [PATCH][RFC] " Avi Kivity
2007-09-06 4:28 ` Shaohua Li
2007-09-06 8:38 ` Avi Kivity
2007-09-06 13:28 ` Andi Kleen
[not found] ` <p73myw09g5w.fsf@bingen.suse.de>
2007-09-06 15:17 ` Avi Kivity
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox