linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* mmu notifiers
@ 2008-01-09 18:19 Andrea Arcangeli
  2008-01-09 21:54 ` Christoph Lameter
  0 siblings, 1 reply; 23+ messages in thread
From: Andrea Arcangeli @ 2008-01-09 18:19 UTC (permalink / raw)
  To: kvm-devel, linux-mm; +Cc: Daniel J Blueman

Hello,

This patch is a first basic implementation of the mmu notifiers. More
methods can be added in the future.

In short when the linux VM decides to free a page, it will unmap it
from the linux pagetables. However when a page is mapped not just by
the regular linux ptes, but also from the shadow pagetables, it's
currently unfreeable by the linux VM.

This patch allows the shadow pagetables to be dropped and the page to
be freed after that, if the linux VM decides to unmap the page from
the main ptes because it wants to swap out the page.

In my basic initial patch I only track the tlb flushes which should be
the minimum required to have a nice linux-VM controlled swapping
behavior of the KVM gphysical memory. The shadow-ptes work much like
a TLB, so the same way we flush the tlb after clearing the ptes, we
should also issue the mmu_notifier invalidate_page/range/release
methods. Quadrics needs much more than that to optimize things but
it's easy to add more methods to the below code to fit their needs if
the basic is ok.

This follows the model of Avi's original patch, however I guess it
would also be possible to track when the VM shrink_cache methods want
to free a certain host-page_t instead of tracking when the tlb is
flushed. Not sure what's better, but the below should be enough for
KVM to swap nicely with minimal overhead to the host kernel even if
KVM is unused.

About the locking perhaps I'm underestimating it, but by following the
TLB flushing analogy, by simply clearing the shadow ptes (with kvm
mmu_lock spinlock) and flushing the shadow-pte after clearing the main
linux pte, it should be enough to serialize against shadow-pte page
faults that would call into get_user_pages. Flushing the host TLB
before or after the shadow-ptes shouldn't matter.

Comments welcome... especially from Quadrics. Patch is mostly
untested, tomorrow I'll try to plug KVM on top of the below and see if
it survives swap.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -86,6 +86,7 @@ do {									\
 	pte_t __pte;							\
 	__pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep);	\
 	flush_tlb_page(__vma, __address);				\
+	mmu_notifier(invalidate_page, (__vma)->vm_mm, __address);	\
 	__pte;								\
 })
 #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -13,6 +13,7 @@
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
 #include <linux/security.h>
+#include <linux/mmu_notifier.h>
 
 struct mempolicy;
 struct anon_vma;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -219,6 +219,10 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;
 	struct kioctx		*ioctx_list;
+
+#ifdef CONFIG_MMU_NOTIFIER
+	struct hlist_head mmu_notifier; /* MMU notifier list */
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
new file mode 100644
--- /dev/null
+++ b/include/linux/mmu_notifier.h
@@ -0,0 +1,53 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/mm_types.h>
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier;
+
+struct mmu_notifier_ops {
+	void (*release)(struct mmu_notifier * mn,
+			struct mm_struct *mm);
+	void (*invalidate_page)(struct mmu_notifier * mn,
+				struct mm_struct *mm,
+				unsigned long address);
+	void (*invalidate_range)(struct mmu_notifier * mn,
+				 struct mm_struct *mm,
+				 unsigned long start, unsigned long end);
+};
+
+struct mmu_notifier {
+	struct hlist_node hlist;
+	const struct mmu_notifier_ops *ops;
+};
+
+extern void mmu_notifier_register(struct mmu_notifier *mn,
+				  struct mm_struct *mm);
+extern void mmu_notifier_unregister(struct mmu_notifier *mn);
+extern void mmu_notifier_release(struct mm_struct *mm);
+
+#define mmu_notifier(function, mm, args...)				\
+	do {								\
+		struct mmu_notifier *__mn;				\
+		struct hlist_node *__n;					\
+									\
+		hlist_for_each_entry(__mn, __n, &(mm)->mmu_notifier, hlist) \
+			if (__mn->ops->function)			\
+				__mn->ops->function(__mn, mm, args);	\
+	} while (0)
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+#define mmu_notifier_register(mn, mm) do {} while(0)
+#define mmu_notifier_unregister(mn) do {} while (0)
+#define mmu_notifier_release(mm) do {} while (0)
+
+#define mmu_notifier(function, mm, args...)	\
+	do { } while (0)
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,3 +193,7 @@ config VIRT_TO_BUS
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	def_bool y
+	bool "MMU notifier, for paging KVM/RDMA"
diff --git a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,4 +30,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -753,6 +753,7 @@ void __unmap_hugepage_range(struct vm_ar
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier(invalidate_range, mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -889,6 +889,7 @@ unsigned long zap_page_range(struct vm_a
 	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
 	if (tlb)
 		tlb_finish_mmu(tlb, address, end);
+	mmu_notifier(invalidate_range, mm, address, end);
 	return end;
 }
 
@@ -1358,6 +1359,7 @@ int remap_pfn_range(struct vm_area_struc
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier(invalidate_range, mm, end-PAGE_ALIGN(size), end);
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
@@ -1452,6 +1454,7 @@ int apply_to_page_range(struct mm_struct
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier(invalidate_range, mm, end-size, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1747,6 +1747,7 @@ static void unmap_region(struct mm_struc
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
 				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	mmu_notifier(invalidate_range, mm, start, end);
 }
 
 /*
@@ -2043,6 +2044,7 @@ void exit_mmap(struct mm_struct *mm)
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
+	mmu_notifier_release(mm);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,35 @@
+/*
+ *  linux/mm/mmu_notifier.c
+ *
+ *  Copyright (C) 2008  Qumranet, Inc.
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+
+void mmu_notifier_release(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n, *tmp;
+
+	hlist_for_each_entry_safe(mn, n, tmp, &mm->mmu_notifier, hlist) {
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+		hlist_del(n);
+	}
+}
+
+void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	hlist_add_head(&mn->hlist, &mm->mmu_notifier);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+void mmu_notifier_unregister(struct mmu_notifier *mn)
+{
+	hlist_del(&mn->hlist);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: mmu notifiers
  2008-01-09 18:19 mmu notifiers Andrea Arcangeli
@ 2008-01-09 21:54 ` Christoph Lameter
  2008-01-10 11:44   ` [kvm-devel] " Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2008-01-09 21:54 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: kvm-devel, linux-mm, Daniel J Blueman

On Wed, 9 Jan 2008, Andrea Arcangeli wrote:

> This patch is a first basic implementation of the mmu notifiers. More
> methods can be added in the future.
> 
> In short when the linux VM decides to free a page, it will unmap it
> from the linux pagetables. However when a page is mapped not just by
> the regular linux ptes, but also from the shadow pagetables, it's
> currently unfreeable by the linux VM.

Such a patch would also address issues that SGI has with exporting 
mappings via XPMEM. Plus a variety of other uses. Go ahead and lets do 
more in this area.

Are the KVM folks interested in exporting memory from one guest to 
another? That may also become possible with some of the work that we have 
in progress and that also requires a patch like this.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-09 21:54 ` Christoph Lameter
@ 2008-01-10 11:44   ` Avi Kivity
  2008-01-10 13:16     ` Robin Holt
  2008-01-10 19:04     ` Christoph Lameter
  0 siblings, 2 replies; 23+ messages in thread
From: Avi Kivity @ 2008-01-10 11:44 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrea Arcangeli, kvm-devel, linux-mm, Daniel J Blueman

Christoph Lameter wrote:
> On Wed, 9 Jan 2008, Andrea Arcangeli wrote:
>
>   
>> This patch is a first basic implementation of the mmu notifiers. More
>> methods can be added in the future.
>>
>> In short when the linux VM decides to free a page, it will unmap it
>> from the linux pagetables. However when a page is mapped not just by
>> the regular linux ptes, but also from the shadow pagetables, it's
>> currently unfreeable by the linux VM.
>>     
>
> Such a patch would also address issues that SGI has with exporting 
> mappings via XPMEM. Plus a variety of other uses. Go ahead and lets do 
> more in this area.
>
> Are the KVM folks interested in exporting memory from one guest to 
> another? That may also become possible with some of the work that we have 
> in progress and that also requires a patch like this.
>
>   

Actually sharing memory is possible even without this patch; one simply 
mmap()s a file into the address space of both guests.  Or are you 
referring to something else?

The patch does enable some nifty things; one example you may be familiar 
with is using page migration to move a guest from one numa node to another.

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-10 11:44   ` [kvm-devel] " Avi Kivity
@ 2008-01-10 13:16     ` Robin Holt
  2008-01-10 13:27       ` Avi Kivity
  2008-01-10 19:04     ` Christoph Lameter
  1 sibling, 1 reply; 23+ messages in thread
From: Robin Holt @ 2008-01-10 13:16 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Christoph Lameter, Andrea Arcangeli, kvm-devel, linux-mm,
	Daniel J Blueman

On Thu, Jan 10, 2008 at 01:44:18PM +0200, Avi Kivity wrote:
> Christoph Lameter wrote:
>> On Wed, 9 Jan 2008, Andrea Arcangeli wrote:
>>
>>   
>>> This patch is a first basic implementation of the mmu notifiers. More
>>> methods can be added in the future.
>>>
>>> In short when the linux VM decides to free a page, it will unmap it
>>> from the linux pagetables. However when a page is mapped not just by
>>> the regular linux ptes, but also from the shadow pagetables, it's
>>> currently unfreeable by the linux VM.
>>>     
>>
>> Such a patch would also address issues that SGI has with exporting 
>> mappings via XPMEM. Plus a variety of other uses. Go ahead and lets do 
>> more in this area.
>>
>> Are the KVM folks interested in exporting memory from one guest to 
>> another? That may also become possible with some of the work that we have 
>> in progress and that also requires a patch like this.
>>
>>   
>
> Actually sharing memory is possible even without this patch; one simply 
> mmap()s a file into the address space of both guests.  Or are you referring 
> to something else?

He is referring to the xpmem work SGI has pushed in the past.  It was
rejected precisely because this type of functionality did not exist.  We were
trying to determine the cleanest yet smallest acceptable implementation
when this suddenly sprang up.  I would expect Dean Nelson or myself to
repost the xpmem patch set again based upon this patch.

> The patch does enable some nifty things; one example you may be familiar 
> with is using page migration to move a guest from one numa node to another.

xpmem allows one MPI rank to "export" his address space, a different
MPI rank to "import" that address space, and they share the same pages.
This allows sharing of things like stack and heap space.  XPMEM also
provides a mechanism to share that PFN information across partition
boundaries so the pages become available on a different host.  This,
of course, is dependent upon hardware that supports direct access to
the memory by the processor.

Thanks,
Robin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-10 13:16     ` Robin Holt
@ 2008-01-10 13:27       ` Avi Kivity
  2008-01-10 14:50         ` Robin Holt
  2008-01-10 19:06         ` Christoph Lameter
  0 siblings, 2 replies; 23+ messages in thread
From: Avi Kivity @ 2008-01-10 13:27 UTC (permalink / raw)
  To: Robin Holt
  Cc: Christoph Lameter, Andrea Arcangeli, kvm-devel, linux-mm,
	Daniel J Blueman

Robin Holt wrote:
>
>> The patch does enable some nifty things; one example you may be familiar 
>> with is using page migration to move a guest from one numa node to another.
>>     
>
> xpmem allows one MPI rank to "export" his address space, a different
> MPI rank to "import" that address space, and they share the same pages.
> This allows sharing of things like stack and heap space.  XPMEM also
> provides a mechanism to share that PFN information across partition
> boundaries so the pages become available on a different host.  This,
> of course, is dependent upon hardware that supports direct access to
> the memory by the processor.
>
>   

So this is yet another instance of hardware that has a tlb that needs to 
be kept in sync with the page tables, yes?

Excellent, the more users the patch has, the easier it will be to 
justify it.

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-10 13:27       ` Avi Kivity
@ 2008-01-10 14:50         ` Robin Holt
  2008-01-10 19:06         ` Christoph Lameter
  1 sibling, 0 replies; 23+ messages in thread
From: Robin Holt @ 2008-01-10 14:50 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Robin Holt, Christoph Lameter, Andrea Arcangeli, kvm-devel,
	linux-mm, Daniel J Blueman

On Thu, Jan 10, 2008 at 03:27:24PM +0200, Avi Kivity wrote:
> Robin Holt wrote:
>>
>>> The patch does enable some nifty things; one example you may be familiar 
>>> with is using page migration to move a guest from one numa node to 
>>> another.
>>>     
>>
>> xpmem allows one MPI rank to "export" his address space, a different
>> MPI rank to "import" that address space, and they share the same pages.
>> This allows sharing of things like stack and heap space.  XPMEM also
>> provides a mechanism to share that PFN information across partition
>> boundaries so the pages become available on a different host.  This,
>> of course, is dependent upon hardware that supports direct access to
>> the memory by the processor.
>>
>>   
>
> So this is yet another instance of hardware that has a tlb that needs to be 
> kept in sync with the page tables, yes?

Yep, the external TLBs happen to be cpus in a different OS instance,
but you get the idea.

> Excellent, the more users the patch has, the easier it will be to justify 
> it.

I think we have another hardware device driver that will use it first.
It is sort of a hardware coprocessor that is available from user space
to do operations against a process's address space.  That driver will
probably be first out the door.

Looking at the mmu_notifiers patch, there are locks held which will
preclude the use of invalidate_page for xpmem.  In that circumstance,
the clearing operation will need to be messaged to the other OS instance
and that will certainly involve putting the current task to sleep.

We will work on that detail later.  First, we will focus on getting the
other driver submitted to the community.

Thanks,
Robin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-10 11:44   ` [kvm-devel] " Avi Kivity
  2008-01-10 13:16     ` Robin Holt
@ 2008-01-10 19:04     ` Christoph Lameter
  2008-01-12 19:51       ` Avi Kivity
  1 sibling, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2008-01-10 19:04 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andrea Arcangeli, kvm-devel, linux-mm, Daniel J Blueman

On Thu, 10 Jan 2008, Avi Kivity wrote:

> Actually sharing memory is possible even without this patch; one simply
> mmap()s a file into the address space of both guests.  Or are you referring to
> something else?

A file from where? If a file is read by two guests then they will have 
distinct page structs.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-10 13:27       ` Avi Kivity
  2008-01-10 14:50         ` Robin Holt
@ 2008-01-10 19:06         ` Christoph Lameter
  2008-01-12 19:56           ` Avi Kivity
  1 sibling, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2008-01-10 19:06 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Robin Holt, Andrea Arcangeli, kvm-devel, linux-mm, Daniel J Blueman

On Thu, 10 Jan 2008, Avi Kivity wrote:

> So this is yet another instance of hardware that has a tlb that needs to be
> kept in sync with the page tables, yes?

Correct. 

> Excellent, the more users the patch has, the easier it will be to justify it.

We'd like to make sure though that we can sleep when the hooks have been 
called. We may have to send a message to kick remote ptes out when local 
pte changes happen.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-10 19:04     ` Christoph Lameter
@ 2008-01-12 19:51       ` Avi Kivity
  2008-01-13 12:09         ` Robin Holt
  2008-01-14 19:49         ` Christoph Lameter
  0 siblings, 2 replies; 23+ messages in thread
From: Avi Kivity @ 2008-01-12 19:51 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andrea Arcangeli, kvm-devel, linux-mm, Daniel J Blueman

Christoph Lameter wrote:
> On Thu, 10 Jan 2008, Avi Kivity wrote:
>
>   
>> Actually sharing memory is possible even without this patch; one simply
>> mmap()s a file into the address space of both guests.  Or are you referring to
>> something else?
>>     
>
> A file from where? If a file is read by two guests then they will have 
> distinct page structs.
>
>   

Two kvm instances mmap() the file (from anywhere) into the guest address 
space.  That memory is shared, and will be backed by the same page 
structs at the same offset.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-10 19:06         ` Christoph Lameter
@ 2008-01-12 19:56           ` Avi Kivity
  0 siblings, 0 replies; 23+ messages in thread
From: Avi Kivity @ 2008-01-12 19:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Robin Holt, Andrea Arcangeli, kvm-devel, linux-mm, Daniel J Blueman

Christoph Lameter wrote:
>   
>> Excellent, the more users the patch has, the easier it will be to justify it.
>>     
>
> We'd like to make sure though that we can sleep when the hooks have been 
> called. We may have to send a message to kick remote ptes out when local 
> pte changes happen.
>
>   

It may be as simple as moving the notifier calls down to a sleeping 
context, away from the pte lock and any friends.

kvm also needs to send a message on an mmu notification, but that's just 
an IPI within the same host.

-- 
Any sufficiently difficult bug is indistinguishable from a feature.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-12 19:51       ` Avi Kivity
@ 2008-01-13 12:09         ` Robin Holt
  2008-01-13 12:28           ` Avi Kivity
  2008-01-14 19:49         ` Christoph Lameter
  1 sibling, 1 reply; 23+ messages in thread
From: Robin Holt @ 2008-01-13 12:09 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Christoph Lameter, Andrea Arcangeli, kvm-devel, linux-mm,
	Daniel J Blueman

On Sat, Jan 12, 2008 at 09:51:56PM +0200, Avi Kivity wrote:
> Christoph Lameter wrote:
>> On Thu, 10 Jan 2008, Avi Kivity wrote:
>>
>>   
>>> Actually sharing memory is possible even without this patch; one simply
>>> mmap()s a file into the address space of both guests.  Or are you 
>>> referring to
>>> something else?
>>>     
>>
>> A file from where? If a file is read by two guests then they will have 
>> distinct page structs.
>>
>>   
>
> Two kvm instances mmap() the file (from anywhere) into the guest address 
> space.  That memory is shared, and will be backed by the same page structs 
> at the same offset.

That sounds nice, but...

For larger machine configurations, we have different memory access
capabilities.  When a partition that is located close to the home node
of the memory accesses memory, it is normal access.  When it is further
away, they get special access to the line.  Before the shared line is
sent to the reading node, it is converted by the memory controller into
an exclusive request and the reading node is handed the only copy of
the line.  If we gave a remote kernel access to the page, we would also
open the entire owning nodes page tables up to speculative references
which effectively would be viewed by hardware as cache-line contention.

Additionally, we have needs beyond memory backed by files.  Including
special devices which do not have struct pages at all (see mspec.c).

Thanks,
Robin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-13 12:09         ` Robin Holt
@ 2008-01-13 12:28           ` Avi Kivity
  2008-01-14 19:51             ` Christoph Lameter
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2008-01-13 12:28 UTC (permalink / raw)
  To: Robin Holt
  Cc: Christoph Lameter, Andrea Arcangeli, kvm-devel, linux-mm,
	Daniel J Blueman

Robin Holt wrote:
> On Sat, Jan 12, 2008 at 09:51:56PM +0200, Avi Kivity wrote:
>   
>> Christoph Lameter wrote:
>>     
>>> On Thu, 10 Jan 2008, Avi Kivity wrote:
>>>
>>>   
>>>       
>>>> Actually sharing memory is possible even without this patch; one simply
>>>> mmap()s a file into the address space of both guests.  Or are you 
>>>> referring to
>>>> something else?
>>>>     
>>>>         
>>> A file from where? If a file is read by two guests then they will have 
>>> distinct page structs.
>>>
>>>   
>>>       
>> Two kvm instances mmap() the file (from anywhere) into the guest address 
>> space.  That memory is shared, and will be backed by the same page structs 
>> at the same offset.
>>     
>
> That sounds nice, but...
>
> For larger machine configurations, we have different memory access
> capabilities.  When a partition that is located close to the home node
> of the memory accesses memory, it is normal access.  When it is further
> away, they get special access to the line.  Before the shared line is
> sent to the reading node, it is converted by the memory controller into
> an exclusive request and the reading node is handed the only copy of
> the line.  If we gave a remote kernel access to the page, we would also
> open the entire owning nodes page tables up to speculative references
> which effectively would be viewed by hardware as cache-line contention.
>
> Additionally, we have needs beyond memory backed by files.  Including
> special devices which do not have struct pages at all (see mspec.c).
>   

I don't understand.

I was just explaining how kvm shares memory among guests (which does not 
require mmu notifiers); if you have some other configuration that can 
benefit from mmu notifiers, then, well, great.

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-12 19:51       ` Avi Kivity
  2008-01-13 12:09         ` Robin Holt
@ 2008-01-14 19:49         ` Christoph Lameter
  2008-01-15  7:38           ` Avi Kivity
  1 sibling, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2008-01-14 19:49 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Andrea Arcangeli, kvm-devel, linux-mm, Daniel J Blueman

On Sat, 12 Jan 2008, Avi Kivity wrote:

> Two kvm instances mmap() the file (from anywhere) into the guest address
> space.  That memory is shared, and will be backed by the same page structs at
> the same offset.

Duh. Impossible. Two instances of Linux cannot share page structs. So how 
are you doing this? Or is this just an idea?



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-13 12:28           ` Avi Kivity
@ 2008-01-14 19:51             ` Christoph Lameter
  2008-01-15  7:43               ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2008-01-14 19:51 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Robin Holt, Andrea Arcangeli, kvm-devel, linux-mm, Daniel J Blueman

On Sun, 13 Jan 2008, Avi Kivity wrote:

> I was just explaining how kvm shares memory among guests (which does not
> require mmu notifiers); if you have some other configuration that can benefit
> from mmu notifiers, then, well, great.

I think you have two page tables pointing to the same memory location 
right (not to page structs but two ptes)? Without a mmu notifier the pages 
in this memory range cannot be evicted because otherwise ptes of the other 
instance will point to a page that is now used for a different purpose.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-14 19:49         ` Christoph Lameter
@ 2008-01-15  7:38           ` Avi Kivity
  2008-01-15 17:39             ` Christoph Lameter
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2008-01-15  7:38 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli

Christoph Lameter wrote:
> On Sat, 12 Jan 2008, Avi Kivity wrote:
>
>   
>> Two kvm instances mmap() the file (from anywhere) into the guest address
>> space.  That memory is shared, and will be backed by the same page structs at
>> the same offset.
>>     
>
> Duh. Impossible. Two instances of Linux cannot share page structs. So how 
> are you doing this? Or is this just an idea?
>
>   

I was describing one Linux host running two guest instances.  The page 
structs are in the host, so they are shared by mmap().

kvm userspace is just an ordinary host process, it can mmap() any file 
it likes and then assign that virtual memory range to the guest (as 
guest physical memory).

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-14 19:51             ` Christoph Lameter
@ 2008-01-15  7:43               ` Avi Kivity
  0 siblings, 0 replies; 23+ messages in thread
From: Avi Kivity @ 2008-01-15  7:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli, Robin Holt

Christoph Lameter wrote:
> On Sun, 13 Jan 2008, Avi Kivity wrote:
>
>   
>> I was just explaining how kvm shares memory among guests (which does not
>> require mmu notifiers); if you have some other configuration that can benefit
>> from mmu notifiers, then, well, great.
>>     
>
> I think you have two page tables pointing to the same memory location 
> right (not to page structs but two ptes)? Without a mmu notifier the pages 
> in this memory range cannot be evicted because otherwise ptes of the other 
> instance will point to a page that is now used for a different purpose.
>   

Even with just one guest we can't swap well without mmu notifiers.

kvm constructs new page tables for the guest that the Linux vm doesn't 
know about, so when Linux removes all the ptes, we need a callback to 
remove the kvm private ptes (and tlb entries).

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-15  7:38           ` Avi Kivity
@ 2008-01-15 17:39             ` Christoph Lameter
  2008-01-15 17:53               ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2008-01-15 17:39 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli

On Tue, 15 Jan 2008, Avi Kivity wrote:

> > Duh. Impossible. Two instances of Linux cannot share page structs. So how
> > are you doing this? Or is this just an idea?
> 
> I was describing one Linux host running two guest instances.  The page structs
> are in the host, so they are shared by mmap().

Ahh.. Okay I was talking about a guest exporting its memory to another 
guest.
 
> kvm userspace is just an ordinary host process, it can mmap() any file it
> likes and then assign that virtual memory range to the guest (as guest
> physical memory).

But then the guest does not have its own page struct to manage the memory.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-15 17:39             ` Christoph Lameter
@ 2008-01-15 17:53               ` Avi Kivity
  2008-01-15 17:57                 ` Christoph Lameter
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2008-01-15 17:53 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli

Christoph Lameter wrote:
> On Tue, 15 Jan 2008, Avi Kivity wrote:
>
>   
>>> Duh. Impossible. Two instances of Linux cannot share page structs. So how
>>> are you doing this? Or is this just an idea?
>>>       
>> I was describing one Linux host running two guest instances.  The page structs
>> are in the host, so they are shared by mmap().
>>     
>
> Ahh.. Okay I was talking about a guest exporting its memory to another 
> guest.
>   

That's not very different, if they are on the same host?

>  
>   
>> kvm userspace is just an ordinary host process, it can mmap() any file it
>> likes and then assign that virtual memory range to the guest (as guest
>> physical memory).
>>     
>
> But then the guest does not have its own page struct to manage the memory.
>
>   

Why not?  It's just a block of memory as far as the guest is concerned.  
It's entirely up to it whether to create page structs or not.

Example:

qemu 1:

   p = mmap("/dev/shm/blah", size, ... );
   ioctl(vm_fd, KVM_CREATE_MEMORY_REGION_USER, { p, size, 0x10000000, ... });

qemu 2:

   p = mmap("/dev/shm/blah", size, ... );
   ioctl(vm_fd, KVM_CREATE_MEMORY_REGION_USER, { p, size, 0x10000000, ... });

Physical address 0x10000000, of both guests, would map to the same page.

Of course, ordinary Linux kernels can't do much with memory that is 
shared with another guest.

I've a feeling we need a whiteboard.

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-15 17:53               ` Avi Kivity
@ 2008-01-15 17:57                 ` Christoph Lameter
  2008-01-15 18:06                   ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2008-01-15 17:57 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli

On Tue, 15 Jan 2008, Avi Kivity wrote:

> > Ahh.. Okay I was talking about a guest exporting its memory to another
> > guest.
> >   
> 
> That's not very different, if they are on the same host?

But each guest has its own page structs. They cannot share page structs. 
Concurrent access of two independent kernel instances for synchronization 
and status maintenance to a single page struct?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-15 17:57                 ` Christoph Lameter
@ 2008-01-15 18:06                   ` Avi Kivity
  2008-01-15 18:16                     ` Christoph Lameter
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2008-01-15 18:06 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli

Christoph Lameter wrote:
> On Tue, 15 Jan 2008, Avi Kivity wrote:
>
>   
>>> Ahh.. Okay I was talking about a guest exporting its memory to another
>>> guest.
>>>   
>>>       
>> That's not very different, if they are on the same host?
>>     
>
> But each guest has its own page structs. They cannot share page structs. 
> Concurrent access of two independent kernel instances for synchronization 
> and status maintenance to a single page struct?
>   

There's a host page struct (that the guest know nothing about and cannot 
touch), and optionally a guest page struct for each guest (that the host 
and the other guest know nothing about).

The guest page struct is optional, since it is up to the guest to create 
it.  kvm doesn't care.  If the guest isn't Linux, there certainly won't 
be a page struct.

The host page struct may disappear if the host decides to swap the page 
into its backing store and free the page.  The guest page structs (if 
any) would remain.

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-15 18:06                   ` Avi Kivity
@ 2008-01-15 18:16                     ` Christoph Lameter
  2008-01-16  7:39                       ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Lameter @ 2008-01-15 18:16 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli

On Tue, 15 Jan 2008, Avi Kivity wrote:

> > But each guest has its own page structs. They cannot share page structs.
> > Concurrent access of two independent kernel instances for synchronization
> > and status maintenance to a single page struct?
> >   
> 
> There's a host page struct (that the guest know nothing about and cannot
> touch), and optionally a guest page struct for each guest (that the host and
> the other guest know nothing about).

Ok so if two linux guests want to share memory three page structs are 
involved:

1. Host page struct
2. Guest #1 page struct
3. Guest #2 page struct

I can understand that 1 and 2 point to the same physical page. Even all 
three could point to the same page if the page is readonly. 

However, lets say that Guest #1 allocates some anonymous memory and wants
to share it with Guest #2. In that case something like PFNMAP is likely
going to be used? Or are you remapping the physical page so that #1 and #2 
share it? In that case two page struct describe state of the same physical
page and we have no effective synchronization for writeback etc.

> The host page struct may disappear if the host decides to swap the page into
> its backing store and free the page.  The guest page structs (if any) would
> remain.

Page structs never disappear. The pte's may disappear and the page may be 
unmapped from an address space of a process but the page struct stays. 
Page struct can only disappear if memory hotplug is activated and memory 
is taken out of the system.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-15 18:16                     ` Christoph Lameter
@ 2008-01-16  7:39                       ` Avi Kivity
  2008-01-16 18:08                         ` Christoph Lameter
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2008-01-16  7:39 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli

Christoph Lameter wrote:
> On Tue, 15 Jan 2008, Avi Kivity wrote:
>
>   
>>> But each guest has its own page structs. They cannot share page structs.
>>> Concurrent access of two independent kernel instances for synchronization
>>> and status maintenance to a single page struct?
>>>   
>>>       
>> There's a host page struct (that the guest know nothing about and cannot
>> touch), and optionally a guest page struct for each guest (that the host and
>> the other guest know nothing about).
>>     
>
> Ok so if two linux guests want to share memory three page structs are 
> involved:
>
> 1. Host page struct
> 2. Guest #1 page struct
> 3. Guest #2 page struct
>
> I can understand that 1 and 2 point to the same physical page. Even all 
> three could point to the same page if the page is readonly. 
>
> However, lets say that Guest #1 allocates some anonymous memory and wants
> to share it with Guest #2. In that case something like PFNMAP is likely
> going to be used? Or are you remapping the physical page so that #1 and #2 
> share it? In that case two page struct describe state of the same physical
> page and we have no effective synchronization for writeback etc.
>
>   

Like I said, out of the box Linux doesn't support using memory that is 
shared with other instances as main memory.  One usage  (by the s390 
folk) was to put a read-only filesystem with execute-in-place support on 
this memory, and so reduce the memory usage of guests.

>> The host page struct may disappear if the host decides to swap the page into
>> its backing store and free the page.  The guest page structs (if any) would
>> remain.
>>     
>
> Page structs never disappear. The pte's may disappear and the page may be 
> unmapped from an address space of a process but the page struct stays. 
> Page struct can only disappear if memory hotplug is activated and memory 
> is taken out of the system.
>   

Yes, that was poorly phrased.  The page and its page struct may be 
reallocated for other purposes.

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kvm-devel] mmu notifiers
  2008-01-16  7:39                       ` Avi Kivity
@ 2008-01-16 18:08                         ` Christoph Lameter
  0 siblings, 0 replies; 23+ messages in thread
From: Christoph Lameter @ 2008-01-16 18:08 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel, linux-mm, Daniel J Blueman, Andrea Arcangeli

On Wed, 16 Jan 2008, Avi Kivity wrote:

> Yes, that was poorly phrased.  The page and its page struct may be reallocated
> for other purposes.

It's better to say "reused". Otherwise one may think that an allocation of 
page structs is needed.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2008-01-16 18:08 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-01-09 18:19 mmu notifiers Andrea Arcangeli
2008-01-09 21:54 ` Christoph Lameter
2008-01-10 11:44   ` [kvm-devel] " Avi Kivity
2008-01-10 13:16     ` Robin Holt
2008-01-10 13:27       ` Avi Kivity
2008-01-10 14:50         ` Robin Holt
2008-01-10 19:06         ` Christoph Lameter
2008-01-12 19:56           ` Avi Kivity
2008-01-10 19:04     ` Christoph Lameter
2008-01-12 19:51       ` Avi Kivity
2008-01-13 12:09         ` Robin Holt
2008-01-13 12:28           ` Avi Kivity
2008-01-14 19:51             ` Christoph Lameter
2008-01-15  7:43               ` Avi Kivity
2008-01-14 19:49         ` Christoph Lameter
2008-01-15  7:38           ` Avi Kivity
2008-01-15 17:39             ` Christoph Lameter
2008-01-15 17:53               ` Avi Kivity
2008-01-15 17:57                 ` Christoph Lameter
2008-01-15 18:06                   ` Avi Kivity
2008-01-15 18:16                     ` Christoph Lameter
2008-01-16  7:39                       ` Avi Kivity
2008-01-16 18:08                         ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox