* [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable [not found] <1206028710.6690.21.camel@cotte.boeblingen.de.ibm.com> @ 2008-03-20 16:24 ` Carsten Otte, Martin Schwidefsky 2008-03-20 17:28 ` Jeremy Fitzhardinge 2008-03-20 16:24 ` [RFC/PATCH 02/15] preparation: host memory management changes for s390 kvm Carsten Otte, Heiko Carstens, Christian Borntraeger 1 sibling, 1 reply; 14+ messages in thread From: Carsten Otte, Martin Schwidefsky @ 2008-03-20 16:24 UTC (permalink / raw) To: virtualization, kvm-devel, Avi Kivity, Linux Memory Management List Cc: schwidefsky, heiko.carstens, os, borntraeger, hollisb, EHRHARDT, jeroney, aliguori, jblunck, rvdheij, rusty, arnd, Zhang, Xiantao The SIE instruction on s390 uses the 2nd half of the page table page to virtualize the storage keys of a guest. This patch offers the s390_enable_sie function, which reorganizes the page tables of a single-threaded process to reserve space in the page table: s390_enable_sie makes sure that the process is single threaded and then uses dup_mm to create a new mm with reorganized page tables. The old mm is freed and the process has now a page status extended field after every page table. Code that wants to exploit pgstes should SELECT CONFIG_PGSTE. This patch has a small common code hit, namely making dup_mm non-static. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com> Signed-off-by: Carsten Otte <cotte@de.ibm.com> --- arch/s390/Kconfig | 4 ++ arch/s390/kernel/setup.c | 4 ++ arch/s390/mm/pgtable.c | 55 ++++++++++++++++++++++++++++++++++++++--- include/asm-s390/mmu.h | 1 include/asm-s390/mmu_context.h | 8 +++++ include/asm-s390/pgtable.h | 1 kernel/fork.c | 2 - 7 files changed, 70 insertions(+), 5 deletions(-) Index: kvm/arch/s390/Kconfig =================================================================== --- kvm.orig/arch/s390/Kconfig +++ kvm/arch/s390/Kconfig @@ -55,6 +55,10 @@ config GENERIC_LOCKBREAK default y depends on SMP && PREEMPT +config PGSTE + bool + default y if KVM + mainmenu "Linux Kernel Configuration" config S390 Index: kvm/arch/s390/kernel/setup.c =================================================================== --- kvm.orig/arch/s390/kernel/setup.c +++ kvm/arch/s390/kernel/setup.c @@ -315,7 +315,11 @@ static int __init early_parse_ipldelay(c early_param("ipldelay", early_parse_ipldelay); #ifdef CONFIG_S390_SWITCH_AMODE +#ifdef CONFIG_PGSTE +unsigned int switch_amode = 1; +#else unsigned int switch_amode = 0; +#endif EXPORT_SYMBOL_GPL(switch_amode); static void set_amode_and_uaccess(unsigned long user_amode, Index: kvm/arch/s390/mm/pgtable.c =================================================================== --- kvm.orig/arch/s390/mm/pgtable.c +++ kvm/arch/s390/mm/pgtable.c @@ -30,11 +30,27 @@ #define TABLES_PER_PAGE 4 #define FRAG_MASK 15UL #define SECOND_HALVES 10UL + +void clear_table_pgstes(unsigned long *table) +{ + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); + memset(table + 256, 0, PAGE_SIZE/4); + clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); + memset(table + 768, 0, PAGE_SIZE/4); +} + #else #define ALLOC_ORDER 2 #define TABLES_PER_PAGE 2 #define FRAG_MASK 3UL #define SECOND_HALVES 2UL + +void clear_table_pgstes(unsigned long *table) +{ + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2); + memset(table + 256, 0, PAGE_SIZE/2); +} + #endif unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) @@ -153,7 +169,7 @@ unsigned long *page_table_alloc(struct m unsigned long *table; unsigned long bits; - bits = mm->context.noexec ? 
3UL : 1UL; + bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL; spin_lock(&mm->page_table_lock); page = NULL; if (!list_empty(&mm->context.pgtable_list)) { @@ -170,7 +186,10 @@ unsigned long *page_table_alloc(struct m pgtable_page_ctor(page); page->flags &= ~FRAG_MASK; table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); + if (mm->context.pgstes) + clear_table_pgstes(table); + else + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); spin_lock(&mm->page_table_lock); list_add(&page->lru, &mm->context.pgtable_list); } @@ -191,7 +210,7 @@ void page_table_free(struct mm_struct *m struct page *page; unsigned long bits; - bits = mm->context.noexec ? 3UL : 1UL; + bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL; bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); page = pfn_to_page(__pa(table) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); @@ -228,3 +247,33 @@ void disable_noexec(struct mm_struct *mm mm->context.noexec = 0; update_mm(mm, tsk); } + +struct mm_struct *dup_mm(struct task_struct *tsk); + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct task_struct *tsk = current; + struct mm_struct *mm; + + if (tsk->mm->context.pgstes) + return 0; + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) + return -EINVAL; + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ + mm = dup_mm(tsk); + tsk->mm->context.pgstes = 0; + if (!mm) + return -ENOMEM; + mmput(tsk->mm); + tsk->mm = tsk->active_mm = mm; + preempt_disable(); + update_mm(mm, tsk); + cpu_set(smp_processor_id(), mm->cpu_vm_mask); + preempt_enable(); + return 0; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); Index: kvm/include/asm-s390/mmu.h =================================================================== --- kvm.orig/include/asm-s390/mmu.h +++ kvm/include/asm-s390/mmu.h @@ -7,6 +7,7 @@ typedef struct { unsigned long asce_bits; unsigned long asce_limit; int noexec; + int pgstes; } mm_context_t; #endif Index: kvm/include/asm-s390/mmu_context.h =================================================================== --- kvm.orig/include/asm-s390/mmu_context.h +++ kvm/include/asm-s390/mmu_context.h @@ -20,7 +20,13 @@ static inline int init_new_context(struc #ifdef CONFIG_64BIT mm->context.asce_bits |= _ASCE_TYPE_REGION3; #endif - mm->context.noexec = s390_noexec; + if (current->mm->context.pgstes) { + mm->context.noexec = 0; + mm->context.pgstes = 1; + } else { + mm->context.noexec = s390_noexec; + mm->context.pgstes = 0; + } mm->context.asce_limit = STACK_TOP_MAX; crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); return 0; Index: kvm/include/asm-s390/pgtable.h =================================================================== --- kvm.orig/include/asm-s390/pgtable.h +++ kvm/include/asm-s390/pgtable.h @@ -966,6 +966,7 @@ static inline pte_t mk_swap_pte(unsigned extern int add_shared_memory(unsigned long start, unsigned long size); extern int remove_shared_memory(unsigned long start, unsigned long size); +extern int s390_enable_sie(void); /* * No page table caches to initialise Index: kvm/kernel/fork.c =================================================================== --- kvm.orig/kernel/fork.c +++ kvm/kernel/fork.c @@ -498,7 +498,7 @@ void mm_release(struct task_struct *tsk, * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. 
*/ -static struct mm_struct *dup_mm(struct task_struct *tsk) +struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err;
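For orientation, the hook above is meant to be called exactly once by the would-be hypervisor process, while it is still single-threaded, before it starts running guests under SIE. A minimal sketch of a caller (the function name and surrounding VM-creation plumbing are assumptions for illustration, not part of this patch):

	/* Hypothetical caller, e.g. early in a KVM create-VM path (sketch only). */
	static int example_create_vm(void)
	{
		int rc;

		/* must run while the process is still single-threaded */
		rc = s390_enable_sie();
		if (rc)
			return rc;	/* -EINVAL (multithreaded, ioctx) or -ENOMEM */

		/* ... allocate the VM and set up its SIE control blocks ... */
		return 0;
	}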
* Re: [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-20 16:24 ` [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable Carsten Otte, Martin Schwidefsky @ 2008-03-20 17:28 ` Jeremy Fitzhardinge 2008-03-20 19:13 ` Dave Hansen 0 siblings, 1 reply; 14+ messages in thread From: Jeremy Fitzhardinge @ 2008-03-20 17:28 UTC (permalink / raw) To: Carsten Otte Cc: virtualization, kvm-devel, Avi Kivity, Linux Memory Management List, aliguori, EHRHARDT, arnd, hollisb, heiko.carstens, jeroney, borntraeger, schwidefsky, rvdheij, os, jblunck, Zhang, Xiantao Carsten Otte wrote: > +struct mm_struct *dup_mm(struct task_struct *tsk); No prototypes in .c files. Put this in an appropriate header. J
* Re: [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-20 17:28 ` Jeremy Fitzhardinge @ 2008-03-20 19:13 ` Dave Hansen 2008-03-20 20:35 ` [kvm-devel] " Carsten Otte 0 siblings, 1 reply; 14+ messages in thread From: Dave Hansen @ 2008-03-20 19:13 UTC (permalink / raw) To: Jeremy Fitzhardinge Cc: Carsten Otte, virtualization, kvm-devel, Avi Kivity, Linux Memory Management List, aliguori, EHRHARDT, arnd, hollisb, heiko.carstens, jeroney, borntraeger, schwidefsky, rvdheij, os, jblunck, Zhang, Xiantao On Thu, 2008-03-20 at 10:28 -0700, Jeremy Fitzhardinge wrote: > Carsten Otte wrote: > > +struct mm_struct *dup_mm(struct task_struct *tsk); > > No prototypes in .c files. Put this in an appropriate header. Well, and more fundamentally: do we really want dup_mm() able to be called from other code? Maybe we need a bit more detailed justification why fork() itself isn't good enough. It looks to me like they basically need an arch-specific argument to fork, telling the new process's page tables to take the fancy new bit. I'm really curious how this new stuff is going to get used. Are you basically replacing fork() when creating kvm guests? -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-20 19:13 ` Dave Hansen @ 2008-03-20 20:35 ` Carsten Otte 2008-03-21 18:29 ` Dave Hansen 0 siblings, 1 reply; 14+ messages in thread From: Carsten Otte @ 2008-03-20 20:35 UTC (permalink / raw) To: Dave Hansen Cc: Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, borntrae, kvm-devel, heicars2, jeroney, Avi Kivity, virtualization, Linux Memory Management List, mschwid2, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao Dave Hansen wrote: > Well, and more fundamentally: do we really want dup_mm() able to be > called from other code? > > Maybe we need a bit more detailed justification why fork() itself isn't > good enough. It looks to me like they basically need an arch-specific > argument to fork, telling the new process's page tables to take the > fancy new bit. > > I'm really curious how this new stuff is going to get used. Are you > basically replacing fork() when creating kvm guests? No. The trick is that we do need bigger page tables when running guests: our page tables are usually 2k, but when running a guest they're 4k to track both guest and host dirty&reference information. This looks like this: *----------* *2k PTE's * *----------* *2k PGSTE * *----------* We don't want to waste precious memory for all page tables. We'd like to have one kernel image that runs regular server workload _and_ guests. Therefore, we need to reallocate the page table after fork() once we know that the task is going to be a hypervisor. That's what this code does: reallocate a bigger page table to accommodate the extra information. The task needs to be single-threaded when calling for extended page tables. Btw: at fork() time, we cannot tell whether or not the user's going to be a hypervisor. Therefore we cannot do this in fork.
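A note on the layout above: the status extension for a given pte sits one page-table length above the pte in the same page, which is how patch 02 reaches it with (ptep + PTRS_PER_PTE). A minimal sketch that makes the arithmetic explicit (the helper name is invented here; the real code open-codes it):

	/* Sketch only: locate the pgste that belongs to a pte.  With 256
	 * entries of 8 bytes on 64-bit this lands exactly 2k above the pte
	 * (1k on 31-bit, where the pte table itself is 1k). */
	static inline unsigned long *pgste_of(pte_t *ptep)
	{
		return (unsigned long *) (ptep + PTRS_PER_PTE);
	}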
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-20 20:35 ` [kvm-devel] " Carsten Otte @ 2008-03-21 18:29 ` Dave Hansen 2008-03-21 19:03 ` Carsten Otte ` (2 more replies) 0 siblings, 3 replies; 14+ messages in thread From: Dave Hansen @ 2008-03-21 18:29 UTC (permalink / raw) To: carsteno Cc: Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, borntrae, kvm-devel, heicars2, jeroney, Avi Kivity, virtualization, Linux Memory Management List, mschwid2, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao On Thu, 2008-03-20 at 21:35 +0100, Carsten Otte wrote: > Dave Hansen wrote: > > Well, and more fundamentally: do we really want dup_mm() able to be > > called from other code? > > > > Maybe we need a bit more detailed justification why fork() itself isn't > > good enough. It looks to me like they basically need an arch-specific > > argument to fork, telling the new process's page tables to take the > > fancy new bit. > > > > I'm really curious how this new stuff is going to get used. Are you > > basically replacing fork() when creating kvm guests? > No. The trick is, that we do need bigger page tables when running > guests: our page tables are usually 2k, but when running a guest > they're 4k to track both guest and host dirty&reference information. > This looks like this: > *----------* > *2k PTE's * > *----------* > *2k PGSTE * > *----------* > We don't want to waste precious memory for all page tables. We'd like > to have one kernel image that runs regular server workload _and_ > guests. That makes a lot of sense. Is that layout (the shadow and regular stacked together) specified in hardware somehow, or was it just chosen? What you've done with dup_mm() is probably the brute-force way that I would have done it had I just been trying to make a proof of concept or something. I'm worried that there are a bunch of corner cases that haven't been considered. What if someone else is poking around with ptrace or something similar and they bump the mm_users: + if (tsk->mm->context.pgstes) + return 0; + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) + return -EINVAL; -------->HERE + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ + mm = dup_mm(tsk); It'll race, possibly fault in some other pages, and those faults will be lost during the dup_mm(). I think you need to be able to lock out all of the users of access_process_vm() before you go and do this. You also need to make sure that anyone who has looked at task->mm doesn't go and get a reference to it and get confused later when it isn't the task->mm any more. > Therefore, we need to reallocate the page table after fork() > once we know that task is going to be a hypervisor. That's what this > code does: reallocate a bigger page table to accomondate the extra > information. The task needs to be single-threaded when calling for > extended page tables. > > Btw: at fork() time, we cannot tell whether or not the user's going to > be a hypervisor. Therefore we cannot do this in fork. Can you convert the page tables at a later time without doing a wholesale replacement of the mm? It should be a bit easier to keep people off the pagetables than keep their grubby mitts off the mm itself. -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-21 18:29 ` Dave Hansen @ 2008-03-21 19:03 ` Carsten Otte 2008-03-22 17:57 ` Heiko Carstens 2008-03-25 15:37 ` Carsten Otte 2 siblings, 0 replies; 14+ messages in thread From: Carsten Otte @ 2008-03-21 19:03 UTC (permalink / raw) To: Dave Hansen Cc: carsteno, Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, borntrae, kvm-devel, heicars2, jeroney, Avi Kivity, virtualization, Linux Memory Management List, mschwid2, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao Dave Hansen wrote: > On Thu, 2008-03-20 at 21:35 +0100, Carsten Otte wrote: >> Dave Hansen wrote: >>> Well, and more fundamentally: do we really want dup_mm() able to be >>> called from other code? >>> >>> Maybe we need a bit more detailed justification why fork() itself isn't >>> good enough. It looks to me like they basically need an arch-specific >>> argument to fork, telling the new process's page tables to take the >>> fancy new bit. >>> >>> I'm really curious how this new stuff is going to get used. Are you >>> basically replacing fork() when creating kvm guests? >> No. The trick is, that we do need bigger page tables when running >> guests: our page tables are usually 2k, but when running a guest >> they're 4k to track both guest and host dirty&reference information. >> This looks like this: >> *----------* >> *2k PTE's * >> *----------* >> *2k PGSTE * >> *----------* >> We don't want to waste precious memory for all page tables. We'd like >> to have one kernel image that runs regular server workload _and_ >> guests. > > That makes a lot of sense. > > Is that layout (the shadow and regular stacked together) specified in > hardware somehow, or was it just chosen? It's defined by hardware. The chip just adds +2k to the ptep to get to the corresponding pgste. Both pte and pgste are 64bit per page. I know Heiko and Martin have thought a lot about possible races. I'll have to leave your question on the race against pfault open for them. Btw: thanks a lot for reviewing our changes :-) cheers, Carsten -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-21 18:29 ` Dave Hansen 2008-03-21 19:03 ` Carsten Otte @ 2008-03-22 17:57 ` Heiko Carstens 2008-03-23 10:15 ` Avi Kivity 2008-03-25 15:37 ` Carsten Otte 2 siblings, 1 reply; 14+ messages in thread From: Heiko Carstens @ 2008-03-22 17:57 UTC (permalink / raw) To: Dave Hansen Cc: carsteno, Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, kvm-devel, mschwid2, heicars2, jeroney, borntrae, virtualization, Linux Memory Management List, Avi Kivity, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao > What you've done with dup_mm() is probably the brute-force way that I > would have done it had I just been trying to make a proof of concept or > something. I'm worried that there are a bunch of corner cases that > haven't been considered. > > What if someone else is poking around with ptrace or something similar > and they bump the mm_users: > > + if (tsk->mm->context.pgstes) > + return 0; > + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || > + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) > + return -EINVAL; > -------->HERE > + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ > + mm = dup_mm(tsk); > > It'll race, possibly fault in some other pages, and those faults will be > lost during the dup_mm(). I think you need to be able to lock out all > of the users of access_process_vm() before you go and do this. You also > need to make sure that anyone who has looked at task->mm doesn't go and > get a reference to it and get confused later when it isn't the task->mm > any more. > > > Therefore, we need to reallocate the page table after fork() > > once we know that task is going to be a hypervisor. That's what this > > code does: reallocate a bigger page table to accomondate the extra > > information. The task needs to be single-threaded when calling for > > extended page tables. > > > > Btw: at fork() time, we cannot tell whether or not the user's going to > > be a hypervisor. Therefore we cannot do this in fork. > > Can you convert the page tables at a later time without doing a > wholesale replacement of the mm? It should be a bit easier to keep > people off the pagetables than keep their grubby mitts off the mm > itself. Yes, as far as I can see you're right. And whatever we do in arch code, after all it's just a work around to avoid a new clone flag. If something like clone() with CLONE_KVM would be useful for more architectures than just s390 then maybe we should try to get a flag. Oh... there are just two unused clone flag bits left. Looks like the namespace changes ate up a lot of them lately. Well, we could still play dirty tricks like setting a bit in current via whatever mechanism which indicates child-wants-extended-page-tables and then just fork and be happy. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-22 17:57 ` Heiko Carstens @ 2008-03-23 10:15 ` Avi Kivity 2008-03-23 18:23 ` Martin Schwidefsky 0 siblings, 1 reply; 14+ messages in thread From: Avi Kivity @ 2008-03-23 10:15 UTC (permalink / raw) To: Heiko Carstens Cc: Dave Hansen, Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, Linux Memory Management List, carsteno, heicars2, mschwid2, jeroney, borntrae, virtualization, kvm-devel, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao Heiko Carstens wrote: >> What you've done with dup_mm() is probably the brute-force way that I >> would have done it had I just been trying to make a proof of concept or >> something. I'm worried that there are a bunch of corner cases that >> haven't been considered. >> >> What if someone else is poking around with ptrace or something similar >> and they bump the mm_users: >> >> + if (tsk->mm->context.pgstes) >> + return 0; >> + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || >> + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) >> + return -EINVAL; >> -------->HERE >> + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ >> + mm = dup_mm(tsk); >> >> It'll race, possibly fault in some other pages, and those faults will be >> lost during the dup_mm(). I think you need to be able to lock out all >> of the users of access_process_vm() before you go and do this. You also >> need to make sure that anyone who has looked at task->mm doesn't go and >> get a reference to it and get confused later when it isn't the task->mm >> any more. >> >> >>> Therefore, we need to reallocate the page table after fork() >>> once we know that task is going to be a hypervisor. That's what this >>> code does: reallocate a bigger page table to accomondate the extra >>> information. The task needs to be single-threaded when calling for >>> extended page tables. >>> >>> Btw: at fork() time, we cannot tell whether or not the user's going to >>> be a hypervisor. Therefore we cannot do this in fork. >>> >> Can you convert the page tables at a later time without doing a >> wholesale replacement of the mm? It should be a bit easier to keep >> people off the pagetables than keep their grubby mitts off the mm >> itself. >> > > Yes, as far as I can see you're right. And whatever we do in arch code, > after all it's just a work around to avoid a new clone flag. > If something like clone() with CLONE_KVM would be useful for more > architectures than just s390 then maybe we should try to get a flag. > > Oh... there are just two unused clone flag bits left. Looks like the > namespace changes ate up a lot of them lately. > > Well, we could still play dirty tricks like setting a bit in current > via whatever mechanism which indicates child-wants-extended-page-tables > and then just fork and be happy. > How about taking mmap_sem for write and converting all page tables in-place? I'd rather avoid the need to fork() when creating a VM. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-23 10:15 ` Avi Kivity @ 2008-03-23 18:23 ` Martin Schwidefsky 2008-03-24 6:57 ` Avi Kivity 0 siblings, 1 reply; 14+ messages in thread From: Martin Schwidefsky @ 2008-03-23 18:23 UTC (permalink / raw) To: Avi Kivity Cc: Heiko Carstens, Dave Hansen, Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, Linux Memory Management List, carsteno, heicars2, mschwid2, jeroney, borntrae, virtualization, kvm-devel, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao On Sun, 2008-03-23 at 12:15 +0200, Avi Kivity wrote: > >> Can you convert the page tables at a later time without doing a > >> wholesale replacement of the mm? It should be a bit easier to keep > >> people off the pagetables than keep their grubby mitts off the mm > >> itself. > >> > > > > Yes, as far as I can see you're right. And whatever we do in arch code, > > after all it's just a work around to avoid a new clone flag. > > If something like clone() with CLONE_KVM would be useful for more > > architectures than just s390 then maybe we should try to get a flag. > > > > Oh... there are just two unused clone flag bits left. Looks like the > > namespace changes ate up a lot of them lately. > > > > Well, we could still play dirty tricks like setting a bit in current > > via whatever mechanism which indicates child-wants-extended-page-tables > > and then just fork and be happy. > > > > How about taking mmap_sem for write and converting all page tables > in-place? I'd rather avoid the need to fork() when creating a VM. That was my initial approach as well. If all the page table allocations can be fullfilled the code is not too complicated. To handle allocation failures gets tricky. At this point I realized that dup_mmap already does what we want to do. It walks all the page tables, allocates new page tables and copies the ptes. In principle I would reinvent the wheel if we can not use dup_mmap. -- blue skies, Martin. "Reality continues to ruin my life." - Calvin. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-23 18:23 ` Martin Schwidefsky @ 2008-03-24 6:57 ` Avi Kivity 2008-03-25 6:08 ` Carsten Otte 0 siblings, 1 reply; 14+ messages in thread From: Avi Kivity @ 2008-03-24 6:57 UTC (permalink / raw) To: schwidefsky Cc: Heiko Carstens, Dave Hansen, Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, Linux Memory Management List, carsteno, heicars2, mschwid2, jeroney, borntrae, virtualization, kvm-devel, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao Martin Schwidefsky wrote: > On Sun, 2008-03-23 at 12:15 +0200, Avi Kivity wrote: > >>>> Can you convert the page tables at a later time without doing a >>>> wholesale replacement of the mm? It should be a bit easier to keep >>>> people off the pagetables than keep their grubby mitts off the mm >>>> itself. >>>> >>>> >>> Yes, as far as I can see you're right. And whatever we do in arch code, >>> after all it's just a work around to avoid a new clone flag. >>> If something like clone() with CLONE_KVM would be useful for more >>> architectures than just s390 then maybe we should try to get a flag. >>> >>> Oh... there are just two unused clone flag bits left. Looks like the >>> namespace changes ate up a lot of them lately. >>> >>> Well, we could still play dirty tricks like setting a bit in current >>> via whatever mechanism which indicates child-wants-extended-page-tables >>> and then just fork and be happy. >>> >>> >> How about taking mmap_sem for write and converting all page tables >> in-place? I'd rather avoid the need to fork() when creating a VM. >> > > That was my initial approach as well. If all the page table allocations > can be fullfilled the code is not too complicated. To handle allocation > failures gets tricky. At this point I realized that dup_mmap already > does what we want to do. It walks all the page tables, allocates new > page tables and copies the ptes. In principle I would reinvent the wheel > if we can not use dup_mmap Well, dup_mm() can't work (and now that I think about it, for more reasons -- what if the process has threads?). I don't think conversion is too bad. You'd need a four-level loop to allocate and convert, and another loop to deallocate in case of error. If, as I don't doubt, s390 hardware can modify the ptes, you'd need cmpxchg to read and clear a pte in one operation. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 14+ messages in thread
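The cmpxchg-based read-and-clear Avi mentions was not needed in the end (the series keeps dup_mm), but as a sketch of the primitive an in-place conversion would rest on (function name invented for illustration):

	/* Sketch only: atomically fetch and clear a pte so that a concurrent
	 * hardware update of the entry cannot be lost while its page table
	 * is being converted in place. */
	static pte_t ptep_fetch_and_clear(pte_t *ptep)
	{
		unsigned long old;

		do {
			old = pte_val(*ptep);
		} while (cmpxchg((unsigned long *) ptep, old, _PAGE_TYPE_EMPTY) != old);
		return __pte(old);
	}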
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-24 6:57 ` Avi Kivity @ 2008-03-25 6:08 ` Carsten Otte 2008-03-25 6:12 ` Avi Kivity 0 siblings, 1 reply; 14+ messages in thread From: Carsten Otte @ 2008-03-25 6:08 UTC (permalink / raw) To: Avi Kivity Cc: schwidefsky, Heiko Carstens, Dave Hansen, Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, Linux Memory Management List, carsteno, heicars2, mschwid2, jeroney, borntrae, virtualization, kvm-devel, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao Avi Kivity wrote: > Well, dup_mm() can't work (and now that I think about it, for more > reasons -- what if the process has threads?). We lock out multithreaded users already, -EINVAL.
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-25 6:08 ` Carsten Otte @ 2008-03-25 6:12 ` Avi Kivity 0 siblings, 0 replies; 14+ messages in thread From: Avi Kivity @ 2008-03-25 6:12 UTC (permalink / raw) To: carsteno Cc: schwidefsky, Heiko Carstens, Dave Hansen, Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, Linux Memory Management List, heicars2, mschwid2, jeroney, borntrae, virtualization, kvm-devel, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao Carsten Otte wrote: > Avi Kivity wrote: >> Well, dup_mm() can't work (and now that I think about it, for more >> reasons -- what if the process has threads?). > We lock out multithreaded users already, -EINVAL. Would be much better if this can be avoided. It's surprising. -- Any sufficiently difficult bug is indistinguishable from a feature.
* Re: [kvm-devel] [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable 2008-03-21 18:29 ` Dave Hansen 2008-03-21 19:03 ` Carsten Otte 2008-03-22 17:57 ` Heiko Carstens @ 2008-03-25 15:37 ` Carsten Otte 2 siblings, 0 replies; 14+ messages in thread From: Carsten Otte @ 2008-03-25 15:37 UTC (permalink / raw) To: Dave Hansen Cc: carsteno, Jeremy Fitzhardinge, Christian Ehrhardt, hollisb, arnd, borntrae, kvm-devel, heicars2, jeroney, Avi Kivity, virtualization, Linux Memory Management List, mschwid2, rvdheij, Olaf Schnapper, jblunck, Zhang, Xiantao Am Freitag, den 21.03.2008, 11:29 -0700 schrieb Dave Hansen: > What you've done with dup_mm() is probably the brute-force way that I > would have done it had I just been trying to make a proof of concept or > something. I'm worried that there are a bunch of corner cases that > haven't been considered. > > What if someone else is poking around with ptrace or something similar > and they bump the mm_users: > > + if (tsk->mm->context.pgstes) > + return 0; > + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || > + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) > + return -EINVAL; > -------->HERE > + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ > + mm = dup_mm(tsk); > > It'll race, possibly fault in some other pages, and those faults will be > lost during the dup_mm(). I think you need to be able to lock out all > of the users of access_process_vm() before you go and do this. You also > need to make sure that anyone who has looked at task->mm doesn't go and > get a reference to it and get confused later when it isn't the task->mm > any more. Good catch, Dave. We intend to get rid of that race via task_lock(). That should lock out ptrace and all others who modify mm_users via get_task_mm. 
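The reason task_lock() closes that window is that get_task_mm(), which ptrace and access_process_vm() use to take their mm_users reference, holds the same lock across the increment. A simplified rendering of that helper from kernel/fork.c of this era (trimmed for illustration, not part of the patch):

	/* Simplified get_task_mm(): the task lock is held across the mm_users
	 * increment, so a check of mm_users done under task_lock() in
	 * s390_enable_sie() cannot race with a new ptrace/procfs reference. */
	struct mm_struct *get_task_mm(struct task_struct *task)
	{
		struct mm_struct *mm;

		task_lock(task);
		mm = task->mm;
		if (mm && !(task->flags & PF_BORROWED_MM))
			atomic_inc(&mm->mm_users);
		else
			mm = NULL;
		task_unlock(task);
		return mm;
	}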
See patch below: --- arch/s390/Kconfig | 4 ++ arch/s390/kernel/setup.c | 4 ++ arch/s390/mm/pgtable.c | 65 +++++++++++++++++++++++++++++++++++++++-- include/asm-s390/mmu.h | 1 include/asm-s390/mmu_context.h | 8 ++++- include/asm-s390/pgtable.h | 1 include/linux/sched.h | 2 + kernel/fork.c | 2 - 8 files changed, 82 insertions(+), 5 deletions(-) Index: linux-host/arch/s390/Kconfig =================================================================== --- linux-host.orig/arch/s390/Kconfig +++ linux-host/arch/s390/Kconfig @@ -55,6 +55,10 @@ config GENERIC_LOCKBREAK default y depends on SMP && PREEMPT +config PGSTE + bool + default y if KVM + mainmenu "Linux Kernel Configuration" config S390 Index: linux-host/arch/s390/kernel/setup.c =================================================================== --- linux-host.orig/arch/s390/kernel/setup.c +++ linux-host/arch/s390/kernel/setup.c @@ -315,7 +315,11 @@ static int __init early_parse_ipldelay(c early_param("ipldelay", early_parse_ipldelay); #ifdef CONFIG_S390_SWITCH_AMODE +#ifdef CONFIG_PGSTE +unsigned int switch_amode = 1; +#else unsigned int switch_amode = 0; +#endif EXPORT_SYMBOL_GPL(switch_amode); static void set_amode_and_uaccess(unsigned long user_amode, Index: linux-host/arch/s390/mm/pgtable.c =================================================================== --- linux-host.orig/arch/s390/mm/pgtable.c +++ linux-host/arch/s390/mm/pgtable.c @@ -30,11 +30,27 @@ #define TABLES_PER_PAGE 4 #define FRAG_MASK 15UL #define SECOND_HALVES 10UL + +void clear_table_pgstes(unsigned long *table) +{ + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); + memset(table + 256, 0, PAGE_SIZE/4); + clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4); + memset(table + 768, 0, PAGE_SIZE/4); +} + #else #define ALLOC_ORDER 2 #define TABLES_PER_PAGE 2 #define FRAG_MASK 3UL #define SECOND_HALVES 2UL + +void clear_table_pgstes(unsigned long *table) +{ + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2); + memset(table + 256, 0, PAGE_SIZE/2); +} + #endif unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) @@ -153,7 +169,7 @@ unsigned long *page_table_alloc(struct m unsigned long *table; unsigned long bits; - bits = mm->context.noexec ? 3UL : 1UL; + bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL; spin_lock(&mm->page_table_lock); page = NULL; if (!list_empty(&mm->context.pgtable_list)) { @@ -170,7 +186,10 @@ unsigned long *page_table_alloc(struct m pgtable_page_ctor(page); page->flags &= ~FRAG_MASK; table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); + if (mm->context.pgstes) + clear_table_pgstes(table); + else + clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE); spin_lock(&mm->page_table_lock); list_add(&page->lru, &mm->context.pgtable_list); } @@ -191,7 +210,7 @@ void page_table_free(struct mm_struct *m struct page *page; unsigned long bits; - bits = mm->context.noexec ? 3UL : 1UL; + bits = (mm->context.noexec || mm->context.pgstes) ? 
3UL : 1UL; bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long); page = pfn_to_page(__pa(table) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); @@ -228,3 +247,43 @@ void disable_noexec(struct mm_struct *mm mm->context.noexec = 0; update_mm(mm, tsk); } + +/* + * switch on pgstes for its userspace process (for kvm) + */ +int s390_enable_sie(void) +{ + struct task_struct *tsk = current; + struct mm_struct *mm; + int rc; + + task_lock(tsk); + + rc = 0; + if (tsk->mm->context.pgstes) + goto unlock; + + rc = -EINVAL; + if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 || + tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) + goto unlock; + + tsk->mm->context.pgstes = 1; /* dirty little tricks .. */ + mm = dup_mm(tsk); + tsk->mm->context.pgstes = 0; + + rc = -ENOMEM; + if (!mm) + goto unlock; + mmput(tsk->mm); + tsk->mm = tsk->active_mm = mm; + preempt_disable(); + update_mm(mm, tsk); + cpu_set(smp_processor_id(), mm->cpu_vm_mask); + preempt_enable(); + rc = 0; +unlock: + task_unlock(tsk); + return rc; +} +EXPORT_SYMBOL_GPL(s390_enable_sie); Index: linux-host/include/asm-s390/mmu.h =================================================================== --- linux-host.orig/include/asm-s390/mmu.h +++ linux-host/include/asm-s390/mmu.h @@ -7,6 +7,7 @@ typedef struct { unsigned long asce_bits; unsigned long asce_limit; int noexec; + int pgstes; } mm_context_t; #endif Index: linux-host/include/asm-s390/mmu_context.h =================================================================== --- linux-host.orig/include/asm-s390/mmu_context.h +++ linux-host/include/asm-s390/mmu_context.h @@ -20,7 +20,13 @@ static inline int init_new_context(struc #ifdef CONFIG_64BIT mm->context.asce_bits |= _ASCE_TYPE_REGION3; #endif - mm->context.noexec = s390_noexec; + if (current->mm->context.pgstes) { + mm->context.noexec = 0; + mm->context.pgstes = 1; + } else { + mm->context.noexec = s390_noexec; + mm->context.pgstes = 0; + } mm->context.asce_limit = STACK_TOP_MAX; crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); return 0; Index: linux-host/include/asm-s390/pgtable.h =================================================================== --- linux-host.orig/include/asm-s390/pgtable.h +++ linux-host/include/asm-s390/pgtable.h @@ -966,6 +966,7 @@ static inline pte_t mk_swap_pte(unsigned extern int add_shared_memory(unsigned long start, unsigned long size); extern int remove_shared_memory(unsigned long start, unsigned long size); +extern int s390_enable_sie(void); /* * No page table caches to initialise Index: linux-host/kernel/fork.c =================================================================== --- linux-host.orig/kernel/fork.c +++ linux-host/kernel/fork.c @@ -498,7 +498,7 @@ void mm_release(struct task_struct *tsk, * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. 
*/ -static struct mm_struct *dup_mm(struct task_struct *tsk) +struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; Index: linux-host/include/linux/sched.h =================================================================== --- linux-host.orig/include/linux/sched.h +++ linux-host/include/linux/sched.h @@ -1758,6 +1758,8 @@ extern void mmput(struct mm_struct *); extern struct mm_struct *get_task_mm(struct task_struct *task); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +/* Allocate a new mm structure and copy contents from tsk->mm */ +extern struct mm_struct *dup_mm(struct task_struct *tsk); extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); extern void flush_thread(void);
* [RFC/PATCH 02/15] preparation: host memory management changes for s390 kvm [not found] <1206028710.6690.21.camel@cotte.boeblingen.de.ibm.com> 2008-03-20 16:24 ` [RFC/PATCH 01/15] preparation: provide hook to enable pgstes in user pagetable Carsten Otte, Martin Schwidefsky @ 2008-03-20 16:24 ` Carsten Otte, Heiko Carstens, Christian Borntraeger 1 sibling, 0 replies; 14+ messages in thread From: Carsten Otte, Heiko Carstens, Christian Borntraeger @ 2008-03-20 16:24 UTC (permalink / raw) To: virtualization, kvm-devel, Avi Kivity, Linux Memory Management List Cc: schwidefsky, heiko.carstens, os, borntraeger, hollisb, EHRHARDT, jeroney, aliguori, jblunck, rvdheij, rusty, arnd, Zhang, Xiantao This patch changes the s390 memory management defintions to use the pgste field for dirty and reference bit tracking of host and guest code. Usually on s390, dirty and referenced are tracked in storage keys, which belong to the physical page. This changes with virtualization: The guest and host dirty/reference bits are defined to be the logical OR of the values for the mapping and the physical page. This patch implements the necessary changes in pgtable.h for s390. There is a common code change in mm/rmap.c, the call to page_test_and_clear_young must be moved. This is a no-op for all architecture but s390. page_referenced checks the referenced bits for the physiscal page and for all mappings: o The physical page is checked with page_test_and_clear_young. o The mappings are checked with ptep_test_and_clear_young and friends. Without pgstes (the current implementation on Linux s390) the physical page check is implemented but the mapping callbacks are no-ops because dirty and referenced are not tracked in the s390 page tables. The pgstes introduces guest and host dirty and reference bits for s390 in the host mapping. These mapping must be checked before page_test_and_clear_young resets the reference bit. Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com> Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com> Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com> Signed-off-by: Carsten Otte <cotte@de.ibm.com> --- include/asm-s390/pgtable.h | 109 +++++++++++++++++++++++++++++++++++++++++++-- mm/rmap.c | 7 +- 2 files changed, 110 insertions(+), 6 deletions(-) Index: kvm/include/asm-s390/pgtable.h =================================================================== --- kvm.orig/include/asm-s390/pgtable.h +++ kvm/include/asm-s390/pgtable.h @@ -30,6 +30,7 @@ */ #ifndef __ASSEMBLY__ #include <linux/mm_types.h> +#include <asm/atomic.h> #include <asm/bug.h> #include <asm/processor.h> @@ -258,6 +259,13 @@ extern char empty_zero_page[PAGE_SIZE]; * swap pte is 1011 and 0001, 0011, 0101, 0111 are invalid. 
*/ +/* Page status extended for virtualization */ +#define _PAGE_RCP_PCL 0x0080000000000000UL +#define _PAGE_RCP_HR 0x0040000000000000UL +#define _PAGE_RCP_HC 0x0020000000000000UL +#define _PAGE_RCP_GR 0x0004000000000000UL +#define _PAGE_RCP_GC 0x0002000000000000UL + #ifndef __s390x__ /* Bits in the segment table address-space-control-element */ @@ -513,6 +521,67 @@ static inline int pte_file(pte_t pte) #define __HAVE_ARCH_PTE_SAME #define pte_same(a,b) (pte_val(a) == pte_val(b)) +static inline void rcp_lock(pte_t *ptep) +{ +#ifdef CONFIG_PGSTE + atomic64_t *rcp = (atomic64_t *) (ptep + PTRS_PER_PTE); + preempt_disable(); + atomic64_set_mask(_PAGE_RCP_PCL, rcp); +#endif +} + +static inline void rcp_unlock(pte_t *ptep) +{ +#ifdef CONFIG_PGSTE + atomic64_t *rcp = (atomic64_t *) (ptep + PTRS_PER_PTE); + atomic64_clear_mask(_PAGE_RCP_PCL, rcp); + preempt_enable(); +#endif +} + +static inline void rcp_set_bits(pte_t *ptep, unsigned long val) +{ +#ifdef CONFIG_PGSTE + *(unsigned long *) (ptep + PTRS_PER_PTE) |= val; +#endif +} + +static inline int rcp_test_and_clear_bits(pte_t *ptep, unsigned long val) +{ +#ifdef CONFIG_PGSTE + unsigned long ret; + + ret = *(unsigned long *) (ptep + PTRS_PER_PTE); + *(unsigned long *) (ptep + PTRS_PER_PTE) &= ~val; + return (ret & val) == val; +#else + return 0; +#endif +} + + +/* forward declaration for SetPageUptodate in page-flags.h*/ +static inline void page_clear_dirty(struct page *page); +#include <linux/page-flags.h> + +static inline void ptep_rcp_copy(pte_t *ptep) +{ +#ifdef CONFIG_PGSTE + struct page *page = virt_to_page(pte_val(*ptep)); + unsigned int skey; + + skey = page_get_storage_key(page_to_phys(page)); + if (skey & _PAGE_CHANGED) + rcp_set_bits(ptep, _PAGE_RCP_GC); + if (skey & _PAGE_REFERENCED) + rcp_set_bits(ptep, _PAGE_RCP_GR); + if (rcp_test_and_clear_bits(ptep, _PAGE_RCP_HC)) + SetPageDirty(page); + if (rcp_test_and_clear_bits(ptep, _PAGE_RCP_HR)) + SetPageReferenced(page); +#endif +} + /* * query functions pte_write/pte_dirty/pte_young only work if * pte_present() is true. Undefined behaviour if not.. 
@@ -599,6 +668,8 @@ static inline void pmd_clear(pmd_t *pmd) static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { + if (mm->context.pgstes) + ptep_rcp_copy(ptep); pte_val(*ptep) = _PAGE_TYPE_EMPTY; if (mm->context.noexec) pte_val(ptep[PTRS_PER_PTE]) = _PAGE_TYPE_EMPTY; @@ -667,6 +738,22 @@ static inline pte_t pte_mkyoung(pte_t pt static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_PGSTE + unsigned long physpage; + int young; + + if (!vma->vm_mm->context.pgstes) + return 0; + physpage = pte_val(*ptep) & PAGE_MASK; + + young = ((page_get_storage_key(physpage) & _PAGE_REFERENCED) != 0); + rcp_lock(ptep); + if (young) + rcp_set_bits(ptep, _PAGE_RCP_GR); + young |= rcp_test_and_clear_bits(ptep, _PAGE_RCP_HR); + rcp_unlock(ptep); + return young; +#endif return 0; } @@ -674,7 +761,13 @@ static inline int ptep_test_and_clear_yo static inline int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - /* No need to flush TLB; bits are in storage key */ + /* No need to flush TLB + * On s390 reference bits are in storage key and never in TLB + * With virtualization we handle the reference bit, without we + * we can simply return */ +#ifdef CONFIG_PGSTE + return ptep_test_and_clear_young(vma, address, ptep); +#endif return 0; } @@ -693,15 +786,25 @@ static inline void __ptep_ipte(unsigned : "=m" (*ptep) : "m" (*ptep), "a" (pto), "a" (address)); } - pte_val(*ptep) = _PAGE_TYPE_EMPTY; } static inline void ptep_invalidate(struct mm_struct *mm, unsigned long address, pte_t *ptep) { + if (mm->context.pgstes) { + rcp_lock(ptep); + __ptep_ipte(address, ptep); + ptep_rcp_copy(ptep); + pte_val(*ptep) = _PAGE_TYPE_EMPTY; + rcp_unlock(ptep); + return; + } __ptep_ipte(address, ptep); - if (mm->context.noexec) + pte_val(*ptep) = _PAGE_TYPE_EMPTY; + if (mm->context.noexec) { __ptep_ipte(address, ptep + PTRS_PER_PTE); + pte_val(*(ptep + PTRS_PER_PTE)) = _PAGE_TYPE_EMPTY; + } } /* Index: kvm/mm/rmap.c =================================================================== --- kvm.orig/mm/rmap.c +++ kvm/mm/rmap.c @@ -411,9 +411,6 @@ int page_referenced(struct page *page, i { int referenced = 0; - if (page_test_and_clear_young(page)) - referenced++; - if (TestClearPageReferenced(page)) referenced++; @@ -431,6 +428,10 @@ int page_referenced(struct page *page, i unlock_page(page); } } + + if (page_test_and_clear_young(page)) + referenced++; + return referenced; } -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 14+ messages in thread
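To spell out the ordering argument from the patch description: with pgstes the per-mapping young check itself reads the storage key (and folds in the _PAGE_RCP_HR software bit), so the physical-page check that clears the key has to come last. A condensed, illustrative version of the order page_referenced() ends up using for one mapping (not the actual implementation):

	/* Illustrative only: sample the per-mapping state before
	 * page_test_and_clear_young() resets the storage key, because with
	 * pgstes ptep_test_and_clear_young() derives part of its result
	 * from that same key. */
	static int example_referenced_order(struct page *page,
					    struct vm_area_struct *vma,
					    unsigned long addr, pte_t *ptep)
	{
		int referenced = 0;

		if (TestClearPageReferenced(page))
			referenced++;
		if (ptep_test_and_clear_young(vma, addr, ptep))	/* mapping first */
			referenced++;
		if (page_test_and_clear_young(page))		/* storage key last */
			referenced++;
		return referenced;
	}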