linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH 2/2] Add huge page backed stack support
@ 2008-05-02  1:51 Eric B Munson
  2008-05-02 17:11 ` Dave Hansen
  2008-05-02 17:15 ` Dave Hansen
  0 siblings, 2 replies; 6+ messages in thread
From: Eric B Munson @ 2008-05-02  1:51 UTC (permalink / raw)
  To: linux-mm; +Cc: nacc, mel, andyw

[-- Attachment #1: Type: text/plain, Size: 6364 bytes --]

This patch allows a process's stack to be backed by huge pages on request. As
the stack is set up at exec() time, a personality flag is added to indicate
the use of a hugepage-backed stack. The personality flag is inherited across 
exec().

Huge page stacks require stack randomization to be disabled because huge
ptes are not movable, so the HUGE_PAGE_STACK personality flag implies
ADDR_NO_RANDOMIZE.  When the hugetlb file is set up to back the stack, it is
sized to fit the ulimit for stack size or 256 MB if ulimit is unlimited.
The GROWSUP and GROWSDOWN VM flags are turned off because a hugetlb backed
vma is not resizable, so it will be appropriately sized when created.  When
a process exceeds stack size it receives a segfault exactly as it would if it
exceeded the ulimit.

Based on 2.6.25

Signed-off-by: Eric Munson <ebmunson@us.ibm.com>

---

 fs/exec.c                   |   87 ++++++++++++++++++++++++++++++++++++++----
 include/linux/personality.h |    3 +
 2 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index b152029..d38ddf0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
+#include <linux/hugetlb.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -60,6 +61,8 @@
 #include <linux/kmod.h>
 #endif
 
+#define MB (1024*1024)
+
 int core_uses_pid;
 char core_pattern[CORENAME_MAX_SIZE] = "core";
 int suid_dumpable = 0;
@@ -152,6 +155,13 @@ exit:
 	goto out;
 }
 
+static unsigned long personality_page_align(unsigned long addr)
+{
+	if (get_personality & HUGE_PAGE_STACK)
+		return HPAGE_ALIGN(addr);
+	return PAGE_ALIGN(addr);
+}
+
 #ifdef CONFIG_MMU
 
 static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
@@ -173,7 +183,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 		return NULL;
 
 	if (write) {
-		unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
+		/*
+		 * Args are always placed at the high end of the stack space
+		 * so this calculation will give the proper size and it is
+		 * compatible with huge page stacks.
+		 */
+		unsigned long size = bprm->vma->vm_end - pos;
 		struct rlimit *rlim;
 
 		/*
@@ -219,16 +234,57 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
 }
 
+static struct file *hugetlb_stack_file(int stack_hpages)
+{
+	struct file *hugefile = NULL;
+
+	if (!stack_hpages) {
+		set_personality(get_personality & (~HUGE_PAGE_STACK));
+		printk(KERN_DEBUG
+			"Stack rlimit set too low for huge page backed stack.\n");
+		return NULL;
+	}
+
+	hugefile = hugetlb_file_setup(HUGETLB_STACK_FILE,
+					HPAGE_SIZE * stack_hpages, 0);
+	if (unlikely(IS_ERR_VALUE(hugefile))) {
+		/*
+		 * If huge pages are not available for this stack,
+		 * fall back to normal pages for execution instead of
+		 * failing.
+		 */
+		printk(KERN_DEBUG
+			"Huge page backed stack unavailable for process %lu.\n",
+			(unsigned long)current->pid);
+		set_personality(get_personality & (~HUGE_PAGE_STACK));
+		return NULL;
+	}
+	return hugefile;
+}
+
 static int __bprm_mm_init(struct linux_binprm *bprm)
 {
 	int err = -ENOMEM;
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
+	struct file *hugefile = NULL;
+	struct rlimit *rlim;
+	int stack_hpages = 0;
 
 	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma)
 		goto err;
 
+	if (get_personality & HUGE_PAGE_STACK) {
+		rlim = current->signal->rlim;
+		if (rlim[RLIMIT_STACK].rlim_cur == _STK_LIM_MAX)
+			stack_hpages = (256 * MB) / HPAGE_SIZE;
+		else
+			stack_hpages = rlim[RLIMIT_STACK].rlim_cur / HPAGE_SIZE;
+
+		hugefile = hugetlb_stack_file(stack_hpages);
+	}
+
 	down_write(&mm->mmap_sem);
 	vma->vm_mm = mm;
 
@@ -239,9 +295,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 * configured yet.
 	 */
 	vma->vm_end = STACK_TOP_MAX;
-	vma->vm_start = vma->vm_end - PAGE_SIZE;
 
 	vma->vm_flags = VM_STACK_FLAGS;
+
+	if (hugefile) {
+		vma->vm_flags &= ~(VM_GROWSUP|VM_GROWSDOWN);
+		vma->vm_file = hugefile;
+		vma->vm_flags |= VM_HUGETLB;
+		/* Stack randomization is not supported on huge pages */
+		set_personality(get_personality | ADDR_NO_RANDOMIZE);
+		vma->vm_start = vma->vm_end - (HPAGE_SIZE * stack_hpages);
+	} else {
+		vma->vm_start = vma->vm_end - PAGE_SIZE;
+	}
+
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	err = insert_vm_struct(mm, vma);
 	if (err) {
@@ -593,13 +660,12 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	bprm->p = vma->vm_end - stack_shift;
 #else
 	stack_top = arch_align_stack(stack_top);
-	stack_top = PAGE_ALIGN(stack_top);
+	stack_top = personality_page_align(stack_top);
 	stack_shift = vma->vm_end - stack_top;
 
 	bprm->p -= stack_shift;
 	mm->arg_start = bprm->p;
 #endif
-
 	if (bprm->loader)
 		bprm->loader -= stack_shift;
 	bprm->exec -= stack_shift;
@@ -633,14 +699,17 @@ int setup_arg_pages(struct linux_binprm *bprm,
 		}
 	}
 
+	if (!(get_personality & HUGE_PAGE_STACK)) {
 #ifdef CONFIG_STACK_GROWSUP
-	stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+		stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
 #else
-	stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+		stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
 #endif
-	ret = expand_stack(vma, stack_base);
-	if (ret)
-		ret = -EFAULT;
+
+		ret = expand_stack(vma, stack_base);
+		if (ret)
+			ret = -EFAULT;
+	}
 
 out_unlock:
 	up_write(&mm->mmap_sem);
diff --git a/include/linux/personality.h b/include/linux/personality.h
index 012cd55..6ecebdf 100644
--- a/include/linux/personality.h
+++ b/include/linux/personality.h
@@ -22,6 +22,9 @@ extern int		__set_personality(unsigned long);
  * These occupy the top three bytes.
  */
 enum {
+	HUGE_PAGE_STACK = 	0x0020000,	/* Attempt to use a huge page
+						 * for the process stack
+						 */
 	ADDR_NO_RANDOMIZE = 	0x0040000,	/* disable randomization of VA space */
 	FDPIC_FUNCPTRS =	0x0080000,	/* userspace function ptrs point to descriptors
 						 * (signal handling)


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC][PATCH 2/2] Add huge page backed stack support
  2008-05-02  1:51 [RFC][PATCH 2/2] Add huge page backed stack support Eric B Munson
@ 2008-05-02 17:11 ` Dave Hansen
  2008-05-02 17:20   ` Dave Hansen
  2008-05-02 17:15 ` Dave Hansen
  1 sibling, 1 reply; 6+ messages in thread
From: Dave Hansen @ 2008-05-02 17:11 UTC (permalink / raw)
  To: ebmunson; +Cc: linux-mm, nacc, mel, andyw

On Thu, 2008-05-01 at 18:51 -0700, Eric B Munson wrote:
> 
> +       if (!(get_personality & HUGE_PAGE_STACK)) {
>  #ifdef CONFIG_STACK_GROWSUP
> -       stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
> +               stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
>  #else
> -       stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
> +               stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
>  #endif
> -       ret = expand_stack(vma, stack_base);
> -       if (ret)
> -               ret = -EFAULT;
> +
> +               ret = expand_stack(vma, stack_base);
> +               if (ret)
> +                       ret = -EFAULT;
> +       }

Why don't huge page stacks need to be expanded like this?  With a large
EXTRA_STACK_VM_PAGES, you would surely need this, right?

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC][PATCH 2/2] Add huge page backed stack support
  2008-05-02  1:51 [RFC][PATCH 2/2] Add huge page backed stack support Eric B Munson
  2008-05-02 17:11 ` Dave Hansen
@ 2008-05-02 17:15 ` Dave Hansen
  2008-05-02 21:44   ` Eric B Munson
  1 sibling, 1 reply; 6+ messages in thread
From: Dave Hansen @ 2008-05-02 17:15 UTC (permalink / raw)
  To: ebmunson; +Cc: linux-mm, nacc, mel, andyw

On Thu, 2008-05-01 at 18:51 -0700, Eric B Munson wrote
> The GROWSUP and GROWSDOWN VM flags are turned off because a hugetlb backed
> vma is not resizable, so it will be appropriately sized when created.  When
> a process exceeds stack size it receives a segfault exactly as it would if it
> exceeded the ulimit.

This one is *really* subtle.  The segfault might behave like breaking a
ulimit.  But, unlike a ulimit, you can't really work around this
particular limitation very easily.

This will really suck for anyone that tries to use 64k huge pages on
powerpc, right?

Are you actually looking to get this included, or are you just trying to
play with this?  It is useful as a toy as-is, but I think you should
look at fixing stack growing before it gets merged anywhere.

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC][PATCH 2/2] Add huge page backed stack support
  2008-05-02 17:11 ` Dave Hansen
@ 2008-05-02 17:20   ` Dave Hansen
  2008-05-02 21:52     ` Eric B Munson
  0 siblings, 1 reply; 6+ messages in thread
From: Dave Hansen @ 2008-05-02 17:20 UTC (permalink / raw)
  To: ebmunson; +Cc: linux-mm, nacc, mel, andyw

On Fri, 2008-05-02 at 10:11 -0700, Dave Hansen wrote:
> Why don't huge page stacks need to be expanded like this?  With a large
> EXTRA_STACK_VM_PAGES, you would surely need this, right?

Never mind.  You don't expand stacks.  This one is probably worth a
comment.

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC][PATCH 2/2] Add huge page backed stack support
  2008-05-02 17:15 ` Dave Hansen
@ 2008-05-02 21:44   ` Eric B Munson
  0 siblings, 0 replies; 6+ messages in thread
From: Eric B Munson @ 2008-05-02 21:44 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-mm, nacc, mel, andyw

[-- Attachment #1: Type: text/plain, Size: 1759 bytes --]

On Fri, 2008-05-02 at 10:15 -0700, Dave Hansen wrote:
> On Thu, 2008-05-01 at 18:51 -0700, Eric B Munson wrote
> > The GROWSUP and GROWSDOWN VM flags are turned off because a hugetlb backed
> > vma is not resizable, so it will be appropriately sized when created.  When
> > a process exceeds stack size it receives a segfault exactly as it would if it
> > exceeded the ulimit.
> 
> This one is *really* subtle.  The segfault might behave like breaking a
> ulimit.  But, unlike a ulimit, you can't really work around this
> particular limitation very easily.

I must have not articulated the way things are working well enough.  The
vma that is created for the process stack is sized to hold ulimit /
HPAGE_SIZE huge pages if ulimit is not unlimited.  If ulimit is
unlimited it holds 256MB / HPAGE_SIZE pages.  256MB was picked because
it is a decent compromise between large stacks and leaving some of a 32
bit address space available.  The segfault is as easily solved as
adjusting the ulimit for stack size.  If ulimit is raised the stack vma
will be bigger to match.  So it does behave exactly as base page stacks
would when you exceed the ulimit for stack size.

> 
> This will really suck for anyone that tries to use 64k huge pages on
> powerpc, right?

Can you expand on this some?  I am not sure what you are getting at.

> 
> Are you actually looking to get this included, or are you just trying to
> play with this?  It is useful as a toy as-is, but I think you should
> look at fixing stack growing before it gets merged anywhere.

I am looking for comments and, eventually, to get this merged.  What would
it take to get something along these lines merged?  Is anyone completely
opposed, and if so, why?

> 
> -- Dave
> 


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC][PATCH 2/2] Add huge page backed stack support
  2008-05-02 17:20   ` Dave Hansen
@ 2008-05-02 21:52     ` Eric B Munson
  0 siblings, 0 replies; 6+ messages in thread
From: Eric B Munson @ 2008-05-02 21:52 UTC (permalink / raw)
  To: Dave Hansen; +Cc: linux-mm, nacc, mel, andyw

[-- Attachment #1: Type: text/plain, Size: 407 bytes --]

On Fri, 2008-05-02 at 10:20 -0700, Dave Hansen wrote:
> On Fri, 2008-05-02 at 10:11 -0700, Dave Hansen wrote:
> > Why don't huge page stacks need to be expanded like this?  With a large
> > EXTRA_STACK_VM_PAGES, you would surely need this, right?
> 
> Never mind.  You don't expand stacks.  This one is probably worth a
> comment.

Okay, I will add one for the next version.

> 
> -- Dave
> 


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-05-02 21:52 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-05-02  1:51 [RFC][PATCH 2/2] Add huge page backed stack support Eric B Munson
2008-05-02 17:11 ` Dave Hansen
2008-05-02 17:20   ` Dave Hansen
2008-05-02 21:52     ` Eric B Munson
2008-05-02 17:15 ` Dave Hansen
2008-05-02 21:44   ` Eric B Munson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox