This patch allows a process's stack to be backed by huge pages on request. As the stack is setup at exec() time, a personality flag is added to indicate the use of a hugepage-backed stack. The personality flag is inherited across exec(). Huge page stacks require stack randomization to be disabled because huge ptes are not movable, so the HUGE_PAGE_STACK personality flag implies ADDR_NO_RANDOMIZE. When the hugetlb file is setup to back the stack, it is sized to fit the ulimit for stack size or 256 MB if ulimit is unlimited. The GROWSUP and GROWSDOWN VM flags are turned off because a hugetlb backed vma is not resizable, so it will be appropriately sized when created. When a process exceeds stack size it receives a segfault exactly as it would if it exceeded the ulimit. Based on 2.6.25 Signed-off-by: Eric Munson --- fs/exec.c | 87 ++++++++++++++++++++++++++++++++++++++---- include/linux/personality.h | 3 + 2 files changed, 81 insertions(+), 9 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b152029..d38ddf0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -60,6 +61,8 @@ #include #endif +#define MB (1024*1024) + int core_uses_pid; char core_pattern[CORENAME_MAX_SIZE] = "core"; int suid_dumpable = 0; @@ -152,6 +155,13 @@ exit: goto out; } +static unsigned long personality_page_align(unsigned long addr) +{ + if (get_personality & HUGE_PAGE_STACK) + return HPAGE_ALIGN(addr); + return PAGE_ALIGN(addr); +} + #ifdef CONFIG_MMU static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, @@ -173,7 +183,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; if (write) { - unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + /* + * Args are always placed at the high end of the stack space + * so this calculation will give the proper size and it is + * compatible with huge page stacks. 
+ */ + unsigned long size = bprm->vma->vm_end - pos; struct rlimit *rlim; /* @@ -219,16 +234,57 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } +static struct file *hugetlb_stack_file(int stack_hpages) +{ + struct file *hugefile = NULL; + + if (!stack_hpages) { + set_personality(get_personality & (~HUGE_PAGE_STACK)); + printk(KERN_DEBUG + "Stack rlimit set too low for huge page backed stack.\n"); + return NULL; + } + + hugefile = hugetlb_file_setup(HUGETLB_STACK_FILE, + HPAGE_SIZE * stack_hpages, 0); + if (unlikely(IS_ERR_VALUE(hugefile))) { + /* + * If huge pages are not available for this stack, fall + * back to normal pages for execution instead of + * failing. + */ + printk(KERN_DEBUG + "Huge page backed stack unavailable for process %lu.\n", + (unsigned long)current->pid); + set_personality(get_personality & (~HUGE_PAGE_STACK)); + return NULL; + } + return hugefile; +} + static int __bprm_mm_init(struct linux_binprm *bprm) { int err = -ENOMEM; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; + struct file *hugefile = NULL; + struct rlimit *rlim; + int stack_hpages = 0; bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) goto err; + if (get_personality & HUGE_PAGE_STACK) { + rlim = current->signal->rlim; + if (rlim[RLIMIT_STACK].rlim_cur == _STK_LIM_MAX) + stack_hpages = (256 * MB) / HPAGE_SIZE; + else + stack_hpages = rlim[RLIMIT_STACK].rlim_cur / HPAGE_SIZE; + + hugefile = hugetlb_stack_file(stack_hpages); + } + down_write(&mm->mmap_sem); vma->vm_mm = mm; @@ -239,9 +295,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * configured yet. 
*/ vma->vm_end = STACK_TOP_MAX; - vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS; + + if (hugefile) { + vma->vm_flags &= ~(VM_GROWSUP|VM_GROWSDOWN); + vma->vm_file = hugefile; + vma->vm_flags |= VM_HUGETLB; + /* Stack randomization is not supported on huge pages */ + set_personality(get_personality | ADDR_NO_RANDOMIZE); + vma->vm_start = vma->vm_end - (HPAGE_SIZE * stack_hpages); + } else { + vma->vm_start = vma->vm_end - PAGE_SIZE; + } + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) { @@ -593,13 +660,12 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); - stack_top = PAGE_ALIGN(stack_top); + stack_top = personality_page_align(stack_top); stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif - if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; @@ -633,14 +699,17 @@ int setup_arg_pages(struct linux_binprm *bprm, } } + if (!(get_personality & HUGE_PAGE_STACK)) { #ifdef CONFIG_STACK_GROWSUP - stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; + stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE; #else - stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; + stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE; #endif - ret = expand_stack(vma, stack_base); - if (ret) - ret = -EFAULT; + + ret = expand_stack(vma, stack_base); + if (ret) + ret = -EFAULT; + } out_unlock: up_write(&mm->mmap_sem); diff --git a/include/linux/personality.h b/include/linux/personality.h index 012cd55..6ecebdf 100644 --- a/include/linux/personality.h +++ b/include/linux/personality.h @@ -22,6 +22,9 @@ extern int __set_personality(unsigned long); * These occupy the top three bytes. 
*/ enum { + HUGE_PAGE_STACK = 0x0020000, /* Attempt to use a huge page + * for the process stack + */ ADDR_NO_RANDOMIZE = 0x0040000, /* disable randomization of VA space */ FDPIC_FUNCPTRS = 0x0080000, /* userspace function ptrs point to descriptors * (signal handling)