diff -Naur linux-2.5.46/kernel/ksyms.c linux-2.5.46-ioe/kernel/ksyms.c
--- linux-2.5.46/kernel/ksyms.c	Tue Nov  5 21:32:53 2002
+++ linux-2.5.46-ioe/kernel/ksyms.c	Thu Nov  7 09:30:15 2002
@@ -134,6 +134,8 @@
 EXPORT_SYMBOL(page_address);
 #endif
 EXPORT_SYMBOL(get_user_pages);
+EXPORT_SYMBOL(get_one_user_page);
+EXPORT_SYMBOL_GPL(walk_user_pages);
 
 /* filesystem internal functions */
 EXPORT_SYMBOL(def_blk_fops);
diff -Naur linux-2.5.46/include/linux/mm.h linux-2.5.46-ioe/include/linux/mm.h
--- linux-2.5.46/include/linux/mm.h	Tue Nov  5 21:32:53 2002
+++ linux-2.5.46-ioe/include/linux/mm.h	Tue Nov  5 21:43:26 2002
@@ -373,9 +373,47 @@
 extern int sys_remap_file_pages(unsigned long start, unsigned long size,
 	unsigned long prot, unsigned long pgoff, unsigned long nonblock);
 
+/*** Page walking API ***/
+
+/* &custom_page_walker - a custom page walk handler for walk_user_pages().
+ * @vma: The vma whose pages we walk.
+ * @page: The page we found, or an %ERR_PTR() value.
+ * @virt_addr: The virtual address we are at while walking.
+ * @customdata: Anything else you would like to pass along.
+ *
+ * Returns:
+ *	negative value	-> errno value.
+ *	0		-> continue page walking.
+ *	1		-> abort page walking.
+ *
+ * If this function gets a page for which %IS_ERR(@page) is true, then it
+ * should do its cleanup of @customdata and return PTR_ERR(@page).
+ *
+ * This function is called with @vma->vm_mm->page_table_lock held,
+ * unless IS_ERR(@vma) is true.
+ *
+ * If IS_ERR(@vma) is true, then IS_ERR(@page) is also true, since without
+ * a vma we cannot have a user space page either.
+ *
+ * If this function returns a negative value, it must itself drop the
+ * page_table_lock, if that lock is held.
+ */
+typedef int (*custom_page_walker_t)(struct vm_area_struct *vma,
+		struct page *page, unsigned long virt_addr, void *customdata);
+
 extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write);
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
-		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+
+struct page *get_one_user_page(struct task_struct *tsk,
+		struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long start, int write, int force);
+
+int walk_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int write, int force,
+		custom_page_walker_t walker, void *customdata);
+
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write, int force,
+		struct page **pages, struct vm_area_struct **vmas);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
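
For illustration, here is a minimal sketch of a custom walker under this
API. It is not part of the patch; dump_walker, struct dump_state and their
fields are invented names. The sketch follows the contract documented
above: return 0 to continue, 1 to stop early, and for a page where
IS_ERR() is true, drop the page_table_lock if it is held and hand the
errno back to walk_user_pages():

/* Hypothetical example walker: log and count resolved pages,
 * stopping after a fixed budget of pages.
 */
struct dump_state {
	unsigned long seen;
	unsigned long budget;
};

static int dump_walker(struct vm_area_struct *vma, struct page *page,
		unsigned long virt_addr, void *customdata)
{
	struct dump_state *ds = customdata;

	if (IS_ERR(page)) {
		/* Per the API: drop the lock if we hold it, then
		 * pass the errno up to walk_user_pages().
		 */
		if (!IS_ERR(vma))
			spin_unlock(&vma->vm_mm->page_table_lock);
		return PTR_ERR(page);
	}

	printk(KERN_DEBUG "page at %#lx\n", virt_addr);
	ds->seen++;

	/* 0 -> keep walking, 1 -> satisfied, abort the walk */
	return (ds->seen < ds->budget) ? 0 : 1;
}
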
diff -Naur linux-2.5.46/mm/memory.c linux-2.5.46-ioe/mm/memory.c
--- linux-2.5.46/mm/memory.c	Tue Nov  5 21:32:53 2002
+++ linux-2.5.46-ioe/mm/memory.c	Tue Nov  5 22:43:38 2002
@@ -35,6 +35,10 @@
  * 16.07.99  - Support of BIGMEM added by Gerhard Wichert, Siemens AG
  *		(Gerhard.Wichert@pdb.siemens.de)
  */
+/* 04.11.02 - Page walker API added by Ingo Oeser
+ *
+ * Thanks go to Andrew Morton for his initial idea and general help.
+ */
 
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
@@ -516,20 +520,205 @@
  * it? This may become more complex in the future if we start dealing
  * with IO-aperture pages for direct-IO.
  */
-
 static inline struct page *get_page_map(struct page *page)
 {
 	if (!pfn_valid(page_to_pfn(page)))
-		return 0;
+		return ERR_PTR(-EFAULT);
 	return page;
 }
 
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int len, int write, int force,
-		struct page **pages, struct vm_area_struct **vmas)
+/* Simple page walk handler adding pages to a list of them */
+struct gup_add_pages {
+	unsigned int count;
+	unsigned int max_pages;
+	struct page **pages;
+};
+
+static inline void gup_pages_cleanup(struct gup_add_pages *gup)
+{
+	while (gup->count--) {
+		page_cache_release(gup->pages[gup->count]);
+	}
+}
+
+/* Follows the custom_page_walker API description */
+static int gup_add_pages(struct vm_area_struct *vma, struct page *page,
+		unsigned long virt_addr, void *customdata)
+{
+	struct gup_add_pages *gup = customdata;
+
+	BUG_ON(!customdata);
+
+	if (!IS_ERR(page)) {
+		gup->pages[gup->count++] = page;
+		flush_dcache_page(page);
+		if (!PageReserved(page))
+			page_cache_get(page);
+
+		/* Abort if we cannot hold more pages */
+		return (gup->count == gup->max_pages) ? 1 : 0;
+	}
+
+	if (!IS_ERR(vma))
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	gup_pages_cleanup(gup);
+	return PTR_ERR(page);
+}
+
+#define OBSOLETE_PAGE_WALKER
+#ifdef OBSOLETE_PAGE_WALKER
+/* Obsolete page walk handler adding pages and vmas to lists of them */
+struct gup_add_pv {
+	unsigned int page_count;
+	unsigned int max_pages;
+	struct page **pages;
+	unsigned int vma_count;
+	unsigned int max_vmas;
+	struct vm_area_struct **vmas;
+};
+
+static inline void gup_pv_cleanup(struct gup_add_pv *gup)
+{
+	while (gup->page_count--) {
+		page_cache_release(gup->pages[gup->page_count]);
+	}
+}
+
+/* Follows the custom_page_walker API description */
+static int gup_add_pv(struct vm_area_struct *vma, struct page *page,
+		unsigned long virt_addr, void *customdata)
+{
+	struct gup_add_pv *gup = customdata;
+	int ret = 0;
+
+	BUG_ON(!customdata);
+
+	if (!IS_ERR(page)) {
+		if (gup->vmas) {
+			/* Add the vma only if it is a new one.  Since we
+			 * walk them uniquely, this simple check is
+			 * enough. -ioe
+			 */
+			if (!gup->vma_count
+			    || gup->vmas[gup->vma_count - 1] != vma) {
+				gup->vmas[gup->vma_count++] = vma;

+
+				/* Abort scanning, if we cannot hold more */
+				if (gup->vma_count == gup->max_vmas)
+					ret = 1;
+			}
+		}
+
+		if (gup->pages) {
+			gup->pages[gup->page_count++] = page;
+			flush_dcache_page(page);
+			if (!PageReserved(page))
+				page_cache_get(page);
+
+			/* Abort scanning, if we cannot hold more */
+			if (gup->page_count == gup->max_pages)
+				ret = 1;
+		}
+		return ret;
+	}
+
+	if (!IS_ERR(vma))
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	gup_pv_cleanup(gup);
+	return PTR_ERR(page);
+}
+#endif /* OBSOLETE_PAGE_WALKER */
+
+/* Try to fault in the page at @start.  Returns a valid page or an
+ * ERR_PTR() value.
+ *
+ * Called with mm->page_table_lock held.
+ */
+static struct page *single_page_walk(struct task_struct *tsk,
+		struct mm_struct *mm,
+		struct vm_area_struct *vma,
+		unsigned long start, int write)
+{
+	struct page *map;
+
+	while (!(map = follow_page(mm, start, write))) {
+		int fault;
+
+		spin_unlock(&mm->page_table_lock);
+		fault = handle_mm_fault(mm, vma, start, write);
+		spin_lock(&mm->page_table_lock);
+
+		switch (fault) {
+		case VM_FAULT_MINOR:
+			tsk->min_flt++;
+			break;
+		case VM_FAULT_MAJOR:
+			tsk->maj_flt++;
+			break;
+		case VM_FAULT_SIGBUS:
+			return ERR_PTR(-EFAULT);
+		case VM_FAULT_OOM:
+			return ERR_PTR(-ENOMEM);
+		default:
+			/* FIXME: Is this unlock better or worse here? -ioe */
+			spin_unlock(&mm->page_table_lock);
+			BUG();
+		}
+	}
+	return get_page_map(map);
+}
+
+/* @vma already contains @start,
+ * i.e. find_extend_vma(mm, start) has been called successfully already.
+ */
+struct page *get_one_user_page(struct task_struct *tsk,
+		struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long start, int write, int force)
+{
+	unsigned int flags;
+	struct page *page;
+
+	/*
+	 * Require read or write permissions.
+	 * If 'force' is set, we only require the "MAY" flags.
+	 */
+	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+	if (!vma || (vma->vm_flags & VM_IO) || !(flags & vma->vm_flags))
+		return ERR_PTR(-EFAULT);
+
+	/* FIXME: Huge pages are not handled properly, yet. -ioe */
+	/*
+	if (is_vm_hugetlb_page(vma)) {
+		int len = 1;
+		int i;
+		i = follow_hugetlb_page(mm, vma, &page, NULL,
+					&start, &len, 0);
+		return (i == 1) ? page : ERR_PTR(-EFAULT);
+	}
+	*/
+
+	spin_lock(&mm->page_table_lock);
+	page = single_page_walk(tsk, mm, vma, start, write);
+
+	if (!(IS_ERR(page) || PageReserved(page)))
+		page_cache_get(page);
+
+	spin_unlock(&mm->page_table_lock);
+	return page;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+#error This code is not suitable for huge pages yet.
+#endif
+
+/* Returns 0 or a negative errno value */
+int walk_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int write, int force,
+		custom_page_walker_t walker, void *customdata)
 {
-	int i;
 	unsigned int flags;
 
 	/*
@@ -538,66 +727,103 @@
 	 */
 	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
-	i = 0;
 
 	do {
-		struct vm_area_struct * vma;
+		struct vm_area_struct *vma;
 
 		vma = find_extend_vma(mm, start);
 
-		if (!vma || (pages && (vma->vm_flags & VM_IO))
-				|| !(flags & vma->vm_flags))
-			return i ? : -EFAULT;
-
-		if (is_vm_hugetlb_page(vma)) {
-			i = follow_hugetlb_page(mm, vma, pages, vmas,
-						&start, &len, i);
-			continue;
-		}
-		spin_lock(&mm->page_table_lock);
-		do {
-			struct page *map;
-			while (!(map = follow_page(mm, start, write))) {
-				spin_unlock(&mm->page_table_lock);
-				switch (handle_mm_fault(mm,vma,start,write)) {
-				case VM_FAULT_MINOR:
-					tsk->min_flt++;
-					break;
-				case VM_FAULT_MAJOR:
-					tsk->maj_flt++;
-					break;
-				case VM_FAULT_SIGBUS:
-					return i ? i : -EFAULT;
-				case VM_FAULT_OOM:
-					return i ? i : -ENOMEM;
-				default:
-					BUG();
-				}
-				spin_lock(&mm->page_table_lock);
-			}
-			if (pages) {
-				pages[i] = get_page_map(map);
-				if (!pages[i]) {
-					spin_unlock(&mm->page_table_lock);
-					while (i--)
-						page_cache_release(pages[i]);
-					i = -EFAULT;
-					goto out;
-				}
-				flush_dcache_page(pages[i]);
-				if (!PageReserved(pages[i]))
-					page_cache_get(pages[i]);
-			}
-			if (vmas)
-				vmas[i] = vma;
-			i++;
-			start += PAGE_SIZE;
-			len--;
-		} while(len && start < vma->vm_end);
+		if (!vma || (vma->vm_flags & VM_IO)
+		    || !(flags & vma->vm_flags))
+			return walker(ERR_PTR(-EFAULT), ERR_PTR(-EFAULT),
+				      start, customdata);
+
+		/* FIXME: Huge pages are not handled, yet. -ioe */
+		/*
+		if (is_vm_hugetlb_page(vma)) {
+			int i = 0;
+			i = follow_hugetlb_page(mm, vma, pages, vmas,
+						&start, &len, i);
+			continue;
+		}
+		*/
+		spin_lock(&mm->page_table_lock);
+		do {
+			int ret;
+			struct page *page;
+
+			page = single_page_walk(tsk, mm, vma, start, write);
+			ret = walker(vma, page, start, customdata);
+			switch (ret) {
+			/* Common case -> continue walking. */
+			case 0:
+				break;
+
+			/* We are satisfied with our walking. */
+			case 1:
+				ret = 0;
+				spin_unlock(&mm->page_table_lock);
+				/* Fall through now */
+
+			/* Bail out because of an error. */
+			default:
+				/* Error cases unlock themselves,
+				 * if necessary. -ioe */
+				return ret;
+			}
+			start += PAGE_SIZE;
+		} while (start < vma->vm_end);
 		spin_unlock(&mm->page_table_lock);
-	} while(len);
-out:
-	return i;
+	} while (1);
+
+	/* We will never reach this code, but it keeps GCC happy */
+	return 0;
+}
+
+/* Ugly for now, but the defines and the union will go away later. -ioe */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write, int force,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	int ret;
+	custom_page_walker_t walker = gup_add_pages;
+	union {
+		struct gup_add_pages pg;
+#ifdef OBSOLETE_PAGE_WALKER
+		struct gup_add_pv pv;
+#endif
+	} gup_u;
+
+	memset(&gup_u, 0, sizeof(gup_u));
+
+#ifdef OBSOLETE_PAGE_WALKER
+	if (vmas) {
+		gup_u.pv.vmas = vmas;
+		gup_u.pv.max_vmas = len;
+		walker = gup_add_pv;
+		printk(KERN_WARNING "Obsolete argument \"vmas\" used!"
+		       " Please send this report to linux-mm@vger.kernel.org"
+		       " or fix the caller. Stack trace follows...\n");
+		WARN_ON(vmas);
+	}
+#else
+	/* FIXME: Or should we simply ignore it? -ioe */
+	BUG_ON(vmas);
+#endif
+
+	/* Warn on nonsense calls, but process them. -ioe */
+	WARN_ON(!vmas && !pages);
+
+	if (pages) {
+		gup_u.pg.max_pages = len;
+		gup_u.pg.pages = pages;
+	}
+
+	ret = walk_user_pages(tsk, mm, start, write, force, walker, &gup_u);
+	if (ret == 0)
+		ret = gup_u.pg.count;
+
+	return ret;
 }
 
 static void zeromap_pte_range(pte_t * pte, unsigned long address,
@@ -1309,10 +1535,29 @@
 	return pmd_offset(pgd, address);
 }
 
+
+/* A page walker which just counts down how many pages it got */
+static int gup_mk_present(struct vm_area_struct *vma, struct page *page,
+		unsigned long virt_addr, void *customdata)
+{
+	int *todo = customdata;
+
+	if (!IS_ERR(page)) {
+		(*todo)--;
+		/* Abort if we have made all required pages present */
+		return (*todo) ? 0 : 1;
+	}
+
+	if (!IS_ERR(vma))
+		spin_unlock(&vma->vm_mm->page_table_lock);
+	return PTR_ERR(page);
+}
+
 int make_pages_present(unsigned long addr, unsigned long end)
 {
 	int ret, len, write;
-	struct vm_area_struct * vma;
+	struct vm_area_struct *vma;
 
 	vma = find_vma(current->mm, addr);
 	write = (vma->vm_flags & VM_WRITE) != 0;
@@ -1320,10 +1565,18 @@
 		BUG();
 	if (end > vma->vm_end)
 		BUG();
-	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
-	ret = get_user_pages(current, current->mm, addr,
-			len, write, 0, NULL, NULL);
-	return ret == len ? 0 : -1;
+	len = (end + PAGE_SIZE - 1) / PAGE_SIZE - addr / PAGE_SIZE;
+
+	/* This is necessary for gup_mk_present to work, and
+	 * also a slight optimization. -ioe
+	 */
+	if (len == 0)
+		return 0;
+
+	ret = walk_user_pages(current, current->mm, addr,
+			write, 0, gup_mk_present, &len);
+
+	return (ret == 0 && len == 0) ? 0 : -1;
 }
 
 /*
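
To sketch how a caller would use the two new entry points, here is a
purely illustrative fragment. pin_example() and its locals are invented
names, and dump_walker/struct dump_state are the hypothetical walker
sketched after the mm.h hunk above. As with get_user_pages(), the caller
must hold mm->mmap_sem across the walk; the sketch takes it itself:

/* Hypothetical caller: walk a user buffer read-only, then pin and
 * release a single page of it.
 */
static int pin_example(unsigned long uaddr, unsigned long npages)
{
	struct dump_state ds = { 0, npages };
	struct vm_area_struct *vma;
	struct page *page;
	int ret;

	down_read(&current->mm->mmap_sem);

	/* Walk npages pages starting at uaddr; write = 0, force = 0.
	 * dump_walker stops the walk once its budget is used up.
	 */
	ret = walk_user_pages(current, current->mm, uaddr,
			      0, 0, dump_walker, &ds);

	/* Grab one page; the vma must already cover uaddr. */
	vma = find_extend_vma(current->mm, uaddr);
	page = get_one_user_page(current, current->mm, vma,
				 uaddr, 0, 0);
	if (!IS_ERR(page) && !PageReserved(page))
		page_cache_release(page);	/* drop the reference we took */

	up_read(&current->mm->mmap_sem);

	return IS_ERR(page) ? PTR_ERR(page) : ret;
}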