From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Tue, 14 Mar 2000 18:13:19 -0500 (EST)
From: Chuck Lever
Subject: [PATCH] madvise() against 2.3.52-3
Message-ID:
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII
Sender: owner-linux-mm@kvack.org
Return-Path:
To: Linus Torvalds
Cc: linux-mm@kvack.org
List-ID:

linus-

here's madvise.  comments welcome and appreciated.

i'm not sure the madvise_dontneed implementation is quite correct.  after
reviewing the recent discussion on linux-mm and some of the logic in
filemap_sync, i decided that the two different desirable behaviors are:

1.  free clean pages, flush dirty pages - already available as
    msync(MS_INVALIDATE)

2.  free clean and dirty pages, discarding changes - now available as
    madvise(MADV_DONTNEED)

i think this gets us both behaviors jamie lokier advocated recently.
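for anyone who wants to poke at the two behaviors side by side, here is a
rough userspace sketch (illustration only, not part of the patch; the file
name is made up, and it assumes a libc that already exposes madvise() --
otherwise see the syscall(2) sketch near the end of this mail):

/* illustration only -- not part of the patch */
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * 4096;
	int fd = open("scratch.dat", O_RDWR);	/* hypothetical test file */
	char *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	p[0] = 1;	/* dirty the first page */

	/* behavior 1: write dirty pages back, free the clean ones */
	msync(p, len, MS_INVALIDATE);

	/* behavior 2: drop the pages, dirty or not, without writing back */
	madvise(p, len, MADV_DONTNEED);

	munmap(p, len);
	close(fd);
	return 0;
}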
diff -ruN Linux-2.3.52-3/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S
--- Linux-2.3.52-3/arch/i386/kernel/entry.S	Tue Mar 14 14:04:20 2000
+++ linux/arch/i386/kernel/entry.S	Tue Mar 14 16:07:46 2000
@@ -639,6 +639,7 @@
 	.long SYMBOL_NAME(sys_setfsgid)
 	.long SYMBOL_NAME(sys_pivot_root)
 	.long SYMBOL_NAME(sys_mincore)
+	.long SYMBOL_NAME(sys_madvise)


 /*
@@ -647,6 +648,6 @@
 	 * entries. Don't panic if you notice that this hasn't
 	 * been shrunk every time we add a new system call.
	 */
-	.rept NR_syscalls-218
+	.rept NR_syscalls-219
 	.long SYMBOL_NAME(sys_ni_syscall)
 	.endr
diff -ruN Linux-2.3.52-3/include/asm-i386/mman.h linux/include/asm-i386/mman.h
--- Linux-2.3.52-3/include/asm-i386/mman.h	Mon Oct 7 01:55:48 1996
+++ linux/include/asm-i386/mman.h	Tue Mar 14 17:12:00 2000
@@ -25,6 +25,12 @@
 #define MCL_CURRENT	1		/* lock all current mappings */
 #define MCL_FUTURE	2		/* lock all future mappings */

+#define MADV_NORMAL	0x0		/* default page-in behavior */
+#define MADV_RANDOM	0x1		/* page-in minimum required */
+#define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
+#define MADV_WILLNEED	0x3		/* pre-fault pages */
+#define MADV_DONTNEED	0x4		/* discard these pages */
+
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
 #define MAP_FILE	0
diff -ruN Linux-2.3.52-3/include/asm-i386/unistd.h linux/include/asm-i386/unistd.h
--- Linux-2.3.52-3/include/asm-i386/unistd.h	Tue Mar 14 14:04:27 2000
+++ linux/include/asm-i386/unistd.h	Tue Mar 14 17:13:06 2000
@@ -223,6 +223,8 @@
 #define __NR_setfsgid32		216
 #define __NR_pivot_root		217
 #define __NR_mincore		218
+#define __NR_madvise		219
+#define __NR_madvise1		219	/* delete when C lib stub is removed */

 /* user-visible error numbers are in the range -1 - -124: see */
diff -ruN Linux-2.3.52-3/include/linux/mm.h linux/include/linux/mm.h
--- Linux-2.3.52-3/include/linux/mm.h	Sun Mar 12 18:42:36 2000
+++ linux/include/linux/mm.h	Tue Mar 14 17:49:44 2000
@@ -60,6 +60,7 @@
 	struct vm_operations_struct * vm_ops;
 	unsigned long vm_pgoff;	/* offset in PAGE_SIZE units, *not* PAGE_CACHE_SIZE */
 	struct file * vm_file;
+	unsigned long vm_raend;
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 };

@@ -83,10 +84,19 @@
 #define VM_EXECUTABLE	0x00001000
 #define VM_LOCKED	0x00002000
-#define VM_IO		0x00004000	/* Memory mapped I/O or similar */
+#define VM_IO		0x00004000	/* Memory mapped I/O or similar */
+
+#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
+#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */

 #define VM_STACK_FLAGS	0x00000177

+#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
+#define VM_ClearReadHint(v)		(v)->vm_flags &= ~VM_READHINTMASK
+#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
+#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
+#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
@@ -105,7 +115,6 @@
 	void (*unmap)(struct vm_area_struct *area, unsigned long, size_t);
 	void (*protect)(struct vm_area_struct *area, unsigned long, size_t, unsigned int newprot);
 	int (*sync)(struct vm_area_struct *area, unsigned long, size_t, unsigned int flags);
-	void (*advise)(struct vm_area_struct *area, unsigned long, size_t, unsigned int advise);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int write_access);
 	struct page * (*wppage)(struct vm_area_struct * area, unsigned long address, struct page * page);
 	int (*swapout)(struct page *, struct file *);
diff -ruN Linux-2.3.52-3/mm/filemap.c linux/mm/filemap.c
--- Linux-2.3.52-3/mm/filemap.c	Tue Mar 14 14:04:28 2000
+++ linux/mm/filemap.c	Tue Mar 14 18:02:26 2000
@@ -25,6 +25,7 @@
 #include
 #include
+#include

 #include
@@ -1294,6 +1295,61 @@
 }

 /*
+ * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
+ * sure this is sequential access, we don't need a flexible read-ahead
+ * window size -- we can always use a large fixed size window.
+ */
+static void nopage_sequential_readahead(struct vm_area_struct * vma,
+	unsigned long pgoff, unsigned long filesize)
+{
+	unsigned long ra_window;
+
+	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
+	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
+
+	/* vm_raend is zero if we haven't read ahead in this area yet. */
+	if (vma->vm_raend == 0)
+		vma->vm_raend = vma->vm_pgoff + ra_window;
+
+	/*
+	 * If we've just faulted the page half-way through our window,
+	 * then schedule reads for the next window, and release the
+	 * pages in the previous window.
+	 */
+	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
+		unsigned long start = vma->vm_pgoff + vma->vm_raend;
+		unsigned long end = start + ra_window;
+
+		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
+			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
+		if (start > end)
+			return;
+
+		while ((start < end) && (start < filesize)) {
+			if (read_cluster_nonblocking(vma->vm_file,
+							start, filesize) < 0)
+				break;
+			start += CLUSTER_PAGES;
+		}
+		run_task_queue(&tq_disk);
+
+		/* if we're far enough past the beginning of this area,
+		   recycle pages that are in the previous window. */
+		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
+			unsigned long window = ra_window << PAGE_SHIFT;
+
+			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
+			end -= window + window;
+			filemap_sync(vma, end - window, window, MS_INVALIDATE);
+		}
+
+		vma->vm_raend += ra_window;
+	}
+
+	return;
+}
+
+/*
  * filemap_nopage() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
  *
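to make the window arithmetic in nopage_sequential_readahead() easier to
follow, here is a rough user-space model of the intended behavior
(illustration only; offsets are pages relative to the start of the mapping,
vm_pgoff is taken as zero, and the window size is made up):

/* illustration only -- a user-space model of the window arithmetic above */
#include <stdio.h>

#define RA_WINDOW	8	/* pages; made-up value for the example */

int main(void)
{
	unsigned long raend = 0;	/* models vma->vm_raend, vm_pgoff == 0 */
	unsigned long pgoff;

	for (pgoff = 0; pgoff < 64; pgoff++) {
		if (raend == 0)
			raend = RA_WINDOW;

		/* half-way through the current window: read the next one */
		if (pgoff + (RA_WINDOW >> 1) == raend) {
			printf("fault at page %3lu: read ahead [%lu,%lu)",
			       pgoff, raend, raend + RA_WINDOW);

			/* far enough along: drop the window two behind us */
			if (raend > 2 * RA_WINDOW)
				printf(", release [%lu,%lu)",
				       raend - 3 * RA_WINDOW,
				       raend - 2 * RA_WINDOW);
			printf("\n");

			raend += RA_WINDOW;
		}
	}
	return 0;
}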
@@ -1339,6 +1395,12 @@
 		goto page_not_uptodate;

 success:
+	/*
+	 * Try read-ahead for sequential areas.
+	 */
+	if (VM_SequentialReadHint(area))
+		nopage_sequential_readahead(area, pgoff, size);
+
 	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
@@ -1355,7 +1417,7 @@
 		page_cache_release(page);
 		return new_page;
 	}
-	
+
 	flush_page_to_ram(old_page);
 	return old_page;

@@ -1367,7 +1429,7 @@
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
-	if (pgoff < size)
+	if ((pgoff < size) && !VM_RandomReadHint(area))
 		error = read_cluster_nonblocking(file, pgoff, size);
 	else
 		error = page_cache_read(file, pgoff);
@@ -1646,7 +1708,6 @@
 	return 0;
 }

-
 /*
  * The msync() system call.
  */
@@ -1723,6 +1784,351 @@
 	}
 out:
 	unlock_kernel();
+	up(&current->mm->mmap_sem);
+	return error;
+}
+
+static inline void setup_read_behavior(struct vm_area_struct * vma,
+	int behavior)
+{
+	VM_ClearReadHint(vma);
+	switch(behavior) {
+	case MADV_SEQUENTIAL:
+		vma->vm_flags |= VM_SEQ_READ;
+		break;
+	case MADV_RANDOM:
+		vma->vm_flags |= VM_RAND_READ;
+		break;
+	default:
+		break;
+	}
+	return;
+}
+
+static long madvise_fixup_start(struct vm_area_struct * vma,
+	unsigned long end, int behavior)
+{
+	struct vm_area_struct * n;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_end = end;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	vmlist_modify_lock(vma->vm_mm);
+	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
+	vma->vm_start = end;
+	insert_vm_struct(current->mm, n);
+	vmlist_modify_unlock(vma->vm_mm);
+	return 0;
+}
+
+static long madvise_fixup_end(struct vm_area_struct * vma,
+	unsigned long start, int behavior)
+{
+	struct vm_area_struct * n;
+
+	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!n)
+		return -EAGAIN;
+	*n = *vma;
+	n->vm_start = start;
+	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
+	setup_read_behavior(n, behavior);
+	n->vm_raend = 0;
+	get_file(n->vm_file);
+	if (n->vm_ops && n->vm_ops->open)
+		n->vm_ops->open(n);
+	vmlist_modify_lock(vma->vm_mm);
+	vma->vm_end = start;
+	insert_vm_struct(current->mm, n);
+	vmlist_modify_unlock(vma->vm_mm);
+	return 0;
+}
+
+static long madvise_fixup_middle(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int behavior)
+{
+	struct vm_area_struct * left, * right;
+
+	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!left)
+		return -EAGAIN;
+	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!right) {
+		kmem_cache_free(vm_area_cachep, left);
+		return -EAGAIN;
+	}
+	*left = *vma;
+	*right = *vma;
+	left->vm_end = start;
+	right->vm_start = end;
+	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
+	left->vm_raend = 0;
+	right->vm_raend = 0;
+	atomic_add(2, &vma->vm_file->f_count);
+
+	if (vma->vm_ops && vma->vm_ops->open) {
+		vma->vm_ops->open(left);
+		vma->vm_ops->open(right);
+	}
+	vmlist_modify_lock(vma->vm_mm);
+	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
+	vma->vm_start = start;
+	vma->vm_end = end;
+	setup_read_behavior(vma, behavior);
+	vma->vm_raend = 0;
+	insert_vm_struct(current->mm, left);
+	insert_vm_struct(current->mm, right);
+	vmlist_modify_unlock(vma->vm_mm);
+	return 0;
+}
+
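the fixup helpers above are easiest to see from user space: advising only
the middle page of a three-page anonymous mapping should leave the process
with three vma's where it had one.  a sketch (illustration only, libc
madvise() wrapper assumed); counting lines in /proc/self/maps is a crude way
to watch it happen, and exact counts will vary with the rest of the address
space:

/* illustration only -- watch a vma split when only the middle is advised */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static int count_maps(void)
{
	FILE *f = fopen("/proc/self/maps", "r");
	int c, lines = 0;

	if (!f)
		return -1;
	while ((c = fgetc(f)) != EOF)
		if (c == '\n')
			lines++;
	fclose(f);
	return lines;
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	printf("mappings before: %d\n", count_maps());

	/* middle page only: madvise_fixup_middle() should split the vma */
	madvise(p + page, page, MADV_SEQUENTIAL);

	printf("mappings after:  %d\n", count_maps());

	munmap(p, 3 * page);
	return 0;
}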
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, int behavior)
+{
+	int error = 0;
+
+	/* This caps the number of vma's this process can own */
+	if (vma->vm_mm->map_count > MAX_MAP_COUNT)
+		return -ENOMEM;
+
+	if (start == vma->vm_start) {
+		if (end == vma->vm_end) {
+			setup_read_behavior(vma, behavior);
+			vma->vm_raend = 0;
+		} else
+			error = madvise_fixup_start(vma, end, behavior);
+	} else {
+		if (end == vma->vm_end)
+			error = madvise_fixup_end(vma, start, behavior);
+		else
+			error = madvise_fixup_middle(vma, start, end, behavior);
+	}
+
+	return error;
+}
+
+/*
+ * Schedule all required I/O operations, then run the disk queue
+ * to make sure they are started.  Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end)
+{
+	long error = -EBADF;
+	struct file * file;
+	unsigned long size, rlim_rss;
+
+	/* Doesn't work if there's no mapped file. */
+	if (!vma->vm_file)
+		return error;
+	file = vma->vm_file;
+	size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
+							PAGE_CACHE_SHIFT;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	/* Make sure this doesn't exceed the process's max rss. */
+	error = -EIO;
+	rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
+		LONG_MAX;	/* default: see resource.h */
+	if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
+		return error;
+
+	/* round to cluster boundaries if this isn't a "random" area. */
+	if (!VM_RandomReadHint(vma)) {
+		start = CLUSTER_OFFSET(start);
+		end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
+
+		while ((start < end) && (start < size)) {
+			error = read_cluster_nonblocking(file, start, size);
+			start += CLUSTER_PAGES;
+			if (error < 0)
+				break;
+		}
+	} else {
+		while ((start < end) && (start < size)) {
+			error = page_cache_read(file, start);
+			start++;
+			if (error < 0)
+				break;
+		}
+	}
+
+	/* Don't wait for someone else to push these requests. */
+	run_task_queue(&tq_disk);
+
+	return error;
+}
+
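a typical use of the WILLNEED path, again illustration only (the file name
is made up, libc wrapper assumed): start the reads for a region you know
you'll walk shortly, and let the non-blocking I/O overlap with other work.

/* illustration only -- prefetch a region before walking it */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	int fd = open("index.db", O_RDONLY);	/* hypothetical file */
	char *p;
	long sum = 0;
	off_t i;

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* start the reads now; madvise() should return without waiting */
	madvise(p, st.st_size, MADV_WILLNEED);

	/* ... other setup work could overlap with the I/O here ... */

	for (i = 0; i < st.st_size; i++)
		sum += p[i];		/* pages are likely resident by now */

	printf("sum = %ld\n", sum);
	munmap(p, st.st_size);
	close(fd);
	return 0;
}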
+/*
+ * Application no longer needs these pages.  If the pages are dirty,
+ * it's OK to just throw them away.  The app will be more careful about
+ * data it wants to keep.  Be sure to free swap resources too.  The
+ * zap_page_range call sets things up for shrink_mmap to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * shrink_mmap to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do.  This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them.  There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end)
+{
+	if (vma->vm_flags & VM_LOCKED)
+		return -EINVAL;
+
+	lock_kernel();	/* is this really necessary? */
+
+	flush_cache_range(vma->vm_mm, start, end);
+	zap_page_range(vma->vm_mm, start, end - start);
+	flush_tlb_range(vma->vm_mm, start, end);
+
+	unlock_kernel();
+	return 0;
+}
+
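the discard semantics described above can be observed directly from user
space; a sketch (illustration only, libc wrapper assumed) -- on a kernel
that implements MADV_DONTNEED as a discard, the second printf should show
0 rather than 42:

/* illustration only -- dirty anonymous memory is thrown away */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	p[0] = 42;
	printf("before MADV_DONTNEED: %d\n", p[0]);

	madvise(p, page, MADV_DONTNEED);

	/* the old page is gone; the next touch maps a fresh zero page */
	printf("after  MADV_DONTNEED: %d\n", p[0]);

	munmap(p, page);
	return 0;
}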
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+	unsigned long end, int behavior)
+{
+	long error = -EBADF;
+
+	switch (behavior) {
+	case MADV_NORMAL:
+	case MADV_SEQUENTIAL:
+	case MADV_RANDOM:
+		error = madvise_behavior(vma, start, end, behavior);
+		break;
+
+	case MADV_WILLNEED:
+		error = madvise_willneed(vma, start, end);
+		break;
+
+	case MADV_DONTNEED:
+		error = madvise_dontneed(vma, start, end);
+		break;
+
+	default:
+		error = -EINVAL;
+		break;
+	}
+
+	return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area.  The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques.  The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ *  MADV_NORMAL - the default behavior is to read clusters.  This
+ *		results in some read-ahead and read-behind.
+ *  MADV_RANDOM - the system should read the minimum amount of data
+ *		on any access, since it is unlikely that the appli-
+ *		cation will need more than what it asks for.
+ *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ *		once, so they can be aggressively read ahead, and
+ *		can be freed soon after they are accessed.
+ *  MADV_WILLNEED - the application is notifying the system to read
+ *		some pages ahead.
+ *  MADV_DONTNEED - the application is finished with the given range,
+ *		so the kernel can free resources associated with it.
+ *
+ * return values:
+ *  zero    - success
+ *  -EINVAL - start + len < 0, start is not page-aligned,
+ *		"behavior" is not a valid value, or application
+ *		is attempting to release locked or shared pages.
+ *  -ENOMEM - addresses in the specified range are not currently
+ *		mapped, or are outside the AS of the process.
+ *  -EIO    - an I/O error occurred while paging in data.
+ *  -EBADF  - map exists, but area maps something that isn't a file.
+ *  -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+
+	down(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = madvise_vma(vma, start, end,
+							behavior);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = madvise_vma(vma, start, vma->vm_end, behavior);
+		if (error)
+			goto out;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
 	up(&current->mm->mmap_sem);
 	return error;
 }
diff -ruN Linux-2.3.52-3/mm/mlock.c linux/mm/mlock.c
--- Linux-2.3.52-3/mm/mlock.c	Mon Feb 7 22:45:28 2000
+++ linux/mm/mlock.c	Tue Mar 14 16:21:08 2000
@@ -31,6 +31,7 @@
 	*n = *vma;
 	n->vm_end = end;
 	n->vm_flags = newflags;
+	n->vm_raend = 0;
 	if (n->vm_file)
 		get_file(n->vm_file);
 	if (n->vm_ops && n->vm_ops->open)
@@ -55,6 +56,7 @@
 	n->vm_start = start;
 	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
 	n->vm_flags = newflags;
+	n->vm_raend = 0;
 	if (n->vm_file)
 		get_file(n->vm_file);
 	if (n->vm_ops && n->vm_ops->open)
@@ -85,6 +87,8 @@
 	right->vm_start = end;
 	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
 	vma->vm_flags = newflags;
+	left->vm_raend = 0;
+	right->vm_raend = 0;
 	if (vma->vm_file)
 		atomic_add(2, &vma->vm_file->f_count);

@@ -97,6 +101,7 @@
 	vma->vm_start = start;
 	vma->vm_end = end;
 	vma->vm_flags = newflags;
+	vma->vm_raend = 0;
 	insert_vm_struct(current->mm, left);
 	insert_vm_struct(current->mm, right);
 	vmlist_modify_unlock(vma->vm_mm);
diff -ruN Linux-2.3.52-3/mm/mmap.c linux/mm/mmap.c
--- Linux-2.3.52-3/mm/mmap.c	Sun Mar 12 18:42:48 2000
+++ linux/mm/mmap.c	Tue Mar 14 16:39:17 2000
@@ -249,6 +249,9 @@
 	vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;

 	if (file) {
+		VM_ClearReadHint(vma);
+		vma->vm_raend = 0;
+
 		if (file->f_mode & 1)
 			vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 		if (flags & MAP_SHARED) {
@@ -549,6 +552,7 @@
 		mpnt->vm_end = area->vm_end;
 		mpnt->vm_page_prot = area->vm_page_prot;
 		mpnt->vm_flags = area->vm_flags;
+		mpnt->vm_raend = 0;
 		mpnt->vm_ops = area->vm_ops;
 		mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT);
 		mpnt->vm_file = area->vm_file;
diff -ruN Linux-2.3.52-3/mm/mprotect.c linux/mm/mprotect.c
--- Linux-2.3.52-3/mm/mprotect.c	Mon Feb 7 22:45:28 2000
+++ linux/mm/mprotect.c	Tue Mar 14 16:25:33 2000
@@ -105,6 +105,7 @@
 	*n = *vma;
 	n->vm_end = end;
 	n->vm_flags = newflags;
+	n->vm_raend = 0;
 	n->vm_page_prot = prot;
 	if (n->vm_file)
 		get_file(n->vm_file);
@@ -131,6 +132,7 @@
 	n->vm_start = start;
 	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
 	n->vm_flags = newflags;
+	n->vm_raend = 0;
 	n->vm_page_prot = prot;
 	if (n->vm_file)
 		get_file(n->vm_file);
@@ -162,6 +164,8 @@
 	left->vm_end = start;
 	right->vm_start = end;
 	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
+	left->vm_raend = 0;
+	right->vm_raend = 0;
 	if (vma->vm_file)
 		atomic_add(2,&vma->vm_file->f_count);
 	if (vma->vm_ops && vma->vm_ops->open) {
@@ -173,6 +177,7 @@
 	vma->vm_start = start;
 	vma->vm_end = end;
 	vma->vm_flags = newflags;
+	vma->vm_raend = 0;
 	vma->vm_page_prot = prot;
 	insert_vm_struct(current->mm, left);
 	insert_vm_struct(current->mm, right);
diff -ruN Linux-2.3.52-3/mm/mremap.c linux/mm/mremap.c
--- Linux-2.3.52-3/mm/mremap.c	Sun Mar 12 18:42:16 2000
+++ linux/mm/mremap.c	Tue Mar 14 16:26:13 2000
@@ -135,8 +135,8 @@
 		*new_vma = *vma;
 		new_vma->vm_start = new_addr;
 		new_vma->vm_end = new_addr+new_len;
-		new_vma->vm_pgoff = vma->vm_pgoff;
 		new_vma->vm_pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+		new_vma->vm_raend = 0;
 		if (new_vma->vm_file)
 			get_file(new_vma->vm_file);
 		if (new_vma->vm_ops && new_vma->vm_ops->open)
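until the C library grows a stub (see the __NR_madvise1 note in the
unistd.h hunk), a test program can go through syscall(2); a sketch, with
the i386 syscall number and MADV_DONTNEED value taken from the hunks above:

/* illustration only -- raw madvise(2) until libc grows a wrapper */
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/mman.h>

#ifndef __NR_madvise
#define __NR_madvise	219	/* i386 value from the unistd.h hunk above */
#endif

#ifndef MADV_DONTNEED
#define MADV_DONTNEED	0x4	/* from the asm-i386/mman.h hunk above */
#endif

static long my_madvise(void *start, size_t len, int behavior)
{
	return syscall(__NR_madvise, start, len, behavior);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;
	return my_madvise(p, page, MADV_DONTNEED) ? 1 : 0;
}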
	- Chuck Lever
--
corporate:
personal:	 or
The Linux Scalability project:
	http://www.citi.umich.edu/projects/linux-scalability/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/