From: Christoph Hellwig <hch@lst.de>
To: Andrew Morton <akpm@zip.com.au>
Cc: linux-mm@kvack.org
Date: Mon, 2 Sep 2002 21:04:43 +0200
Message-ID: <20020902210443.A32010@lst.de>
In-Reply-To: <3D73B7F1.2EB3131E@zip.com.au>; from akpm@zip.com.au on Mon, Sep 02, 2002 at 12:11:45PM -0700
On Mon, Sep 02, 2002 at 12:11:45PM -0700, Andrew Morton wrote:
> Christoph Hellwig wrote:
> >
> > On Mon, Sep 02, 2002 at 11:50:33AM -0700, Andrew Morton wrote:
> > > Christoph Hellwig wrote:
> > > >
> > > > This patch was done after Linus requested it when I intended to split
> > > > madvise out of filemap.c. We extend splitvma() in mmap.c to take
> > > > another argument that specifies whether to split above or below the
> > > > given address, and can thus use it in those functions, cleaning them up
> > > > a lot and removing most of their code.
> > > >
> > >
> > > This description seems to have leaked from a different patch.
> > >
> > > Your patch purely shuffles code about, yes?
> >
> > No. It makes madvise/mlock/mprotect use split_vma (which evolved from
> > splitvma). There is no change in behaviour (verified by LTP test runs),
> > but the implementation is very different, and lots of code is gone.
>
> Did you send the right patch?
>
> mnm:/usr/src/25> grep split patches/madvise-move.patch
> - * We can potentially split a vm area into separate
> + * We can potentially split a vm area into separate
>
> mnm:/usr/src/25> diffstat patches/madvise-move.patch
> Makefile | 2
> filemap.c | 332 ------------------------------------------------------------
> madvise.c | 340 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 342 insertions(+), 332 deletions(-)
Sorry, that was the first patch I sent to Linus, before I made the changes
I described.
Here's the right one:
include/linux/mm.h | 7 -
mm/Makefile | 2
mm/filemap.c | 332 -----------------------------------------------------
mm/madvise.c | 238 +++++++++++++++++++++++++++++++++++++
mm/mlock.c | 158 ++++---------------------
mm/mmap.c | 37 +++--
mm/mprotect.c | 218 +++++++++++-----------------------
7 files changed, 365 insertions(+), 627 deletions(-)
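
For reference, the pattern that mlock, mprotect and madvise now share
looks roughly like this (a sketch for illustration only, not part of the
patch; the helper name change_vma_flags is made up):

	/*
	 * Narrow 'vma' down to exactly [start, end) by splitting off the
	 * pieces we don't want to touch, then update the flags on what is
	 * left.  split_vma() allocates one new vma per call; new_below
	 * selects whether the new vma gets the part below or above 'addr'.
	 */
	static int change_vma_flags(struct mm_struct * mm,
			struct vm_area_struct * vma, unsigned long start,
			unsigned long end, unsigned int newflags)
	{
		int error;

		if (start != vma->vm_start) {
			error = split_vma(mm, vma, start, 1);
			if (error)
				return error;
		}

		if (end != vma->vm_end) {
			error = split_vma(mm, vma, end, 0);
			if (error)
				return error;
		}

		spin_lock(&mm->page_table_lock);
		vma->vm_flags = newflags;
		spin_unlock(&mm->page_table_lock);
		return 0;
	}
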
--- 1.70/include/linux/mm.h Thu Aug 15 21:55:18 2002
+++ edited/include/linux/mm.h Sun Aug 18 16:14:27 2002
@@ -483,6 +483,7 @@
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
struct vm_area_struct **pprev);
+extern struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr);
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
NULL if none. Assume start_addr < end_addr. */
@@ -495,11 +496,11 @@
return vma;
}
-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
+extern int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+ unsigned long addr, int new_below);
extern struct page * vmalloc_to_page(void *addr);
extern unsigned long get_page_cache_size(void);
#endif /* __KERNEL__ */
-
-#endif
+#endif /* _LINUX_MM_H */
--- 1.12/mm/Makefile Tue Jul 16 23:46:26 2002
+++ edited/mm/Makefile Sun Aug 18 11:25:20 2002
@@ -16,6 +16,6 @@
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
- pdflush.o page-writeback.o rmap.o
+ pdflush.o page-writeback.o rmap.o madvise.o
include $(TOPDIR)/Rules.make
--- 1.127/mm/filemap.c Thu Aug 15 14:24:40 2002
+++ edited/mm/filemap.c Sun Aug 18 11:25:20 2002
@@ -1376,337 +1376,7 @@
return 0;
}
-static inline void setup_read_behavior(struct vm_area_struct * vma,
- int behavior)
-{
- VM_ClearReadHint(vma);
- switch(behavior) {
- case MADV_SEQUENTIAL:
- vma->vm_flags |= VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- vma->vm_flags |= VM_RAND_READ;
- break;
- default:
- break;
- }
- return;
-}
-
-static long madvise_fixup_start(struct vm_area_struct * vma,
- unsigned long end, int behavior)
-{
- struct vm_area_struct * n;
- struct mm_struct * mm = vma->vm_mm;
-
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -EAGAIN;
- *n = *vma;
- n->vm_end = end;
- setup_read_behavior(n, behavior);
- n->vm_raend = 0;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
- lock_vma_mappings(vma);
- spin_lock(&mm->page_table_lock);
- vma->vm_start = end;
- __insert_vm_struct(mm, n);
- spin_unlock(&mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-
-static long madvise_fixup_end(struct vm_area_struct * vma,
- unsigned long start, int behavior)
-{
- struct vm_area_struct * n;
- struct mm_struct * mm = vma->vm_mm;
-
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -EAGAIN;
- *n = *vma;
- n->vm_start = start;
- n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
- setup_read_behavior(n, behavior);
- n->vm_raend = 0;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- lock_vma_mappings(vma);
- spin_lock(&mm->page_table_lock);
- vma->vm_end = start;
- __insert_vm_struct(mm, n);
- spin_unlock(&mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-
-static long madvise_fixup_middle(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, int behavior)
-{
- struct vm_area_struct * left, * right;
- struct mm_struct * mm = vma->vm_mm;
-
- left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!left)
- return -EAGAIN;
- right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!right) {
- kmem_cache_free(vm_area_cachep, left);
- return -EAGAIN;
- }
- *left = *vma;
- *right = *vma;
- left->vm_end = start;
- right->vm_start = end;
- right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
- left->vm_raend = 0;
- right->vm_raend = 0;
- if (vma->vm_file)
- atomic_add(2, &vma->vm_file->f_count);
-
- if (vma->vm_ops && vma->vm_ops->open) {
- vma->vm_ops->open(left);
- vma->vm_ops->open(right);
- }
- vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
- vma->vm_raend = 0;
- lock_vma_mappings(vma);
- spin_lock(&mm->page_table_lock);
- vma->vm_start = start;
- vma->vm_end = end;
- setup_read_behavior(vma, behavior);
- __insert_vm_struct(mm, left);
- __insert_vm_struct(mm, right);
- spin_unlock(&mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-
-/*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
- */
-static long madvise_behavior(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, int behavior)
-{
- int error = 0;
-
- /* This caps the number of vma's this process can own */
- if (vma->vm_mm->map_count > MAX_MAP_COUNT)
- return -ENOMEM;
-
- if (start == vma->vm_start) {
- if (end == vma->vm_end) {
- setup_read_behavior(vma, behavior);
- vma->vm_raend = 0;
- } else
- error = madvise_fixup_start(vma, end, behavior);
- } else {
- if (end == vma->vm_end)
- error = madvise_fixup_end(vma, start, behavior);
- else
- error = madvise_fixup_middle(vma, start, end, behavior);
- }
-
- return error;
-}
-
-/*
- * Schedule all required I/O operations, then run the disk queue
- * to make sure they are started. Do not wait for completion.
- */
-static long madvise_willneed(struct vm_area_struct * vma,
- unsigned long start, unsigned long end)
-{
- long error = -EBADF;
- struct file * file;
- unsigned long size, rlim_rss;
-
- /* Doesn't work if there's no mapped file. */
- if (!vma->vm_file)
- return error;
- file = vma->vm_file;
- size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
-
- start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (end > vma->vm_end)
- end = vma->vm_end;
- end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-
- /* Make sure this doesn't exceed the process's max rss. */
- error = -EIO;
- rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
- LONG_MAX; /* default: see resource.h */
- if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
- return error;
-
- do_page_cache_readahead(file, start, end - start);
- return 0;
-}
-
-/*
- * Application no longer needs these pages. If the pages are dirty,
- * it's OK to just throw them away. The app will be more careful about
- * data it wants to keep. Be sure to free swap resources too. The
- * zap_page_range call sets things up for refill_inactive to actually free
- * these pages later if no one else has touched them in the meantime,
- * although we could add these pages to a global reuse list for
- * refill_inactive to pick up before reclaiming other pages.
- *
- * NB: This interface discards data rather than pushes it out to swap,
- * as some implementations do. This has performance implications for
- * applications like large transactional databases which want to discard
- * pages in anonymous maps after committing to backing store the data
- * that was kept in them. There is no reason to write this data out to
- * the swap area if the application is discarding it.
- *
- * An interface that causes the system to free clean pages and flush
- * dirty pages is already available as msync(MS_INVALIDATE).
- */
-static long madvise_dontneed(struct vm_area_struct * vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_flags & VM_LOCKED)
- return -EINVAL;
-
- zap_page_range(vma, start, end - start);
- return 0;
-}
-
-static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
- unsigned long end, int behavior)
-{
- long error = -EBADF;
-
- switch (behavior) {
- case MADV_NORMAL:
- case MADV_SEQUENTIAL:
- case MADV_RANDOM:
- error = madvise_behavior(vma, start, end, behavior);
- break;
-
- case MADV_WILLNEED:
- error = madvise_willneed(vma, start, end);
- break;
-
- case MADV_DONTNEED:
- error = madvise_dontneed(vma, start, end);
- break;
-
- default:
- error = -EINVAL;
- break;
- }
-
- return error;
-}
-
-/*
- * The madvise(2) system call.
- *
- * Applications can use madvise() to advise the kernel how it should
- * handle paging I/O in this VM area. The idea is to help the kernel
- * use appropriate read-ahead and caching techniques. The information
- * provided is advisory only, and can be safely disregarded by the
- * kernel without affecting the correct operation of the application.
- *
- * behavior values:
- * MADV_NORMAL - the default behavior is to read clusters. This
- * results in some read-ahead and read-behind.
- * MADV_RANDOM - the system should read the minimum amount of data
- * on any access, since it is unlikely that the appli-
- * cation will need more than what it asks for.
- * MADV_SEQUENTIAL - pages in the given range will probably be accessed
- * once, so they can be aggressively read ahead, and
- * can be freed soon after they are accessed.
- * MADV_WILLNEED - the application is notifying the system to read
- * some pages ahead.
- * MADV_DONTNEED - the application is finished with the given range,
- * so the kernel can free resources associated with it.
- *
- * return values:
- * zero - success
- * -EINVAL - start + len < 0, start is not page-aligned,
- * "behavior" is not a valid value, or application
- * is attempting to release locked or shared pages.
- * -ENOMEM - addresses in the specified range are not currently
- * mapped, or are outside the AS of the process.
- * -EIO - an I/O error occurred while paging in data.
- * -EBADF - map exists, but area maps something that isn't a file.
- * -EAGAIN - a kernel resource was temporarily unavailable.
- */
-asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
-{
- unsigned long end;
- struct vm_area_struct * vma;
- int unmapped_error = 0;
- int error = -EINVAL;
-
- down_write(&current->mm->mmap_sem);
-
- if (start & ~PAGE_MASK)
- goto out;
- len = (len + ~PAGE_MASK) & PAGE_MASK;
- end = start + len;
- if (end < start)
- goto out;
-
- error = 0;
- if (end == start)
- goto out;
-
- /*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- */
- vma = find_vma(current->mm, start);
- for (;;) {
- /* Still start < end. */
- error = -ENOMEM;
- if (!vma)
- goto out;
-
- /* Here start < vma->vm_end. */
- if (start < vma->vm_start) {
- unmapped_error = -ENOMEM;
- start = vma->vm_start;
- }
-
- /* Here vma->vm_start <= start < vma->vm_end. */
- if (end <= vma->vm_end) {
- if (start < end) {
- error = madvise_vma(vma, start, end,
- behavior);
- if (error)
- goto out;
- }
- error = unmapped_error;
- goto out;
- }
-
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = madvise_vma(vma, start, vma->vm_end, behavior);
- if (error)
- goto out;
- start = vma->vm_end;
- vma = vma->vm_next;
- }
-
-out:
- up_write(&current->mm->mmap_sem);
- return error;
-}
-
-static inline
-struct page *__read_cache_page(struct address_space *mapping,
+static inline struct page *__read_cache_page(struct address_space *mapping,
unsigned long index,
int (*filler)(void *,struct page*),
void *data)
--- 1.3/mm/mlock.c Tue Feb 5 08:45:30 2002
+++ edited/mm/mlock.c Sun Aug 18 16:02:43 2002
@@ -2,147 +2,49 @@
* linux/mm/mlock.c
*
* (C) Copyright 1995 Linus Torvalds
+ * (C) Copyright 2002 Christoph Hellwig
*/
-#include <linux/slab.h>
-#include <linux/shm.h>
-#include <linux/mman.h>
-#include <linux/smp_lock.h>
-#include <linux/pagemap.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-
-static inline int mlock_fixup_all(struct vm_area_struct * vma, int newflags)
-{
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_flags = newflags;
- spin_unlock(&vma->vm_mm->page_table_lock);
- return 0;
-}
-
-static inline int mlock_fixup_start(struct vm_area_struct * vma,
- unsigned long end, int newflags)
-{
- struct vm_area_struct * n;
-
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -EAGAIN;
- *n = *vma;
- n->vm_end = end;
- n->vm_flags = newflags;
- n->vm_raend = 0;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_start = end;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-
-static inline int mlock_fixup_end(struct vm_area_struct * vma,
- unsigned long start, int newflags)
-{
- struct vm_area_struct * n;
-
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -EAGAIN;
- *n = *vma;
- n->vm_start = start;
- n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
- n->vm_flags = newflags;
- n->vm_raend = 0;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_end = start;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
-static inline int mlock_fixup_middle(struct vm_area_struct * vma,
- unsigned long start, unsigned long end, int newflags)
-{
- struct vm_area_struct * left, * right;
+#include <linux/mman.h>
+#include <linux/mm.h>
- left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!left)
- return -EAGAIN;
- right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!right) {
- kmem_cache_free(vm_area_cachep, left);
- return -EAGAIN;
- }
- *left = *vma;
- *right = *vma;
- left->vm_end = start;
- right->vm_start = end;
- right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
- vma->vm_flags = newflags;
- left->vm_raend = 0;
- right->vm_raend = 0;
- if (vma->vm_file)
- atomic_add(2, &vma->vm_file->f_count);
-
- if (vma->vm_ops && vma->vm_ops->open) {
- vma->vm_ops->open(left);
- vma->vm_ops->open(right);
- }
- vma->vm_raend = 0;
- vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_flags = newflags;
- __insert_vm_struct(current->mm, left);
- __insert_vm_struct(current->mm, right);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
- return 0;
-}
static int mlock_fixup(struct vm_area_struct * vma,
unsigned long start, unsigned long end, unsigned int newflags)
{
- int pages, retval;
+ struct mm_struct * mm = vma->vm_mm;
+ int pages, error;
if (newflags == vma->vm_flags)
return 0;
- if (start == vma->vm_start) {
- if (end == vma->vm_end)
- retval = mlock_fixup_all(vma, newflags);
- else
- retval = mlock_fixup_start(vma, end, newflags);
- } else {
- if (end == vma->vm_end)
- retval = mlock_fixup_end(vma, start, newflags);
- else
- retval = mlock_fixup_middle(vma, start, end, newflags);
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ return -EAGAIN;
}
- if (!retval) {
- /* keep track of amount of locked VM */
- pages = (end - start) >> PAGE_SHIFT;
- if (newflags & VM_LOCKED) {
- pages = -pages;
- make_pages_present(start, end);
- }
- vma->vm_mm->locked_vm -= pages;
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ return -EAGAIN;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ vma->vm_flags = newflags;
+ spin_unlock(&mm->page_table_lock);
+
+ /*
+ * Keep track of amount of locked VM.
+ */
+ pages = (end - start) >> PAGE_SHIFT;
+ if (newflags & VM_LOCKED) {
+ pages = -pages;
+ make_pages_present(start, end);
}
- return retval;
+
+ vma->vm_mm->locked_vm -= pages;
+ return 0;
}
static int do_mlock(unsigned long start, size_t len, int on)
--- 1.45/mm/mmap.c Fri Aug 2 16:24:26 2002
+++ edited/mm/mmap.c Sun Aug 18 16:13:02 2002
@@ -1043,10 +1043,11 @@
}
/*
- * Split a vma into two pieces at address 'addr', the original vma
- * will contain the first part, a new vma is allocated for the tail.
- * Split a vma into two pieces at address 'addr', the original vma
- * will contain the first part, a new vma is allocated for the tail.
+ * Split a vma into two pieces at address 'addr'; a new vma is allocated
+ * either for the first part or for the tail.
*/
-static int splitvma(struct mm_struct *mm, struct vm_area_struct *mpnt, unsigned long addr)
+int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+ unsigned long addr, int new_below)
{
struct vm_area_struct *new;
@@ -1058,22 +1059,29 @@
return -ENOMEM;
/* most fields are the same, copy all, and then fixup */
- *new = *mpnt;
+ *new = *vma;
+
+ if (new_below) {
+ new->vm_end = addr;
+ vma->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+ vma->vm_start = addr;
+ } else {
+ new->vm_start = vma->vm_end = addr;
+ new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+ }
- new->vm_start = addr;
- new->vm_pgoff = mpnt->vm_pgoff + ((addr - mpnt->vm_start) >> PAGE_SHIFT);
new->vm_raend = 0;
- if (mpnt->vm_file)
- get_file(mpnt->vm_file);
- if (mpnt->vm_ops && mpnt->vm_ops->open)
- mpnt->vm_ops->open(mpnt);
- mpnt->vm_end = addr; /* Truncate area */
+ if (new->vm_file)
+ get_file(new->vm_file);
+
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
spin_lock(&mm->page_table_lock);
- lock_vma_mappings(mpnt);
+ lock_vma_mappings(vma);
__insert_vm_struct(mm, new);
- unlock_vma_mappings(mpnt);
+ unlock_vma_mappings(vma);
spin_unlock(&mm->page_table_lock);
return 0;
@@ -1110,7 +1117,7 @@
* If we need to split any vma, do it now to save pain later.
*/
if (start > mpnt->vm_start) {
- if (splitvma(mm, mpnt, start))
+ if (split_vma(mm, mpnt, start, 0))
return -ENOMEM;
prev = mpnt;
mpnt = mpnt->vm_next;
@@ -1119,7 +1126,7 @@
/* Does it split the last one? */
last = find_vma(mm, end);
if (last && end > last->vm_start) {
- if (splitvma(mm, last, end))
+ if (split_vma(mm, last, end, 0))
return -ENOMEM;
}
--- 1.14/mm/mprotect.c Mon Jul 29 21:23:46 2002
+++ edited/mm/mprotect.c Sun Aug 18 16:20:40 2002
@@ -2,13 +2,14 @@
* mm/mprotect.c
*
* (C) Copyright 1994 Linus Torvalds
+ * (C) Copyright 2002 Christoph Hellwig
*
* Address space accounting code <alan@redhat.com>
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
+
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
@@ -100,158 +101,59 @@
spin_unlock(&current->mm->page_table_lock);
return;
}
-
-static inline int mprotect_fixup_all(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
- int newflags, pgprot_t prot)
+/*
+ * Try to merge a vma with the previous one; return 1 if successful or 0 if it
+ * was impossible.
+ */
+static int mprotect_attempt_merge(struct vm_area_struct * vma,
+ struct vm_area_struct * prev,
+ unsigned long end, int newflags)
{
- struct vm_area_struct * prev = *pprev;
struct mm_struct * mm = vma->vm_mm;
- if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) &&
- !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
+ if (!prev || !vma)
+ return 0;
+ if (prev->vm_end != vma->vm_start)
+ return 0;
+ if (!can_vma_merge(prev, newflags))
+ return 0;
+ if (vma->vm_file || (vma->vm_flags & VM_SHARED))
+ return 0;
+
+ /*
+ * If the whole area changes to the protection of the previous one
+ * we can just get rid of it.
+ */
+ if (end == vma->vm_end) {
spin_lock(&mm->page_table_lock);
- prev->vm_end = vma->vm_end;
+ prev->vm_end = end;
__vma_unlink(mm, vma, prev);
spin_unlock(&mm->page_table_lock);
kmem_cache_free(vm_area_cachep, vma);
mm->map_count--;
+ return 1;
+ }
- return 0;
- }
-
+ /*
+ * Otherwise extend it.
+ */
spin_lock(&mm->page_table_lock);
- vma->vm_flags = newflags;
- vma->vm_page_prot = prot;
- spin_unlock(&mm->page_table_lock);
-
- *pprev = vma;
-
- return 0;
-}
-
-static inline int mprotect_fixup_start(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
- unsigned long end,
- int newflags, pgprot_t prot)
-{
- struct vm_area_struct * n, * prev = *pprev;
-
- *pprev = vma;
-
- if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) &&
- !vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
- spin_lock(&vma->vm_mm->page_table_lock);
- prev->vm_end = end;
- vma->vm_start = end;
- spin_unlock(&vma->vm_mm->page_table_lock);
-
- return 0;
- }
- n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!n)
- return -ENOMEM;
- *n = *vma;
- n->vm_end = end;
- n->vm_flags = newflags;
- n->vm_raend = 0;
- n->vm_page_prot = prot;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
+ prev->vm_end = end;
vma->vm_start = end;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
-
- return 0;
-}
-
-static inline int mprotect_fixup_end(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
- unsigned long start,
- int newflags, pgprot_t prot)
-{
- struct vm_area_struct * n;
-
- n = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!n)
- return -ENOMEM;
- *n = *vma;
- n->vm_start = start;
- n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
- n->vm_flags = newflags;
- n->vm_raend = 0;
- n->vm_page_prot = prot;
- if (n->vm_file)
- get_file(n->vm_file);
- if (n->vm_ops && n->vm_ops->open)
- n->vm_ops->open(n);
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_end = start;
- __insert_vm_struct(current->mm, n);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
-
- *pprev = n;
-
- return 0;
+ spin_unlock(&mm->page_table_lock);
+ return 1;
}
-static inline int mprotect_fixup_middle(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
- unsigned long start, unsigned long end,
- int newflags, pgprot_t prot)
-{
- struct vm_area_struct * left, * right;
-
- left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!left)
- return -ENOMEM;
- right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
- if (!right) {
- kmem_cache_free(vm_area_cachep, left);
- return -ENOMEM;
- }
- *left = *vma;
- *right = *vma;
- left->vm_end = start;
- right->vm_start = end;
- right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
- left->vm_raend = 0;
- right->vm_raend = 0;
- if (vma->vm_file)
- atomic_add(2,&vma->vm_file->f_count);
- if (vma->vm_ops && vma->vm_ops->open) {
- vma->vm_ops->open(left);
- vma->vm_ops->open(right);
- }
- vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
- vma->vm_raend = 0;
- vma->vm_page_prot = prot;
- lock_vma_mappings(vma);
- spin_lock(&vma->vm_mm->page_table_lock);
- vma->vm_start = start;
- vma->vm_end = end;
- vma->vm_flags = newflags;
- __insert_vm_struct(current->mm, left);
- __insert_vm_struct(current->mm, right);
- spin_unlock(&vma->vm_mm->page_table_lock);
- unlock_vma_mappings(vma);
-
- *pprev = right;
- return 0;
-}
static int mprotect_fixup(struct vm_area_struct * vma, struct vm_area_struct ** pprev,
unsigned long start, unsigned long end, unsigned int newflags)
{
+ struct mm_struct * mm = vma->vm_mm;
+ unsigned long charged = 0;
pgprot_t newprot;
int error;
- unsigned long charged = 0;
if (newflags == vma->vm_flags) {
*pprev = vma;
@@ -266,29 +168,46 @@
* FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
* a MAP_NORESERVE private mapping to writable will now reserve.
*/
- if ((newflags & VM_WRITE) &&
- !(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
- charged = (end - start) >> PAGE_SHIFT;
- if (!vm_enough_memory(charged))
- return -ENOMEM;
- newflags |= VM_ACCOUNT;
+ if (newflags & VM_WRITE) {
+ if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
+ charged = (end - start) >> PAGE_SHIFT;
+ if (!vm_enough_memory(charged))
+ return -ENOMEM;
+ newflags |= VM_ACCOUNT;
+ }
}
+
newprot = protection_map[newflags & 0xf];
+
if (start == vma->vm_start) {
- if (end == vma->vm_end)
- error = mprotect_fixup_all(vma, pprev, newflags, newprot);
- else
- error = mprotect_fixup_start(vma, pprev, end, newflags, newprot);
- } else if (end == vma->vm_end)
- error = mprotect_fixup_end(vma, pprev, start, newflags, newprot);
- else
- error = mprotect_fixup_middle(vma, pprev, start, end, newflags, newprot);
- if (error) {
- vm_unacct_memory(charged);
- return error;
+ /*
+ * Try to merge with the previous vma.
+ */
+ if (mprotect_attempt_merge(vma, *pprev, end, newflags))
+ return 0;
+ } else {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto fail;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto fail;
}
+
+ spin_lock(&mm->page_table_lock);
+ vma->vm_flags = newflags;
+ vma->vm_page_prot = newprot;
+ spin_unlock(&mm->page_table_lock);
+
change_protection(vma, start, end, newprot);
return 0;
+
+fail:
+ vm_unacct_memory(charged);
+ return error;
}
asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
@@ -352,6 +271,7 @@
goto out;
}
}
+
if (next && prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags) &&
!prev->vm_file && !(prev->vm_flags & VM_SHARED)) {
spin_lock(&prev->vm_mm->page_table_lock);
--- 1.0/mm/madvise.c Thu Dec 13 11:34:58 2001
+++ edited/mm/madvise.c Sun Aug 18 14:28:08 2002
@@ -0,0 +1,238 @@
+/*
+ * linux/mm/madvise.c
+ *
+ * Copyright (C) 1999 Linus Torvalds
+ * Copyright (C) 2002 Christoph Hellwig
+ */
+
+#include <linux/mman.h>
+#include <linux/pagemap.h>
+
+
+/*
+ * We can potentially split a vm area into separate
+ * areas, each area with its own behavior.
+ */
+static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
+ unsigned long end, int behavior)
+{
+ struct mm_struct * mm = vma->vm_mm;
+ int error;
+
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ return -EAGAIN;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ return -EAGAIN;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ vma->vm_raend = 0;
+ VM_ClearReadHint(vma);
+
+ switch (behavior) {
+ case MADV_SEQUENTIAL:
+ vma->vm_flags |= VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ vma->vm_flags |= VM_RAND_READ;
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return 0;
+}
+
+/*
+ * Schedule all required I/O operations, then run the disk queue
+ * to make sure they are started. Do not wait for completion.
+ */
+static long madvise_willneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ long error = -EBADF;
+ struct file * file;
+ unsigned long size, rlim_rss;
+
+ /* Doesn't work if there's no mapped file. */
+ if (!vma->vm_file)
+ return error;
+ file = vma->vm_file;
+ size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+ /* Make sure this doesn't exceed the process's max rss. */
+ error = -EIO;
+ rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur :
+ LONG_MAX; /* default: see resource.h */
+ if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
+ return error;
+
+ do_page_cache_readahead(file, start, end - start);
+ return 0;
+}
+
+/*
+ * Application no longer needs these pages. If the pages are dirty,
+ * it's OK to just throw them away. The app will be more careful about
+ * data it wants to keep. Be sure to free swap resources too. The
+ * zap_page_range call sets things up for refill_inactive to actually free
+ * these pages later if no one else has touched them in the meantime,
+ * although we could add these pages to a global reuse list for
+ * refill_inactive to pick up before reclaiming other pages.
+ *
+ * NB: This interface discards data rather than pushes it out to swap,
+ * as some implementations do. This has performance implications for
+ * applications like large transactional databases which want to discard
+ * pages in anonymous maps after committing to backing store the data
+ * that was kept in them. There is no reason to write this data out to
+ * the swap area if the application is discarding it.
+ *
+ * An interface that causes the system to free clean pages and flush
+ * dirty pages is already available as msync(MS_INVALIDATE).
+ */
+static long madvise_dontneed(struct vm_area_struct * vma,
+ unsigned long start, unsigned long end)
+{
+ if (vma->vm_flags & VM_LOCKED)
+ return -EINVAL;
+
+ zap_page_range(vma, start, end - start);
+ return 0;
+}
+
+static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
+ unsigned long end, int behavior)
+{
+ long error = -EBADF;
+
+ switch (behavior) {
+ case MADV_NORMAL:
+ case MADV_SEQUENTIAL:
+ case MADV_RANDOM:
+ error = madvise_behavior(vma, start, end, behavior);
+ break;
+
+ case MADV_WILLNEED:
+ error = madvise_willneed(vma, start, end);
+ break;
+
+ case MADV_DONTNEED:
+ error = madvise_dontneed(vma, start, end);
+ break;
+
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * The madvise(2) system call.
+ *
+ * Applications can use madvise() to advise the kernel how it should
+ * handle paging I/O in this VM area. The idea is to help the kernel
+ * use appropriate read-ahead and caching techniques. The information
+ * provided is advisory only, and can be safely disregarded by the
+ * kernel without affecting the correct operation of the application.
+ *
+ * behavior values:
+ * MADV_NORMAL - the default behavior is to read clusters. This
+ * results in some read-ahead and read-behind.
+ * MADV_RANDOM - the system should read the minimum amount of data
+ * on any access, since it is unlikely that the appli-
+ * cation will need more than what it asks for.
+ * MADV_SEQUENTIAL - pages in the given range will probably be accessed
+ * once, so they can be aggressively read ahead, and
+ * can be freed soon after they are accessed.
+ * MADV_WILLNEED - the application is notifying the system to read
+ * some pages ahead.
+ * MADV_DONTNEED - the application is finished with the given range,
+ * so the kernel can free resources associated with it.
+ *
+ * return values:
+ * zero - success
+ * -EINVAL - start + len < 0, start is not page-aligned,
+ * "behavior" is not a valid value, or application
+ * is attempting to release locked or shared pages.
+ * -ENOMEM - addresses in the specified range are not currently
+ * mapped, or are outside the AS of the process.
+ * -EIO - an I/O error occurred while paging in data.
+ * -EBADF - map exists, but area maps something that isn't a file.
+ * -EAGAIN - a kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
+{
+ unsigned long end;
+ struct vm_area_struct * vma;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ down_write(&current->mm->mmap_sem);
+
+ if (start & ~PAGE_MASK)
+ goto out;
+ len = (len + ~PAGE_MASK) & PAGE_MASK;
+ end = start + len;
+ if (end < start)
+ goto out;
+
+ error = 0;
+ if (end == start)
+ goto out;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ */
+ vma = find_vma(current->mm, start);
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ goto out;
+
+ /* Here start < vma->vm_end. */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end. */
+ if (end <= vma->vm_end) {
+ if (start < end) {
+ error = madvise_vma(vma, start, end,
+ behavior);
+ if (error)
+ goto out;
+ }
+ error = unmapped_error;
+ goto out;
+ }
+
+ /* Here vma->vm_start <= start < vma->vm_end < end. */
+ error = madvise_vma(vma, start, vma->vm_end, behavior);
+ if (error)
+ goto out;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ }
+
+out:
+ up_write(&current->mm->mmap_sem);
+ return error;
+}
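
For readers unfamiliar with the interface documented in the comment
block in madvise.c above, typical userspace usage looks roughly like
this (illustrative only; map_sequential is a made-up helper):

	#include <sys/mman.h>

	/* Map 'len' bytes of 'fd' read-only and hint sequential access. */
	static void *map_sequential(int fd, size_t len)
	{
		void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);

		if (p != MAP_FAILED)
			(void) madvise(p, len, MADV_SEQUENTIAL);
		return p;
	}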