linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mincore for i386, against 2.3.51
@ 2000-03-13  0:45 Chuck Lever
  2000-03-13 17:46 ` Kanoj Sarcar
  0 siblings, 1 reply; 8+ messages in thread
From: Chuck Lever @ 2000-03-13  0:45 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-mm

hi linus-

here's mincore for i386.  this is simpler than madvise, so we should be
able to detect my misunderstandings a little easier before i go on with
madvise.

diff -ruN Linux-2.3.51/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S
--- Linux-2.3.51/arch/i386/kernel/entry.S	Sun Mar 12 18:42:20 2000
+++ linux/arch/i386/kernel/entry.S	Sun Mar 12 18:47:04 2000
@@ -638,6 +638,7 @@
 	.long SYMBOL_NAME(sys_setfsuid)		/* 215 */
 	.long SYMBOL_NAME(sys_setfsgid)
 	.long SYMBOL_NAME(sys_pivot_root)
+	.long SYMBOL_NAME(sys_mincore)
 
 
 	/*
@@ -646,6 +647,6 @@
 	 * entries. Don't panic if you notice that this hasn't
 	 * been shrunk every time we add a new system call.
 	 */
-	.rept NR_syscalls-217
+	.rept NR_syscalls-218
 		.long SYMBOL_NAME(sys_ni_syscall)
 	.endr
diff -ruN Linux-2.3.51/include/asm-i386/unistd.h linux/include/asm-i386/unistd.h
--- Linux-2.3.51/include/asm-i386/unistd.h	Wed Jan 26 15:32:02 2000
+++ linux/include/asm-i386/unistd.h	Sun Mar 12 18:50:55 2000
@@ -222,6 +222,7 @@
 #define __NR_setfsuid32		215
 #define __NR_setfsgid32		216
 #define __NR_pivot_root		217
+#define __NR_mincore		218
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
diff -ruN Linux-2.3.51/include/linux/mm.h linux/include/linux/mm.h
--- Linux-2.3.51/include/linux/mm.h	Sun Mar 12 18:42:36 2000
+++ linux/include/linux/mm.h	Sun Mar 12 19:17:15 2000
@@ -105,6 +105,7 @@
 	void (*unmap)(struct vm_area_struct *area, unsigned long, size_t);
 	void (*protect)(struct vm_area_struct *area, unsigned long, size_t, unsigned int newprot);
 	int (*sync)(struct vm_area_struct *area, unsigned long, size_t, unsigned int flags);
+	unsigned char (*incore)(struct vm_area_struct *area, unsigned long);
 	void (*advise)(struct vm_area_struct *area, unsigned long, size_t, unsigned int advise);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int write_access);
 	struct page * (*wppage)(struct vm_area_struct * area, unsigned long address, struct page * page);
@@ -446,6 +447,8 @@
 			size_t size, unsigned int flags);
 extern struct page *filemap_nopage(struct vm_area_struct * area,
 				    unsigned long address, int no_share);
+extern unsigned char filemap_incore(struct vm_area_struct * vma,
+	unsigned long pgoff);
 
 /*
  * GFP bitmasks..
diff -ruN Linux-2.3.51/ipc/shm.c linux/ipc/shm.c
--- Linux-2.3.51/ipc/shm.c	Sun Mar 12 18:42:48 2000
+++ linux/ipc/shm.c	Sun Mar 12 19:35:23 2000
@@ -115,6 +115,7 @@
 static void killseg_core(struct shmid_kernel *shp, int doacc);
 static void shm_open (struct vm_area_struct *shmd);
 static void shm_close (struct vm_area_struct *shmd);
+static unsigned char shm_incore (struct vm_area_struct *shmd, unsigned long idx);
 static struct page * shm_nopage(struct vm_area_struct *, unsigned long, int);
 static int shm_swapout(struct page *, struct file *);
 #ifdef CONFIG_PROC_FS
@@ -166,6 +167,7 @@
 static struct vm_operations_struct shm_vm_ops = {
 	open:	shm_open,	/* callback for a new vm-area open */
 	close:	shm_close,	/* callback for when the vm-area is released */
+	incore:	shm_incore,
 	nopage:	shm_nopage,
 	swapout:shm_swapout,
 };
@@ -1197,6 +1199,38 @@
 static int shm_swapout(struct page * page, struct file *file)
 {
 	return 0;
+}
+
+/*
+ * is page in memory?
+ *
+ * shm has a special incore method because we need to synchronize
+ * with the shm swapper (shm_swap) while finding the page.
+ */
+static unsigned char shm_incore(struct vm_area_struct * shmd,
+	unsigned long idx)
+{
+	unsigned char present = 0;
+	pte_t pte;
+	struct shmid_kernel * shp;
+	struct inode * inode = shmd->vm_file->f_dentry->d_inode;
+
+	down(&inode->i_sem);
+	if(!(shp = shm_lock(inode->i_ino)))
+		BUG();
+
+	/*
+	 * the pte isn't present if the page is swapped, or if it hasn't
+	 * been touched yet.  Otherwise, we say the page is available.
+	 */
+	pte = SHM_ENTRY(shp, (unsigned int)idx);
+	if (pte_present(pte))
+		present = 1;
+
+	shm_unlock(inode->i_ino);
+	up(&inode->i_sem);
+
+	return present;
 }
 
 /*
diff -ruN Linux-2.3.51/mm/filemap.c linux/mm/filemap.c
--- Linux-2.3.51/mm/filemap.c	Sun Mar 12 18:42:48 2000
+++ linux/mm/filemap.c	Sun Mar 12 19:00:48 2000
@@ -1294,6 +1294,28 @@
 }
 
 /*
+ * Later we can get more picky about what "in core" means precisely
+ * for a filemapped page.  For now, simply check to see if the page
+ * is in the page cache, and is up to date; i.e. that no page-in
+ * operation would be required at this time if an application were
+ * to map and access this page.
+ */
+unsigned char filemap_incore(struct vm_area_struct * vma, unsigned long pgoff)
+{
+	unsigned char present = 0;
+	struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
+	struct page * page, ** hash = page_hash(as, pgoff);
+
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(as, pgoff, *hash);
+	if ((page) && (Page_Uptodate(page)))
+		present = 1;
+	spin_unlock(&pagecache_lock);
+
+	return present;
+}
+
+/*
  * filemap_nopage() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
  *
@@ -1610,6 +1632,7 @@
 static struct vm_operations_struct file_shared_mmap = {
 	unmap:		filemap_unmap,		/* unmap - we need to sync the pages */
 	sync:		filemap_sync,
+	incore:		filemap_incore,
 	nopage:		filemap_nopage,
 	swapout:	filemap_swapout,
 };
@@ -1621,6 +1644,7 @@
  * know they can't ever get write permissions..)
  */
 static struct vm_operations_struct file_private_mmap = {
+	incore:		filemap_incore,
 	nopage:		filemap_nopage,
 };
 
diff -ruN Linux-2.3.51/mm/mmap.c linux/mm/mmap.c
--- Linux-2.3.51/mm/mmap.c	Sun Mar 12 18:42:48 2000
+++ linux/mm/mmap.c	Sun Mar 12 19:06:22 2000
@@ -731,6 +731,140 @@
 	return ret;
 }
 
+static long mincore_area(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, unsigned char * vec)
+{
+	long error, i, remaining;
+	unsigned char * tmp;
+	unsigned char (*incore)(struct vm_area_struct * , unsigned long);
+
+	error = -ENOMEM;
+	if (!vma->vm_ops || !vma->vm_ops->incore)
+		return error;
+	incore = vma->vm_ops->incore;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	error = -EAGAIN;
+	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+	if (!tmp)
+		return error;
+
+	/* (end - start) is # of pages, and also # of bytes in "vec */
+	remaining = (end - start),
+
+	error = 0;
+	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+		int j = 0;
+		long thispiece = (remaining < PAGE_SIZE) ?
+						remaining : PAGE_SIZE;
+
+		while (j < thispiece)
+			tmp[j++] = incore(vma, start++);
+
+		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+			error = -EFAULT;
+			break;
+		}
+	}
+
+	free_page((unsigned long) tmp);
+	return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.  Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
+ *		or len has a nonpositive value
+ *  -ENOMEM - Addresses in the range [addr, addr + len] are
+ *		invalid for the address space of this process, or
+ *		specify one or more pages which are not currently
+ *		mapped
+ *  -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+	unsigned char * vec)
+{
+	int index = 0;
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	long error = -EINVAL;
+
+	down(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = mincore_area(vma, start, end,
+							&vec[index]);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = mincore_area(vma, start, vma->vm_end, &vec[index]);
+		if (error)
+			goto out;
+		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up(&current->mm->mmap_sem);
+	return error;
+}
+
 /*
  *  this is really a simplified "do_mmap".  it only handles
  *  anonymous maps.  eventually we may be able to do some

	- Chuck Lever
--
corporate:	<chuckl@netscape.com>
personal:	<chucklever@netscape.net> or <cel@monkey.org>

The Linux Scalability project:
	http://www.citi.umich.edu/projects/linux-scalability/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mincore for i386, against 2.3.51
  2000-03-13  0:45 [PATCH] mincore for i386, against 2.3.51 Chuck Lever
@ 2000-03-13 17:46 ` Kanoj Sarcar
  2000-03-13 18:16   ` Chuck Lever
  0 siblings, 1 reply; 8+ messages in thread
From: Kanoj Sarcar @ 2000-03-13 17:46 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Linus Torvalds, linux-mm

Couple of things:

#1
>  static struct vm_operations_struct shm_vm_ops = {
>  	open:	shm_open,	/* callback for a new vm-area open */
>  	close:	shm_close,	/* callback for when the vm-area is released */
> +	incore:	shm_incore,
>  	nopage:	shm_nopage,
>  	swapout:shm_swapout,
>  };

shmzero_vm_ops should also probably have an incore function. /dev/zero is
quite similar to shm, except the locking protocol is a little different
(look at shmzero_nopage and shm_nopage), you should be able to separate
out the shm incore() function into a basic routine/#define that both shm
and /dev/zero can use. Let me know if you need help with this.

#2. It wasn't very clear to me how MAP_ANON pages are being handled. Maybe
I did not read the patch closely enough.

#3. If you have the time, it might make sense to pump out the #pages via
/proc/pid/maps too (although I don't know whether that will break some
apps that already know the output format).

Kanoj
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mincore for i386, against 2.3.51
  2000-03-13 17:46 ` Kanoj Sarcar
@ 2000-03-13 18:16   ` Chuck Lever
  2000-03-13 18:28     ` Kanoj Sarcar
  0 siblings, 1 reply; 8+ messages in thread
From: Chuck Lever @ 2000-03-13 18:16 UTC (permalink / raw)
  To: Kanoj Sarcar; +Cc: Linus Torvalds, linux-mm

hi kanoj-

thanks for the good comments.

On Mon, 13 Mar 2000, Kanoj Sarcar wrote:
> #1
> >  static struct vm_operations_struct shm_vm_ops = {
> >  	open:	shm_open,	/* callback for a new vm-area open */
> >  	close:	shm_close,	/* callback for when the vm-area is released */
> > +	incore:	shm_incore,
> >  	nopage:	shm_nopage,
> >  	swapout:shm_swapout,
> >  };
> 
> shmzero_vm_ops should also probably have a incore function. /dev/zero is
> quite similar to shm, except the locking protocol is a little different
> (look at shmzero_nopage and shm_nopage), you should be able to seperate
> out the shm incore() function into a basic routine/#define that both shm
> and /dev/zero can use. Let me know if you need help with this.

i'll take a look at this.  although, it might be OK to assume that
/dev/zero pages are always in core, which simplifies shmzero_incore.

> #2. It wasn't very clear to me how MAP_ANON pages are being handled. Maybe
> I did not read the patch closely enough.

i'm assuming anonymously mapped pages get a vm_ops struct that has a NULL
for the incore function pointer.  i wasn't sure it is useful to ask the
question "is this anonymous page in memory?".  if it turns out that
applications need this, it is simple to add another function to do this.

> #3. If you have the time, it might make sense to pump out the #pages via
> /proc/pid/maps too (although I don't know whether that will break some
> apps that already know the output format).

i'm not exactly sure what you mean here.  what #pages value do you mean?

	- Chuck Lever
--
corporate:	<chuckl@netscape.com>
personal:	<chucklever@netscape.net> or <cel@monkey.org>

The Linux Scalability project:
	http://www.citi.umich.edu/projects/linux-scalability/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mincore for i386, against 2.3.51
  2000-03-13 18:16   ` Chuck Lever
@ 2000-03-13 18:28     ` Kanoj Sarcar
  2000-03-13 18:35       ` Linus Torvalds
  0 siblings, 1 reply; 8+ messages in thread
From: Kanoj Sarcar @ 2000-03-13 18:28 UTC (permalink / raw)
  To: Chuck Lever; +Cc: Linus Torvalds, linux-mm

> 
> hi kanoj-
> 
> thanks for the good comments.
> 
> On Mon, 13 Mar 2000, Kanoj Sarcar wrote:
> > #1
> > >  static struct vm_operations_struct shm_vm_ops = {
> > >  	open:	shm_open,	/* callback for a new vm-area open */
> > >  	close:	shm_close,	/* callback for when the vm-area is released */
> > > +	incore:	shm_incore,
> > >  	nopage:	shm_nopage,
> > >  	swapout:shm_swapout,
> > >  };
> > 
> > shmzero_vm_ops should also probably have a incore function. /dev/zero is
> > quite similar to shm, except the locking protocol is a little different
> > (look at shmzero_nopage and shm_nopage), you should be able to seperate
> > out the shm incore() function into a basic routine/#define that both shm
> > and /dev/zero can use. Let me know if you need help with this.
> 
> i'll take a look at this.  although, it might be OK to assume that
> /dev/zero pages are always in core, which simplifies shmzero_incore.

/dev/zero pages might be out on swap too, similar to shm pages.

> 
> > #2. It wasn't very clear to me how MAP_ANON pages are being handled. Maybe
> > I did not read the patch closely enough.
> 
> i'm assuming anonymously mapped pages get a vm_ops struct that has a NULL
> for the incore function pointer.  i wasn't sure it is useful to ask the
> question "is this anonymous page in memory?".  if it turns out that
> applications need this, it is simple to add another function to do this.

Afaik, anon pages get a null vm_ops. Providing a default function for these
cases is probably not important right away.

> 
> > #3. If you have the time, it might make sense to pump out the #pages via
> > /proc/pid/maps too (although I don't know whether that will break some
> > apps that already know the output format).
> 
> i'm not exactly sure what you mean here.  what #pages value do you mean?

The vector that mincore() returns can also be reported via cat /proc/pid/maps,
right? Might be useful, but not necessary ...

Kanoj
> 
> 	- Chuck Lever
> --
> corporate:	<chuckl@netscape.com>
> personal:	<chucklever@netscape.net> or <cel@monkey.org>
> 
> The Linux Scalability project:
> 	http://www.citi.umich.edu/projects/linux-scalability/
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux.eu.org/Linux-MM/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mincore for i386, against 2.3.51
  2000-03-13 18:28     ` Kanoj Sarcar
@ 2000-03-13 18:35       ` Linus Torvalds
  2000-03-13 19:56         ` Chuck Lever
  0 siblings, 1 reply; 8+ messages in thread
From: Linus Torvalds @ 2000-03-13 18:35 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-mm


I don't like the "incore" thing.

I think that "incore" should be a generic VM function, and be based solely
on the VMA and the associated address space. 

The fact that the current shared memory implementation doesn't use address
spaces is an acknowledged bug and misfeature, not an excuse to perpetuate
the problem..

So I'd prefer something that does not have the "incore" function at all,
and if that convinces somebody else to change shm to use the address_space
stuff to get a working mincore(), all the better. Ok?

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mincore for i386, against 2.3.51
  2000-03-13 18:35       ` Linus Torvalds
@ 2000-03-13 19:56         ` Chuck Lever
  2000-03-13 20:31           ` Linus Torvalds
  0 siblings, 1 reply; 8+ messages in thread
From: Chuck Lever @ 2000-03-13 19:56 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-mm

On Mon, 13 Mar 2000, Linus Torvalds wrote:
> I think that "incore" should be a generic VM function, and be based solely
> on the VMA and the associated address space. 

at one point i tried just walking the page tables, but that really didn't
give the results i wanted -- every page appeared to be "in core".

> So I'd prefer something that does not have the "incore" function at all,
> and if that convinces somebody else to change shm to use the address_space
> stuff to get a working mincore(), all the better. Ok?

hmm.  i created the "incore" method because mincore needs to synchronize
with the swapping method used for each of the different vma types.  this
is different for shm's vs. mapped files -- they both use locking methods
that are independent of one another.  any ideas about how to get around
this without using an "incore" vm_op?  do you think grabbing the mm
semaphore is enough?

i also wanted to check the page_uptodate bit for mapped files, but this
doesn't make sense for shm, for example.  i think the semantics of "page
is in memory" can be different enough for the different types of vmas that
having a separate hook for each is necessary.

btw i think i've ended up in your kill file.  direct mail i send to you
appears to be lost.

	- Chuck Lever
--
corporate:	<chuckl@netscape.com>
personal:	<chucklever@netscape.net> or <cel@monkey.org>

The Linux Scalability project:
	http://www.citi.umich.edu/projects/linux-scalability/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mincore for i386, against 2.3.51
  2000-03-13 19:56         ` Chuck Lever
@ 2000-03-13 20:31           ` Linus Torvalds
  2000-03-13 21:32             ` Chuck Lever
  0 siblings, 1 reply; 8+ messages in thread
From: Linus Torvalds @ 2000-03-13 20:31 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-mm


On Mon, 13 Mar 2000, Chuck Lever wrote:
> > So I'd prefer something that does not have the "incore" function at all,
> > and if that convinces somebody else to change shm to use the address_space
> > stuff to get a working mincore(), all the better. Ok?
> 
> hmm.  i created the "incore" method because mincore needs to synchronize
> with the swapping method used for each of the different vma types.  this
> is different for shm's vs. mapped files -- they both use locking methods
> that are independent of one another.

But that's exactly my point. The shm version is bad, and it will be
eventually removed ;)

> btw i think i've ended up in your kill file.  direct mail i send to you
> appears to be lost.

You're not in my kill-file any more than anybody else is.

The fact that I get too much mail means that very few people get horribly
much attention, I'm afraid.

		Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] mincore for i386, against 2.3.51
  2000-03-13 20:31           ` Linus Torvalds
@ 2000-03-13 21:32             ` Chuck Lever
  0 siblings, 0 replies; 8+ messages in thread
From: Chuck Lever @ 2000-03-13 21:32 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-mm

On Mon, 13 Mar 2000, Linus Torvalds wrote:
> On Mon, 13 Mar 2000, Chuck Lever wrote:
> > > So I'd prefer something that does not have the "incore" function at all,
> > > and if that convinces somebody else to change shm to use the address_space
> > > stuff to get a working mincore(), all the better. Ok?
> > 
> > hmm.  i created the "incore" method because mincore needs to synchronize
> > with the swapping method used for each of the different vma types.  this
> > is different for shm's vs. mapped files -- they both use locking methods
> > that are independent of one another.
> 
> But that's exactly my point. The shm version is bad, and it will be
> eventually removed ;)

ok, try this on for size.

diff -ruN Linux-2.3.51/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S
--- Linux-2.3.51/arch/i386/kernel/entry.S	Sun Mar 12 18:42:20 2000
+++ linux/arch/i386/kernel/entry.S	Sun Mar 12 18:47:04 2000
@@ -638,6 +638,7 @@
 	.long SYMBOL_NAME(sys_setfsuid)		/* 215 */
 	.long SYMBOL_NAME(sys_setfsgid)
 	.long SYMBOL_NAME(sys_pivot_root)
+	.long SYMBOL_NAME(sys_mincore)
 
 
 	/*
@@ -646,6 +647,6 @@
 	 * entries. Don't panic if you notice that this hasn't
 	 * been shrunk every time we add a new system call.
 	 */
-	.rept NR_syscalls-217
+	.rept NR_syscalls-218
 		.long SYMBOL_NAME(sys_ni_syscall)
 	.endr
diff -ruN Linux-2.3.51/include/asm-i386/unistd.h linux/include/asm-i386/unistd.h
--- Linux-2.3.51/include/asm-i386/unistd.h	Wed Jan 26 15:32:02 2000
+++ linux/include/asm-i386/unistd.h	Mon Mar 13 15:52:08 2000
@@ -222,6 +222,7 @@
 #define __NR_setfsuid32		215
 #define __NR_setfsgid32		216
 #define __NR_pivot_root		217
+#define __NR_mincore		218
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
diff -ruN Linux-2.3.51/mm/filemap.c linux/mm/filemap.c
--- Linux-2.3.51/mm/filemap.c	Sun Mar 12 18:42:48 2000
+++ linux/mm/filemap.c	Mon Mar 13 16:13:58 2000
@@ -1727,6 +1727,160 @@
 	return error;
 }
 
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct vm_area_struct * vma,
+	unsigned long pgoff)
+{
+	unsigned char present = 0;
+	struct address_space * as = &vma->vm_file->f_dentry->d_inode->i_data;
+	struct page * page, ** hash = page_hash(as, pgoff);
+
+	spin_lock(&pagecache_lock);
+	page = __find_page_nolock(as, pgoff, *hash);
+	if ((page) && (Page_Uptodate(page)))
+		present = 1;
+	spin_unlock(&pagecache_lock);
+
+	return present;
+}
+
+static long mincore_vma(struct vm_area_struct * vma,
+	unsigned long start, unsigned long end, unsigned char * vec)
+{
+	long error, i, remaining;
+	unsigned char * tmp;
+
+	error = -ENOMEM;
+	if (!vma->vm_file)
+		return error;
+
+	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	if (end > vma->vm_end)
+		end = vma->vm_end;
+	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+	error = -EAGAIN;
+	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
+	if (!tmp)
+		return error;
+
+	/* (end - start) is # of pages, and also # of bytes in "vec */
+	remaining = (end - start),
+
+	error = 0;
+	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
+		int j = 0;
+		long thispiece = (remaining < PAGE_SIZE) ?
+						remaining : PAGE_SIZE;
+
+		while (j < thispiece)
+			tmp[j++] = mincore_page(vma, start++);
+
+		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
+			error = -EFAULT;
+			break;
+		}
+	}
+
+	free_page((unsigned long) tmp);
+	return error;
+}
+
+/*
+ * The mincore(2) system call.
+ *
+ * mincore() returns the memory residency status of the pages in the
+ * current process's address space specified by [addr, addr + len).
+ * The status is returned in a vector of bytes.  The least significant
+ * bit of each byte is 1 if the referenced page is in memory, otherwise
+ * it is zero.
+ *
+ * Because the status of a page can change after mincore() checks it
+ * but before it returns to the application, the returned vector may
+ * contain stale information.  Only locked pages are guaranteed to
+ * remain in memory.
+ *
+ * return values:
+ *  zero    - success
+ *  -EFAULT - vec points to an illegal address
+ *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
+ *		or len has a nonpositive value
+ *  -ENOMEM - Addresses in the range [addr, addr + len] are
+ *		invalid for the address space of this process, or
+ *		specify one or more pages which are not currently
+ *		mapped
+ *  -EAGAIN - A kernel resource was temporarily unavailable.
+ */
+asmlinkage long sys_mincore(unsigned long start, size_t len,
+	unsigned char * vec)
+{
+	int index = 0;
+	unsigned long end;
+	struct vm_area_struct * vma;
+	int unmapped_error = 0;
+	long error = -EINVAL;
+
+	down(&current->mm->mmap_sem);
+
+	if (start & ~PAGE_MASK)
+		goto out;
+	len = (len + ~PAGE_MASK) & PAGE_MASK;
+	end = start + len;
+	if (end < start)
+		goto out;
+
+	error = 0;
+	if (end == start)
+		goto out;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 */
+	vma = find_vma(current->mm, start);
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			goto out;
+
+		/* Here start < vma->vm_end. */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end. */
+		if (end <= vma->vm_end) {
+			if (start < end) {
+				error = mincore_vma(vma, start, end,
+							&vec[index]);
+				if (error)
+					goto out;
+			}
+			error = unmapped_error;
+			goto out;
+		}
+
+		/* Here vma->vm_start <= start < vma->vm_end < end. */
+		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
+		if (error)
+			goto out;
+		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
+		start = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+out:
+	up(&current->mm->mmap_sem);
+	return error;
+}
+
 struct page *read_cache_page(struct address_space *mapping,
 				unsigned long index,
 				int (*filler)(void *,struct page*),

	- Chuck Lever
--
corporate:	<chuckl@netscape.com>
personal:	<chucklever@netscape.net> or <cel@monkey.org>

The Linux Scalability project:
	http://www.citi.umich.edu/projects/linux-scalability/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux.eu.org/Linux-MM/

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2000-03-13 21:32 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2000-03-13  0:45 [PATCH] mincore for i386, against 2.3.51 Chuck Lever
2000-03-13 17:46 ` Kanoj Sarcar
2000-03-13 18:16   ` Chuck Lever
2000-03-13 18:28     ` Kanoj Sarcar
2000-03-13 18:35       ` Linus Torvalds
2000-03-13 19:56         ` Chuck Lever
2000-03-13 20:31           ` Linus Torvalds
2000-03-13 21:32             ` Chuck Lever

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox