linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] mm: page cache mempolicy for page cache allocation
@ 2004-09-23 18:56 Ray Bryant
  0 siblings, 0 replies; 4+ messages in thread
From: Ray Bryant @ 2004-09-23 18:56 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ray Bryant, William Lee Irwin III, Andrew Morton, linux-mm,
	Ray Bryant, Jesse Barnes, Dan Higgins, Dave Hansen, lse-tech,
	Brent Casavant, Martin J. Bligh, linux-kernel, Ray Bryant,
	Paul Jackson, Nick Piggin

Andi Kleen <ak@suse.de> wrote:

>Overall when I look at all the complications you add for the per process
>page policy which doesn't even have a demonstrated need I'm not sure
>it is really worth it.

Andi,

I guess the only argument I can make is that if I special case the page
cache allocation policy to not have a per process component, I'm either
going to have to create a separate set of stuff to get/set/use it, or I am
going to have to gunk up the existing code with logic like the following:

struct page *
alloc_pages_by_policy(unsigned gfp, unsigned order, unsigned policy)
{
	struct mempolicy *pol;
 
	if (policy >= NR_MEM_POLICIES)
		BUG();
+	if (policy == 0)
+ 		pol = current->mempolicy;
	if (!pol)
		pol = default_policy[policy];
	. . .

All in all, >>I<< think it is a wash either way, but given that I
can't point at an application that uses this requirement, I can't make
a strong argument.  I would observe again that a file server process
on a big HPC machine would likely want to have a different page cache
allocation policy than the HPC applications, but you could get the same
effect by creating a single node cpuset to hold the file server process.

(If we do find such an application, it is going to result in an API
change, assuming we don't support a per process page cache replacement
policy at the present time.)

(Also, what are we going to do if some OTHER policy class comes along
that does have a justifiable need for a per process override? To keep
all of this straight is going to be a mess.)

Just for comparison, I did a patch that removes the per process page
cache policy and annotated it with the changes.  (This patch is on
top of the previous 2 patches I sent.)  This patch can be found below.
Removing support for the per process page cache policy results in a net
change of one line (total) less code; it results in 8 changed lines, most
of these are such things as removing the subscript on current->mempolicy.

Given the above, if you still prefer no per process page cache allocation
policy, I'll merge this patch into the page cache policy patch and send
it out.

(I'm also asking around to see if I can find a suitable justification
for this general per process mempolicy stuff.)

I'll hold off sending out a new version of the patch that includes your
other suggestions until I hear back on this.

======================================================================
Index: linux-2.6.9-rc2-mm1/mm/mempolicy.c
===================================================================
--- linux-2.6.9-rc2-mm1.orig/mm/mempolicy.c	2004-09-23 10:52:34.000000000 -0700
+++ linux-2.6.9-rc2-mm1/mm/mempolicy.c	2004-09-23 11:03:33.000000000 -0700
@@ -418,6 +417,8 @@ asmlinkage long sys_set_mempolicy(int re
 
 	if ((mode > MPOL_MAX) || (policy >= NR_MEM_POLICIES))
 		return -EINVAL;
+	if (!request_policy_default && (policy > 0))            /* process add 2 */
+		return -EINVAL;
 	if (request_policy_default && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	err = get_nodes(nodes, nmask, maxnode, mode);
@@ -430,8 +431,8 @@ asmlinkage long sys_set_mempolicy(int re
 		mpol_free(default_policy[policy]);
 		default_policy[policy] = new;
 	} else {
-		mpol_free(current->mempolicy[policy]);
-		current->mempolicy[policy] = new;
+		mpol_free(current->mempolicy);			/* process change 2 */
+		current->mempolicy = new;
 	}
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = current->pid % MAX_NUMNODES;
@@ -521,9 +522,7 @@ asmlinkage long sys_get_mempolicy(int __
 		goto copy_policy_to_user;
 	}
 	if (policy_type>0) {
-		pol = current->mempolicy[policy_type];
-		if (!pol)
-			pol = default_policy[policy_type];
+		pol = default_policy[policy_type];		/* process del 2 */
 		goto copy_policy_to_user;
 	}
 
@@ -550,7 +549,7 @@ asmlinkage long sys_get_mempolicy(int __
 			if (err < 0)
 				goto out;
 			pval = err;
-		} else if (pol == current->mempolicy[policy_type] &&
+		} else if (pol == current->mempolicy &&		/* process change 1 */
 				pol->policy == MPOL_INTERLEAVE) {
 			pval = current->il_next;
 		} else {
@@ -662,7 +661,7 @@ asmlinkage long compat_mbind(compat_ulon
 static struct mempolicy *
 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = current->mempolicy[POLICY_PAGE];
+	struct mempolicy *pol = current->mempolicy;		/* process change 1 */
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -831,7 +830,8 @@ alloc_pages_by_policy(unsigned gfp, unsi
   
  	if (policy >= NR_MEM_POLICIES)
  		BUG();
- 	pol = current->mempolicy[policy];
+ 	if (policy == POLICY_PAGE)				/* process add 1 */
+ 		pol = current->mempolicy;                       /* process change 1 */
  	if (!pol)
  		pol = default_policy[policy];
 	if (!in_interrupt())
Index: linux-2.6.9-rc2-mm1/kernel/fork.c
===================================================================
--- linux-2.6.9-rc2-mm1.orig/kernel/fork.c	2004-09-23 10:52:34.000000000 -0700
+++ linux-2.6.9-rc2-mm1/kernel/fork.c	2004-09-23 10:58:07.000000000 -0700
@@ -865,14 +865,12 @@ static task_t *copy_process(unsigned lon
 	p->io_wait = NULL;
 	p->audit_context = NULL;
 #ifdef CONFIG_NUMA
-	for(i=0;i<NR_MEM_POLICIES;i++) {
-		p->mempolicy[i] = mpol_copy(p->mempolicy[i]);
-		if (IS_ERR(p->mempolicy[i])) {
-			retval = PTR_ERR(p->mempolicy[i]);
-			p->mempolicy[i] = NULL;
-			goto bad_fork_cleanup;
-		}
-	}
+ 	p->mempolicy = mpol_copy(p->mempolicy);		/* process del 2 */
+ 	if (IS_ERR(p->mempolicy)) {                     /* process change 3 */
+ 		retval = PTR_ERR(p->mempolicy);
+ 		p->mempolicy = NULL;
+ 		goto bad_fork_cleanup;
+ 	}
 	/* randomize placement of first page across nodes */
 	p->il_next = p->pid % MAX_NUMNODES;
 #endif
@@ -1042,8 +1040,7 @@ bad_fork_cleanup_security:
 	security_task_free(p);
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
-	for(i=0;i<NR_MEM_POLICIES;i++)
-		mpol_free(p->mempolicy[i]);
+	mpol_free(p->mempolicy);		/* process del 1 */
 #endif
 bad_fork_cleanup:
 	if (p->binfmt)
Index: linux-2.6.9-rc2-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.9-rc2-mm1.orig/include/linux/sched.h	2004-09-23 10:45:48.000000000 -0700
+++ linux-2.6.9-rc2-mm1/include/linux/sched.h	2004-09-23 11:04:33.000000000 -0700
@@ -30,7 +30,7 @@
 #include <linux/completion.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
-#include <linux/mempolicy.h>
+// #include <linux/mempolicy.h>		/* process del 1 */
 
 struct exec_domain;
 
@@ -743,7 +743,7 @@ struct task_struct {
  */
 	wait_queue_t *io_wait;
 #ifdef CONFIG_NUMA
-  	struct mempolicy *mempolicy[NR_MEM_POLICIES];
+  	struct mempolicy *mempolicy;		/* process change 1 */
   	short il_next;		/* could be shared with used_math */
 #endif
 #ifdef CONFIG_CPUSETS
Index: linux-2.6.9-rc2-mm1/kernel/exit.c
===================================================================
--- linux-2.6.9-rc2-mm1.orig/kernel/exit.c	2004-09-23 10:42:02.000000000 -0700
+++ linux-2.6.9-rc2-mm1/kernel/exit.c	2004-09-23 10:57:42.000000000 -0700
@@ -831,10 +831,8 @@ asmlinkage NORET_TYPE void do_exit(long 
 	tsk->exit_code = code;
 	exit_notify(tsk);
 #ifdef CONFIG_NUMA
-	for (i=0; i<NR_MEM_POLICIES; i++) {
-		mpol_free(tsk->mempolicy[i]);
-		tsk->mempolicy[i] = NULL;
-	}
+	mpol_free(tsk->mempolicy);		/* process del 2 */
+	tsk->mempolicy = NULL;			/* process change 2 */
 #endif
 	schedule();
 	BUG();
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/2] mm: page cache mempolicy for page cache allocation
  2004-09-23  9:24   ` Andi Kleen
@ 2004-09-24  4:12     ` Ray Bryant
  0 siblings, 0 replies; 4+ messages in thread
From: Ray Bryant @ 2004-09-24  4:12 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Ray Bryant, William Lee Irwin III, Andrew Morton, linux-mm,
	Jesse Barnes, Dan Higgins, Dave Hansen, lse-tech, Brent Casavant,
	Martin J. Bligh, linux-kernel, Paul Jackson, Nick Piggin

Andi Kleen wrote:

> 
> Overall when I look at all the complications you add for the per process
> page policy which doesn't even have a demonstrated need I'm not sure
> it is really worth it.
>


Polling people inside of SGI, they seem to think that a per file memory policy
is a good thing, but it needs to be settable from outside the process without
changing the header or code of the process (think of an ISV application that
we want to run on Altix.)  I can't quite get my head around what that means
(do you have to specify this externally based on the order that files are
opened in [e. g. file 1 has policy this, file 2 has policy that, etc] or does
one specify this by type of file [text, mapped file, etc]).  Does this end up
being effectively a per process policy with a per file override?  (e. g. all
files for this process are managed with policy "this", except for the 5th file
opened [or whatever] and it has policy "that".)

Steve -- how does your MTA design handle this?

Anyway, I'm about to throw in the towel on the per process page cache memory
policy.  I can't make a strong enough argument for it.

I assume that is acceptable, Andi?  :-)
-- 
Best Regards,
Ray
-----------------------------------------------
                   Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
            so I installed Linux.
-----------------------------------------------

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/2] mm: page cache mempolicy for page cache allocation
  2004-09-23  4:32 ` [PATCH 1/2] mm: page cache mempolicy " Ray Bryant
@ 2004-09-23  9:24   ` Andi Kleen
  2004-09-24  4:12     ` Ray Bryant
  0 siblings, 1 reply; 4+ messages in thread
From: Andi Kleen @ 2004-09-23  9:24 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Andi Kleen, William Lee Irwin III, Andrew Morton, linux-mm,
	Jesse Barnes, Dan Higgins, Dave Hansen, lse-tech, Brent Casavant,
	Martin J. Bligh, linux-kernel, Ray Bryant, Paul Jackson,
	Nick Piggin

> +/* policy selection bits are passed from user shifted left by this amount */
> +#define REQUEST_POLICY_SHIFT	16
> +#define REQUEST_POLICY_PAGE     POLICY_PAGE << REQUEST_POLICY_SHIFT
> +#define REQUEST_POLICY_PAGECACHE POLICY_PAGECACHE << REQUEST_POLICY_SHIFT
> +#define REQUEST_POLICY_MASK     (0x3FFF) << REQUEST_POLICY_SHIFT

Please put brackets around the macros. Putting them around numbers
is not needed though @)


> +#define REQUEST_POLICY_DEFAULT  (0x8000) << REQUEST_POLICY_SHIFT
> +
>  /* Flags for get_mem_policy */
>  #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
>  #define MPOL_F_ADDR	(1<<1)	/* look up vma using address */
> @@ -31,6 +54,8 @@
>  #include <linux/slab.h>
>  #include <linux/rbtree.h>
>  #include <asm/semaphore.h>
> +#include <linux/sched.h>
> +#include <asm/current.h>

Why is that needed? I don't see any users for this.  Please avoid this 
if possible, we already have too much include dependency spaghetti.


> --- linux-2.6.9-rc2-mm1.orig/include/linux/sched.h	2004-09-16 12:54:41.000000000 -0700
> +++ linux-2.6.9-rc2-mm1/include/linux/sched.h	2004-09-22 08:48:45.000000000 -0700
> @@ -31,6 +31,8 @@
>  #include <linux/pid.h>
>  #include <linux/percpu.h>
>  
> +#include <linux/mempolicy.h>

I also don't see why this should be needed. Please remove.

> +	for(i=0;i<NR_MEM_POLICIES;i++)

There should be more spaces here (similar in other loops) 


>  	int err, pval;
>  	struct mm_struct *mm = current->mm;
>  	struct vm_area_struct *vma = NULL;
> -	struct mempolicy *pol = current->mempolicy;
> +	struct mempolicy *pol = NULL;
> +	int policy_type, request_policy_default;
>  
>  	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
>  		return -EINVAL;
>  	if (nmask != NULL && maxnode < numnodes)
>  		return -EINVAL;
> +
> +	policy_type = (flags & REQUEST_POLICY_MASK) > REQUEST_POLICY_SHIFT;
> +	request_policy_default = (flags & REQUEST_POLICY_DEFAULT);

Why is that not an MPOL_F_* ? 

>  /* Slow path of a mempolicy copy */
>  struct mempolicy *__mpol_copy(struct mempolicy *old)
> @@ -1093,8 +1146,8 @@ void __init numa_policy_init(void)
>  	/* Set interleaving policy for system init. This way not all
>  	   the data structures allocated at system boot end up in node zero. */
>  
> -	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
> -							MAX_NUMNODES) < 0)
> +	if (sys_set_mempolicy(REQUEST_POLICY_PAGE | MPOL_INTERLEAVE, 
> +		nodes_addr(node_online_map), MAX_NUMNODES) < 0)

That's definitely wrong, the boot time interleaving is not for the page
cache but for all allocations. There are not even page cache allocations
that early.

Overall when I look at all the complications you add for the per process
page policy which doesn't even have a demonstrated need I'm not sure
it is really worth it.

>  		printk("numa_policy_init: interleaving failed\n");
>  }
>  
> @@ -1102,5 +1155,5 @@ void __init numa_policy_init(void)
>   * Assumes fs == KERNEL_DS */
>  void numa_default_policy(void)
>  {
> -	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
> +	sys_set_mempolicy(REQUEST_POLICY_PAGE | MPOL_DEFAULT, NULL, 0);

Same.

-Andi
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/2] mm: page cache mempolicy for page cache allocation
  2004-09-23  4:32 [PATCH 0/2] mm: memory policy " Ray Bryant
@ 2004-09-23  4:32 ` Ray Bryant
  2004-09-23  9:24   ` Andi Kleen
  0 siblings, 1 reply; 4+ messages in thread
From: Ray Bryant @ 2004-09-23  4:32 UTC (permalink / raw)
  To: Andi Kleen
  Cc: William Lee Irwin III, Andrew Morton, linux-mm, Jesse Barnes,
	Dan Higgins, Dave Hansen, lse-tech, Brent Casavant, Ray Bryant,
	Martin J. Bligh, linux-kernel, Ray Bryant, Paul Jackson,
	Nick Piggin

This is version 2 of the page cache memory policy patch.

Changes from the previous version:

(1)  This patch no longer requires MPOL_ROUNDROBIN so that patch
     has been deleted from this series.

(2)  This patch provides a mechanism for setting and getting
     not only the process's policies for allocating pages and
     page cache (if any), but also for getting and setting the
     system-wide default policies for these allocations.  (Admin
     capability is required to set the default policies.)
     Specification of which policy to set and whether it is 
     the page allocation policy or the page cache allocation
     policy is done in the upper bits of the first argument to
     sys_set_mempolicy() and in the flags argument of
     sys_get_mempolicy().  These values are defined so that
     existing users will not see a change.

     See sys_set_mempolicy(), sys_get_mempolicy() and
     include/linux/mempolicy.h for further details.

     It is expected that the default policies will be set during
     boot processing of startup scripts and will not be changed
     thereafter (without quiescing the system and/or flushing the
     page cache).

(3)  This patch uses the existing infrastructure from the 
     the previous version of alloc_pages_current() to do the
     round robin allocation of page cache pages across nodes
     if the page cache allocation policy is MPOL_INTERLEAVE.
     That is, this patch uses current->il_next and
     interleave_node() to decide what node to allocate the
     current page on. 

     This means that regular pages and page cache pages are
     allocated using the same "rotator" if both policies are
     MPOL_INTERLEAVE and avoids having to pass an offset,
     a dev_t, and an inode into page_cache_alloc().

Signed-off-by: Ray Bryant <raybry@sgi.com>

Index: linux-2.6.9-rc2-mm1/include/linux/gfp.h
===================================================================
--- linux-2.6.9-rc2-mm1.orig/include/linux/gfp.h	2004-09-16 12:54:27.000000000 -0700
+++ linux-2.6.9-rc2-mm1/include/linux/gfp.h	2004-09-22 08:48:44.000000000 -0700
@@ -92,7 +92,22 @@ static inline struct page *alloc_pages_n
 }
 
 #ifdef CONFIG_NUMA
-extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order);
+extern struct page *alloc_pages_by_policy(unsigned gfp, unsigned order, 
+	unsigned policy);
+
+static inline
+struct page *alloc_pages_current(unsigned gfp, unsigned order)
+{
+	/* 
+	 * include order keeps us from including mempolicy.h here
+	 * the following should be:
+	 *    return alloc_pages_by_policy(gfp, order, POLICY_PAGE);
+	 * but POLICY_PAGE is not defined yet.
+	 * We assume here that POLICY_PAGE is defined to be 0
+	 * See include/linux/mempolicy.h.
+	 */
+	return alloc_pages_by_policy(gfp, order, 0);
+}
 
 static inline struct page *
 alloc_pages(unsigned int gfp_mask, unsigned int order)
Index: linux-2.6.9-rc2-mm1/include/linux/mempolicy.h
===================================================================
--- linux-2.6.9-rc2-mm1.orig/include/linux/mempolicy.h	2004-09-16 10:41:23.000000000 -0700
+++ linux-2.6.9-rc2-mm1/include/linux/mempolicy.h	2004-09-22 08:48:44.000000000 -0700
@@ -16,6 +16,29 @@
 
 #define MPOL_MAX MPOL_INTERLEAVE
 
+/* 
+ * Policy indicies
+ * These specify the index into either the task->mempolicy array or the
+ * default_policy array to indicate which policy is to be used for a
+ * particular allocation.
+ */
+#define NR_MEM_POLICIES 	2
+/* policy to use for page allocation and the default kernel policy */
+/* this value is hard coded into alloc_pages() in gfp.h do not change it */
+#define POLICY_PAGE		0
+/* policy to use for pagecache allocation */
+#define POLICY_PAGECACHE 	1
+
+/* policy selection bits are passed from user shifted left by this amount */
+#define REQUEST_POLICY_SHIFT	16
+#define REQUEST_POLICY_PAGE     POLICY_PAGE << REQUEST_POLICY_SHIFT
+#define REQUEST_POLICY_PAGECACHE POLICY_PAGECACHE << REQUEST_POLICY_SHIFT
+#define REQUEST_POLICY_MASK     (0x3FFF) << REQUEST_POLICY_SHIFT
+#define REQUEST_MODE_MASK       (0xFFFF)
+/* by default, user requests are for the process policy -- this flag 
+ * informs sys_set_policy() that this request is for the default policy */
+#define REQUEST_POLICY_DEFAULT  (0x8000) << REQUEST_POLICY_SHIFT
+
 /* Flags for get_mem_policy */
 #define MPOL_F_NODE	(1<<0)	/* return next IL mode instead of node mask */
 #define MPOL_F_ADDR	(1<<1)	/* look up vma using address */
@@ -31,6 +54,8 @@
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <asm/semaphore.h>
+#include <linux/sched.h>
+#include <asm/current.h>
 
 struct vm_area_struct;
 
@@ -68,6 +93,9 @@ struct mempolicy {
 	} v;
 };
 
+extern struct page *
+alloc_pages_by_policy(unsigned gfp, unsigned order, unsigned int policy);
+
 /*
  * Support for managing mempolicy data objects (clone, copy, destroy)
  * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
Index: linux-2.6.9-rc2-mm1/include/linux/pagemap.h
===================================================================
--- linux-2.6.9-rc2-mm1.orig/include/linux/pagemap.h	2004-09-16 12:54:19.000000000 -0700
+++ linux-2.6.9-rc2-mm1/include/linux/pagemap.h	2004-09-22 08:48:45.000000000 -0700
@@ -50,6 +50,7 @@ static inline void mapping_set_gfp_mask(
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+#ifndef CONFIG_NUMA
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
 	return alloc_pages(mapping_gfp_mask(x), 0);
@@ -59,6 +60,30 @@ static inline struct page *page_cache_al
 {
 	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
 }
+#define page_cache_alloc_local((x)) page_cache_alloc((x))
+#else /* CONFIG_NUMA */
+
+struct mempolicy;
+extern struct mempolicy *default_policy[];
+extern struct page *
+alloc_pages_by_policy(unsigned gfp, unsigned order, unsigned policy);
+
+static inline struct page *page_cache_alloc_local(struct address_space *x)
+{
+	return alloc_pages(mapping_gfp_mask(x), 0);
+}
+
+static inline struct page *page_cache_alloc(struct address_space *x)
+{
+	return alloc_pages_by_policy(mapping_gfp_mask(x), 0, POLICY_PAGECACHE);
+}
+
+static inline struct page *page_cache_alloc_cold(struct address_space *x)
+{
+	return alloc_pages_by_policy(mapping_gfp_mask(x)|__GFP_COLD, 0, 
+		POLICY_PAGECACHE);
+}
+#endif
 
 typedef int filler_t(void *, struct page *);
 
Index: linux-2.6.9-rc2-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.9-rc2-mm1.orig/include/linux/sched.h	2004-09-16 12:54:41.000000000 -0700
+++ linux-2.6.9-rc2-mm1/include/linux/sched.h	2004-09-22 08:48:45.000000000 -0700
@@ -31,6 +31,8 @@
 #include <linux/pid.h>
 #include <linux/percpu.h>
 
+#include <linux/mempolicy.h>
+
 struct exec_domain;
 
 /*
@@ -588,7 +590,6 @@ int set_current_groups(struct group_info
 
 
 struct audit_context;		/* See audit.c */
-struct mempolicy;
 
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
@@ -743,7 +744,7 @@ struct task_struct {
  */
 	wait_queue_t *io_wait;
 #ifdef CONFIG_NUMA
-  	struct mempolicy *mempolicy;
+  	struct mempolicy *mempolicy[NR_MEM_POLICIES];
   	short il_next;		/* could be shared with used_math */
 #endif
 #ifdef CONFIG_CPUSETS
Index: linux-2.6.9-rc2-mm1/kernel/exit.c
===================================================================
--- linux-2.6.9-rc2-mm1.orig/kernel/exit.c	2004-09-16 12:54:32.000000000 -0700
+++ linux-2.6.9-rc2-mm1/kernel/exit.c	2004-09-22 08:48:45.000000000 -0700
@@ -785,6 +785,7 @@ static void exit_notify(struct task_stru
 asmlinkage NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
+	int i;
 
 	profile_task_exit(tsk);
 
@@ -830,8 +831,10 @@ asmlinkage NORET_TYPE void do_exit(long 
 	tsk->exit_code = code;
 	exit_notify(tsk);
 #ifdef CONFIG_NUMA
-	mpol_free(tsk->mempolicy);
-	tsk->mempolicy = NULL;
+	for(i=0;i<NR_MEM_POLICIES;i++) {
+		mpol_free(tsk->mempolicy[i]);
+		tsk->mempolicy[i] = NULL;
+	}
 #endif
 	schedule();
 	BUG();
Index: linux-2.6.9-rc2-mm1/kernel/fork.c
===================================================================
--- linux-2.6.9-rc2-mm1.orig/kernel/fork.c	2004-09-22 08:08:18.000000000 -0700
+++ linux-2.6.9-rc2-mm1/kernel/fork.c	2004-09-22 08:48:45.000000000 -0700
@@ -776,7 +776,7 @@ static task_t *copy_process(unsigned lon
 				 int __user *child_tidptr,
 				 int pid)
 {
-	int retval;
+	int retval, i;
 	struct task_struct *p = NULL;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
@@ -865,12 +865,14 @@ static task_t *copy_process(unsigned lon
 	p->io_wait = NULL;
 	p->audit_context = NULL;
 #ifdef CONFIG_NUMA
- 	p->mempolicy = mpol_copy(p->mempolicy);
- 	if (IS_ERR(p->mempolicy)) {
- 		retval = PTR_ERR(p->mempolicy);
- 		p->mempolicy = NULL;
- 		goto bad_fork_cleanup;
- 	}
+	for(i=0;i<NR_MEM_POLICIES;i++) {
+		p->mempolicy[i] = mpol_copy(p->mempolicy[i]);
+		if (IS_ERR(p->mempolicy[i])) {
+			retval = PTR_ERR(p->mempolicy[i]);
+			p->mempolicy[i] = NULL;
+			goto bad_fork_cleanup;
+		}
+	}
 #endif
 
 	p->tgid = p->pid;
@@ -1038,7 +1040,8 @@ bad_fork_cleanup_security:
 	security_task_free(p);
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
-	mpol_free(p->mempolicy);
+	for(i=0;i<NR_MEM_POLICIES;i++)
+		mpol_free(p->mempolicy[i]);
 #endif
 bad_fork_cleanup:
 	if (p->binfmt)
Index: linux-2.6.9-rc2-mm1/mm/mempolicy.c
===================================================================
--- linux-2.6.9-rc2-mm1.orig/mm/mempolicy.c	2004-09-16 12:54:20.000000000 -0700
+++ linux-2.6.9-rc2-mm1/mm/mempolicy.c	2004-09-22 11:46:20.000000000 -0700
@@ -87,11 +87,27 @@ static kmem_cache_t *sn_cache;
    policied. */
 static int policy_zone;
 
-static struct mempolicy default_policy = {
+/*
+ * the default policies for page allocation, page cache allocation
+ */
+static struct mempolicy default_kernel_mempolicy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
 	.policy = MPOL_DEFAULT,
 };
 
+struct mempolicy default_pagecache_mempolicy = {
+	.refcnt  = ATOMIC_INIT(1), /* never free it */
+	.policy  = MPOL_DEFAULT,
+};
+
+/*
+ * references to the default policies are via indexes into this array
+ */
+struct mempolicy *default_policy[NR_MEM_POLICIES] = {
+		&default_kernel_mempolicy, 
+		&default_pagecache_mempolicy,
+};
+
 /* Check if all specified nodes are online */
 static int nodes_online(unsigned long *nodes)
 {
@@ -389,23 +405,34 @@ asmlinkage long sys_mbind(unsigned long 
 }
 
 /* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+asmlinkage long sys_set_mempolicy(int request, unsigned long __user *nmask,
 				   unsigned long maxnode)
 {
-	int err;
+	int err, mode, policy, request_policy_default;
 	struct mempolicy *new;
 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 
-	if (mode > MPOL_MAX)
+	mode = request & REQUEST_MODE_MASK;
+	policy = (request & REQUEST_POLICY_MASK) >> REQUEST_POLICY_SHIFT;
+	request_policy_default= request & REQUEST_POLICY_DEFAULT;
+
+	if ((mode > MPOL_MAX) || (policy >= NR_MEM_POLICIES))
 		return -EINVAL;
+	if (request_policy_default && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
 	err = get_nodes(nodes, nmask, maxnode, mode);
 	if (err)
 		return err;
 	new = mpol_new(mode, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
-	mpol_free(current->mempolicy);
-	current->mempolicy = new;
+	if (request_policy_default) {
+		mpol_free(default_policy[policy]);
+		default_policy[policy] = new;
+	} else {
+		mpol_free(current->mempolicy[policy]);
+		current->mempolicy[policy] = new;
+	}
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
 	return 0;
@@ -477,12 +504,29 @@ asmlinkage long sys_get_mempolicy(int __
 	int err, pval;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = NULL;
+	int policy_type, request_policy_default;
 
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
 	if (nmask != NULL && maxnode < numnodes)
 		return -EINVAL;
+
+	policy_type = (flags & REQUEST_POLICY_MASK) > REQUEST_POLICY_SHIFT;
+	request_policy_default = (flags & REQUEST_POLICY_DEFAULT);
+	if (policy_type >= NR_MEM_POLICIES)
+		return -EINVAL;
+	if (request_policy_default) {
+		pol = default_policy[policy_type];
+		goto copy_policy_to_user;
+	}
+	if (policy_type>0) {
+		pol = current->mempolicy[policy_type];
+		if (!pol)
+			pol = default_policy[policy_type];
+		goto copy_policy_to_user;
+	}
+
 	if (flags & MPOL_F_ADDR) {
 		down_read(&mm->mmap_sem);
 		vma = find_vma_intersection(mm, addr, addr+1);
@@ -498,7 +542,7 @@ asmlinkage long sys_get_mempolicy(int __
 		return -EINVAL;
 
 	if (!pol)
-		pol = &default_policy;
+		pol = default_policy[policy_type];
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
@@ -506,7 +550,7 @@ asmlinkage long sys_get_mempolicy(int __
 			if (err < 0)
 				goto out;
 			pval = err;
-		} else if (pol == current->mempolicy &&
+		} else if (pol == current->mempolicy[policy_type] &&
 				pol->policy == MPOL_INTERLEAVE) {
 			pval = current->il_next;
 		} else {
@@ -520,6 +564,7 @@ asmlinkage long sys_get_mempolicy(int __
 	if (policy && put_user(pval, policy))
 		goto out;
 
+copy_policy_to_user:
 	err = 0;
 	if (nmask) {
 		DECLARE_BITMAP(nodes, MAX_NUMNODES);
@@ -538,7 +583,8 @@ asmlinkage long sys_get_mempolicy(int __
 asmlinkage long compat_get_mempolicy(int __user *policy,
 				     compat_ulong_t __user *nmask,
 				     compat_ulong_t maxnode,
-				     compat_ulong_t addr, compat_ulong_t flags)
+				     compat_ulong_t addr, compat_ulong_t flags,
+				     compat_uint_t policy_index)
 {
 	long err;
 	unsigned long __user *nm = NULL;
@@ -616,7 +662,7 @@ asmlinkage long compat_mbind(compat_ulon
 static struct mempolicy *
 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = current->mempolicy[POLICY_PAGE];
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -626,7 +672,7 @@ get_vma_policy(struct vm_area_struct *vm
 			pol = vma->vm_policy;
 	}
 	if (!pol)
-		pol = &default_policy;
+		pol = default_policy[POLICY_PAGE];
 	return pol;
 }
 
@@ -758,7 +804,7 @@ alloc_page_vma(unsigned gfp, struct vm_a
 }
 
 /**
- * 	alloc_pages_current - Allocate pages.
+ * 	alloc_pages_by_policy - Allocate pages using a given mempolicy
  *
  *	@gfp:
  *		%GFP_USER   user allocation,
@@ -767,24 +813,31 @@ alloc_page_vma(unsigned gfp, struct vm_a
  *      	%GFP_FS     don't call back into a file system.
  *      	%GFP_ATOMIC don't sleep.
  *	@order: Power of two of allocation size in pages. 0 is a single page.
+ *	@policy:Index of the mempolicy struct to use for this allocation
  *
  *	Allocate a page from the kernel page pool.  When not in
  *	interrupt context and apply the current process NUMA policy.
  *	Returns NULL when no page can be allocated.
  */
-struct page *alloc_pages_current(unsigned gfp, unsigned order)
+struct page *
+alloc_pages_by_policy(unsigned gfp, unsigned order, unsigned policy)
 {
-	struct mempolicy *pol = current->mempolicy;
-
+ 	struct mempolicy *pol;
+  
+ 	if (policy >= NR_MEM_POLICIES)
+ 		BUG();
+ 	pol = current->mempolicy[policy];
+ 	if (!pol)
+ 		pol = default_policy[policy];
 	if (!in_interrupt())
 		cpuset_update_current_mems_allowed();
 	if (!pol || in_interrupt())
-		pol = &default_policy;
+		pol = default_policy[policy];
 	if (pol->policy == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
 }
-EXPORT_SYMBOL(alloc_pages_current);
+EXPORT_SYMBOL(alloc_pages_by_policy);
 
 /* Slow path of a mempolicy copy */
 struct mempolicy *__mpol_copy(struct mempolicy *old)
@@ -1093,8 +1146,8 @@ void __init numa_policy_init(void)
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
-							MAX_NUMNODES) < 0)
+	if (sys_set_mempolicy(REQUEST_POLICY_PAGE | MPOL_INTERLEAVE, 
+		nodes_addr(node_online_map), MAX_NUMNODES) < 0)
 		printk("numa_policy_init: interleaving failed\n");
 }
 
@@ -1102,5 +1155,5 @@ void __init numa_policy_init(void)
  * Assumes fs == KERNEL_DS */
 void numa_default_policy(void)
 {
-	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+	sys_set_mempolicy(REQUEST_POLICY_PAGE | MPOL_DEFAULT, NULL, 0);
 }
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2004-09-24  4:12 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-09-23 18:56 [PATCH 1/2] mm: page cache mempolicy for page cache allocation Ray Bryant
  -- strict thread matches above, loose matches on Subject: below --
2004-09-23  4:32 [PATCH 0/2] mm: memory policy " Ray Bryant
2004-09-23  4:32 ` [PATCH 1/2] mm: page cache mempolicy " Ray Bryant
2004-09-23  9:24   ` Andi Kleen
2004-09-24  4:12     ` Ray Bryant

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox