From: Ray Bryant <raybry@sgi.com>
To: William Lee Irwin III <wli@holomorphy.com>,
"Martin J. Bligh" <mbligh@aracnet.com>,
Andrew Morton <akpm@osdl.org>, Andi Kleen <ak@suse.de>,
Ray Bryant <raybry@austin.rr.com>
Cc: linux-mm <linux-mm@kvack.org>, Jesse Barnes <jbarnes@sgi.com>,
Dan Higgins <djh@sgi.com>, Dave Hansen <haveblue@us.ibm.com>,
lse-tech <lse-tech@lists.sourceforge.net>,
Brent Casavant <bcasavan@sgi.com>,
linux-kernel <linux-kernel@vger.kernel.org>,
Ray Bryant <raybry@sgi.com>, Paul Jackson <pj@sgi.com>,
Nick Piggin <piggin@cyberone.com.au>
Subject: [PATCH 2.6.9-rc2-mm1 1/2] mm: memory policy for page cache allocation
Date: Mon, 20 Sep 2004 12:00:38 -0700 (PDT) [thread overview]
Message-ID: <20040920190038.26965.18231.42543@tomahawk.engr.sgi.com> (raw)
In-Reply-To: <20040920190033.26965.64678.54625@tomahawk.engr.sgi.com>
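This patch adds a new memory policy, MPOL_ROUNDROBIN. It is like MPOL_INTERLEAVE,
but doesn't require a global offset or index to be specified: the next node is
taken from a per-task rotor (rr_next) that advances on every allocation.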
Index: linux-2.6.9-rc1-mm3-kdb-pagecache/include/linux/sched.h
===================================================================
--- linux-2.6.9-rc1-mm3-kdb-pagecache.orig/include/linux/sched.h 2004-09-03 09:45:42.000000000 -0700
+++ linux-2.6.9-rc1-mm3-kdb-pagecache/include/linux/sched.h 2004-09-03 09:47:42.000000000 -0700
@@ -596,6 +596,7 @@
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next; /* could be shared with used_math */
+ short rr_next;
#endif
#ifdef CONFIG_CPUSETS
struct cpuset *cpuset;
Index: linux-2.6.9-rc1-mm3-kdb-pagecache/mm/mempolicy.c
===================================================================
--- linux-2.6.9-rc1-mm3-kdb-pagecache.orig/mm/mempolicy.c 2004-09-03 09:45:40.000000000 -0700
+++ linux-2.6.9-rc1-mm3-kdb-pagecache/mm/mempolicy.c 2004-09-03 09:47:42.000000000 -0700
@@ -7,10 +7,17 @@
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
*
- * Support four policies per VMA and per process:
+ * Support five policies per VMA and per process:
*
* The VMA policy has priority over the process policy for a page fault.
*
+ * roundrobin Allocate memory round-robined over a set of nodes,
+ * with normal fallback if it fails. The round-robin is
+ * based on a per-thread rotor both to provide predictability
+ * of allocation locations and to avoid cacheline contention
+ * compared to a global rotor. This policy is distinct from
+ * interleave in that it seeks to distribute allocations evenly
+ * across nodes, whereas interleave seeks to maximize bandwidth.
* interleave Allocate memory interleaved over a set of nodes,
* with normal fallback if it fails.
* For VMA based allocations this interleaves based on the
@@ -117,6 +124,7 @@
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
+ case MPOL_ROUNDROBIN:
/* Preferred will only use the first bit, but allow
more for now. */
if (empty)
@@ -215,6 +223,7 @@
atomic_set(&policy->refcnt, 1);
switch (mode) {
case MPOL_INTERLEAVE:
+ case MPOL_ROUNDROBIN:
bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
break;
case MPOL_PREFERRED:
@@ -406,6 +415,8 @@
current->mempolicy = new;
if (new && new->policy == MPOL_INTERLEAVE)
current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+ if (new && new->policy == MPOL_ROUNDROBIN)
+ current->rr_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
return 0;
}
@@ -423,6 +434,7 @@
case MPOL_DEFAULT:
break;
case MPOL_INTERLEAVE:
+ case MPOL_ROUNDROBIN:
bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
break;
case MPOL_PREFERRED:
@@ -507,6 +519,9 @@
} else if (pol == current->mempolicy &&
pol->policy == MPOL_INTERLEAVE) {
pval = current->il_next;
+ } else if (pol == current->mempolicy &&
+ pol->policy == MPOL_ROUNDROBIN) {
+ pval = current->rr_next;
} else {
err = -EINVAL;
goto out;
@@ -585,6 +600,7 @@
return policy->v.zonelist;
/*FALL THROUGH*/
case MPOL_INTERLEAVE: /* should not happen */
+ case MPOL_ROUNDROBIN: /* should not happen */
case MPOL_DEFAULT:
nd = numa_node_id();
break;
@@ -595,6 +611,21 @@
return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
}
+/* Do dynamic round-robin for a process */
+static unsigned roundrobin_nodes(struct mempolicy *policy)
+{
+ unsigned nid, next;
+ struct task_struct *me = current;
+
+ nid = me->rr_next;
+ BUG_ON(nid >= MAX_NUMNODES);
+ next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
+ if (next >= MAX_NUMNODES)
+ next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+ me->rr_next = next;
+ return nid;
+}
+
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
@@ -646,6 +677,27 @@
return page;
}
+/* Allocate a page in round-robin policy.
+ Own path because first fallback needs to round-robin. */
+static struct page *alloc_page_roundrobin(unsigned gfp, unsigned order, struct mempolicy* policy)
+{
+ struct zonelist *zl;
+ struct page *page;
+ unsigned nid;
+ int i, numnodes = bitmap_weight(policy->v.nodes, MAX_NUMNODES);
+
+ for (i = 0; i < numnodes; i++) {
+ nid = roundrobin_nodes(policy);
+ BUG_ON(!test_bit(nid, (const volatile void *) &node_online_map));
+ zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
+ page = __alloc_pages(gfp, order, zl);
+ if (page)
+ return page;
+ }
+
+ return NULL;
+}
+
/**
* alloc_page_vma - Allocate a page for a VMA.
*
@@ -671,26 +723,30 @@
struct page *
alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
{
+ unsigned nid;
struct mempolicy *pol = get_vma_policy(vma, addr);
cpuset_update_current_mems_allowed();
- if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
- unsigned nid;
- if (vma) {
- unsigned long off;
- BUG_ON(addr >= vma->vm_end);
- BUG_ON(addr < vma->vm_start);
- off = vma->vm_pgoff;
- off += (addr - vma->vm_start) >> PAGE_SHIFT;
- nid = offset_il_node(pol, vma, off);
- } else {
- /* fall back to process interleaving */
- nid = interleave_nodes(pol);
- }
- return alloc_page_interleave(gfp, 0, nid);
+ switch (pol->policy) {
+ case MPOL_INTERLEAVE:
+ if (vma) {
+ unsigned long off;
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+ off = vma->vm_pgoff;
+ off += (addr - vma->vm_start) >> PAGE_SHIFT;
+ nid = offset_il_node(pol, vma, off);
+ } else {
+ /* fall back to process interleaving */
+ nid = interleave_nodes(pol);
+ }
+ return alloc_page_interleave(gfp, 0, nid);
+ case MPOL_ROUNDROBIN:
+ return alloc_page_roundrobin(gfp, 0, pol);
+ default:
+ return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
- return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
}
/**
@@ -716,8 +772,11 @@
cpuset_update_current_mems_allowed();
if (!pol || in_interrupt())
pol = &default_policy;
- if (pol->policy == MPOL_INTERLEAVE)
+ if (pol->policy == MPOL_INTERLEAVE) {
return alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ } else if (pol->policy == MPOL_ROUNDROBIN) {
+ return alloc_page_roundrobin(gfp, order, pol);
+ }
return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -754,6 +813,7 @@
case MPOL_DEFAULT:
return 1;
case MPOL_INTERLEAVE:
+ case MPOL_ROUNDROBIN:
return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
case MPOL_PREFERRED:
return a->v.preferred_node == b->v.preferred_node;
@@ -798,6 +858,8 @@
return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
case MPOL_INTERLEAVE:
return interleave_nodes(pol);
+ case MPOL_ROUNDROBIN:
+ return roundrobin_nodes(pol);
case MPOL_PREFERRED:
return pol->v.preferred_node >= 0 ?
pol->v.preferred_node : numa_node_id();
@@ -815,6 +877,7 @@
case MPOL_PREFERRED:
case MPOL_DEFAULT:
case MPOL_INTERLEAVE:
+ case MPOL_ROUNDROBIN:
return 1;
case MPOL_BIND: {
struct zone **z;
Index: linux-2.6.9-rc1-mm3-kdb-pagecache/include/linux/mempolicy.h
===================================================================
--- linux-2.6.9-rc1-mm3-kdb-pagecache.orig/include/linux/mempolicy.h 2004-08-27 10:06:15.000000000 -0700
+++ linux-2.6.9-rc1-mm3-kdb-pagecache/include/linux/mempolicy.h 2004-09-16 09:27:08.000000000 -0700
@@ -13,8 +13,9 @@
#define MPOL_PREFERRED 1
#define MPOL_BIND 2
#define MPOL_INTERLEAVE 3
+#define MPOL_ROUNDROBIN 4
-#define MPOL_MAX MPOL_INTERLEAVE
+#define MPOL_MAX MPOL_ROUNDROBIN
/* Flags for get_mem_policy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
--
Best Regards,
Ray
-----------------------------------------------
Ray Bryant raybry@sgi.com
The box said: "Requires Windows 98 or better",
so I installed Linux.
-----------------------------------------------
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"aart@kvack.org"> aart@kvack.org </a>
next prev parent reply other threads:[~2004-09-20 19:00 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2004-09-20 19:00 [PATCH 2.6.9-rc2-mm1 0/2] " Ray Bryant
2004-09-20 19:00 ` Ray Bryant [this message]
2004-09-20 19:00 ` [PATCH 2.6.9-rc2-mm1 2/2] " Ray Bryant
2004-09-20 20:22 ` [PATCH 2.6.9-rc2-mm1 0/2] " Paul Jackson
2004-09-20 20:55 ` Andi Kleen
2004-09-20 22:13 ` Ray Bryant
2004-09-20 22:37 ` Andi Kleen
2004-09-20 23:16 ` William Lee Irwin III
2004-09-21 1:30 ` Ray Bryant
2004-09-21 9:13 ` Andi Kleen
2004-09-21 9:33 ` William Lee Irwin III
2004-09-21 13:10 ` Ray Bryant
2004-09-20 22:38 ` Steve Longerbeam
2004-09-20 23:48 ` Steve Longerbeam
2004-09-23 15:54 ` [PATCH " Ray Bryant
2004-09-23 23:01 ` Steve Longerbeam
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20040920190038.26965.18231.42543@tomahawk.engr.sgi.com \
--to=raybry@sgi.com \
--cc=ak@suse.de \
--cc=akpm@osdl.org \
--cc=bcasavan@sgi.com \
--cc=djh@sgi.com \
--cc=haveblue@us.ibm.com \
--cc=jbarnes@sgi.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=lse-tech@lists.sourceforge.net \
--cc=mbligh@aracnet.com \
--cc=piggin@cyberone.com.au \
--cc=pj@sgi.com \
--cc=raybry@austin.rr.com \
--cc=wli@holomorphy.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox