* [PATCH] Get rid of zone_table
@ 2006-09-13 20:44 Christoph Lameter
2006-09-13 20:53 ` Dave Hansen
2006-09-14 10:33 ` Andy Whitcroft
0 siblings, 2 replies; 14+ messages in thread
From: Christoph Lameter @ 2006-09-13 20:44 UTC (permalink / raw)
To: Dave Hansen; +Cc: Andy Whitcroft, linux-mm
The zone table is mostly not needed. If we have a node in the page flags
then we can get to the zone via NODE_DATA(). In case of SMP and UP
NODE_DATA() is a constant pointer which allows us to access an exact
replica of zonetable in the node_zones field. In all of the above cases
there will be no need at all for the zone table.
The only remaining case is if in a NUMA system the node numbers do not fit
into the page flags. In that case we make sparse generate a table that
maps sections to nodes and use that table to figure out the node
number.
For sparsemem the zone table seems to have been fairly large based on
the maximum possible number of sections and the number of zones per node.
The section_to_node table (if we still need it) is still the size of the
number of sections but the individual elements are integers (which already
saves 50% on 64 bit platforms) and we do not need to duplicate the entries
per zone type. So even if we have to keep the table then we shrink it to
1/4th (32bit) or 1/8th (64bit).
Tested on IA64(NUMA) and x86_64 (UP)
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.18-rc6-mm2/include/linux/mm.h
===================================================================
--- linux-2.6.18-rc6-mm2.orig/include/linux/mm.h 2006-09-13 14:17:24.798144329 -0500
+++ linux-2.6.18-rc6-mm2/include/linux/mm.h 2006-09-13 15:42:22.040414207 -0500
@@ -395,7 +395,9 @@
* We are going to use the flags for the page to node mapping if its in
* there. This includes the case where there is no node, so it is implicit.
*/
-#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0)
+#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+#define NODE_NOT_IN_PAGE_FLAGS
+#endif
#ifndef PFN_SECTION_SHIFT
#define PFN_SECTION_SHIFT 0
@@ -410,13 +412,13 @@
#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
-/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
-#if FLAGS_HAS_NODE
-#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT)
+/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allcator */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
#else
-#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
+#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT)
#endif
-#define ZONETABLE_PGSHIFT ZONES_PGSHIFT
+#define ZONEID_PGSHIFT ZONES_PGSHIFT
#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
@@ -425,23 +427,24 @@
#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
-#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1)
+#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
static inline enum zone_type page_zonenum(struct page *page)
{
return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}
-struct zone;
-extern struct zone *zone_table[];
-
+/*
+ * The identification function is only used by the buddy allocator for
+ * determining if two pages could be buddies. We are not really
+ * identify a zone since we could be using a the section number
+ * id if we have not node id available in page flags.
+ * We guarantee only that it will return the same value for two
+ * combinable pages in a zone.
+ */
static inline int page_zone_id(struct page *page)
{
- return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
-}
-static inline struct zone *page_zone(struct page *page)
-{
- return zone_table[page_zone_id(page)];
+ return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}
static inline unsigned long zone_to_nid(struct zone *zone)
@@ -449,13 +452,20 @@
return zone->zone_pgdat->node_id;
}
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+extern unsigned long page_to_nid(struct page *page);
+#else
static inline unsigned long page_to_nid(struct page *page)
{
- if (FLAGS_HAS_NODE)
- return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
- else
- return zone_to_nid(page_zone(page));
+ return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
}
+#endif
+
+static inline struct zone *page_zone(struct page *page)
+{
+ return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
+}
+
static inline unsigned long page_to_section(struct page *page)
{
return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
@@ -472,6 +482,7 @@
page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}
+
static inline void set_page_section(struct page *page, unsigned long section)
{
page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
@@ -972,8 +983,6 @@
extern void show_mem(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
-extern void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
- unsigned long pfn, unsigned long size);
#ifdef CONFIG_NUMA
extern void setup_per_cpu_pageset(void);
Index: linux-2.6.18-rc6-mm2/mm/sparse.c
===================================================================
--- linux-2.6.18-rc6-mm2.orig/mm/sparse.c 2006-09-13 14:17:24.805957488 -0500
+++ linux-2.6.18-rc6-mm2/mm/sparse.c 2006-09-13 15:10:24.845606274 -0500
@@ -24,6 +24,21 @@
#endif
EXPORT_SYMBOL(mem_section);
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+/*
+ * If we did not store the node number in the page then we have to
+ * do a lookup in the section_to_node_table in order to find which
+ * node the page belongs to.
+ */
+static int section_to_node_table[NR_MEM_SECTIONS];
+
+extern unsigned long page_to_nid(struct page *page)
+{
+ return section_to_node_table[page_to_section(page)];
+}
+EXPORT_SYMBOL(page_to_nid);
+#endif
+
#ifdef CONFIG_SPARSEMEM_EXTREME
static struct mem_section *sparse_index_alloc(int nid)
{
@@ -49,6 +64,10 @@
struct mem_section *section;
int ret = 0;
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ section_to_node_table[section_nr] = nid;
+#endif
+
if (mem_section[root])
return -EEXIST;
Index: linux-2.6.18-rc6-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.18-rc6-mm2.orig/mm/page_alloc.c 2006-09-13 14:17:24.812794002 -0500
+++ linux-2.6.18-rc6-mm2/mm/page_alloc.c 2006-09-13 14:18:11.739602442 -0500
@@ -82,13 +82,6 @@
EXPORT_SYMBOL(totalram_pages);
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
static char *zone_names[MAX_NR_ZONES] = {
"DMA",
#ifdef CONFIG_ZONE_DMA32
@@ -1808,20 +1801,6 @@
}
}
-#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
- unsigned long pfn, unsigned long size)
-{
- unsigned long snum = pfn_to_section_nr(pfn);
- unsigned long end = pfn_to_section_nr(pfn + size);
-
- if (FLAGS_HAS_NODE)
- zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
- else
- for (; snum <= end; snum++)
- zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -2525,7 +2504,6 @@
if (!size)
continue;
- zonetable_add(zone, nid, j, zone_start_pfn, size);
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
zone_start_pfn += size;
Index: linux-2.6.18-rc6-mm2/mm/memory_hotplug.c
===================================================================
--- linux-2.6.18-rc6-mm2.orig/mm/memory_hotplug.c 2006-09-13 14:17:24.823537096 -0500
+++ linux-2.6.18-rc6-mm2/mm/memory_hotplug.c 2006-09-13 14:18:11.750345535 -0500
@@ -72,7 +72,6 @@
return ret;
}
memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
- zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
return 0;
}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-13 20:44 [PATCH] Get rid of zone_table Christoph Lameter
@ 2006-09-13 20:53 ` Dave Hansen
2006-09-13 21:40 ` Christoph Lameter
2006-09-14 10:33 ` Andy Whitcroft
1 sibling, 1 reply; 14+ messages in thread
From: Dave Hansen @ 2006-09-13 20:53 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andy Whitcroft, linux-mm
On Wed, 2006-09-13 at 13:44 -0700, Christoph Lameter wrote:
> The zone table is mostly not needed. If we have a node in the page flags
> then we can get to the zone via NODE_DATA(). In case of SMP and UP
> NODE_DATA() is a constant pointer which allows us to access an exact
> replica of zonetable in the node_zones field. In all of the above cases
> there will be no need at all for the zone table.
>
> The only remaining case is if in a NUMA system the node numbers do not fit
> into the page flags. In that case we make sparse generate a table that
> maps sections to nodes and use that table to to figure out the node
> number.
>
> For sparsemem the zone table seems to be have been fairly large based on
> the maximum possible number of sections and the number of zones per node.
>
> The section_to_node table (if we still need it) is still the size of the
> number of sections but the individual elements are integers (which already
> saves 50% on 64 bit platforms) and we do not need to duplicate the entries
> per zone type. So even if we have to keep the table then we shrink it to
> 1/4th (32bit) or 1/8th )(64bit).
It doesn't feel like this is the best fit to go with sparsemem, but the
impact is pretty tiny, and it does seem somewhat sensible to put it
there.
A few concerns: is there a cache or readability impact from keeping this
structure separate from the mem_section, when it is logically and
functionally pretty paired with it? It doesn't work with
SPARSEMEM_EXTREME (it would just blow up horribly), and this part at
least deserves a comment. Is there any impact from making this a
non-inlined call, unlike the old zonetable lookup?
-- Dave
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-13 20:53 ` Dave Hansen
@ 2006-09-13 21:40 ` Christoph Lameter
2006-09-13 21:47 ` Dave Hansen
0 siblings, 1 reply; 14+ messages in thread
From: Christoph Lameter @ 2006-09-13 21:40 UTC (permalink / raw)
To: Dave Hansen; +Cc: Andy Whitcroft, linux-mm
On Wed, 13 Sep 2006, Dave Hansen wrote:
> > The section_to_node table (if we still need it) is still the size of the
> > number of sections but the individual elements are integers (which already
> > saves 50% on 64 bit platforms) and we do not need to duplicate the entries
> > per zone type. So even if we have to keep the table then we shrink it to
> > 1/4th (32bit) or 1/8th )(64bit).
>
> It doesn't feel like this is the best fit to go with sparsemem, but the
> impact is pretty tiny, and it does seem somewhat sensible to put it
> there.
>
> A few concerns: is there a cache or readability impact from keeping this
> structure separate from the mem_section, when it is logically and
> functionally pretty paired with it? It doesn't work with
> SPARSEMEM_EXTREME (it would just blow up horribly), and this part at
> least deserves a comment. Is there any impact from making this a
> non-inlined call, unlike the old zonetable lookup?
I am not that familiar with sparsemem thats why I asked you about it at
first. I doubt there is much of an impact from making this non inlined.
IMHO it is clearer and easier to maintain if the code to do the section
lookup is put with the code that generates the sections. Its also an
exceptional thing that is not needed in general.
The main performance issue is probably the number of cachelines touched
and the situation gets better here even for the worst case that we have to
keep a separate lookup array. The array is denser.
For page_zone(page) one would have to do two lookups in the worst case.
One to get the node id and then another one in NODE_DATA() to get to the
zone. However, the NODE_DATA() is frequently referenced so it's likely to be
in cache. The existing 3 lookups for page_to_nid() are reduced
to a single lookup in the section_to_node_table(). Before we had to
determine the zone and then fetch the corresponding pgdat address and then
fetch the node number from the pgdat structure (yuck).
You could put the node number with the section (put it in a separate
cacheline before the start of the memsection array?) but then it would be
in a cacheline of its own. This way you have the node number of a set of
neighboring sections in one cacheline. With a 128 byte cacheline you
have the nodes for the 32 neighboring section of memory.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-13 21:40 ` Christoph Lameter
@ 2006-09-13 21:47 ` Dave Hansen
2006-09-13 21:54 ` Christoph Lameter
2006-09-15 13:28 ` Andy Whitcroft
0 siblings, 2 replies; 14+ messages in thread
From: Dave Hansen @ 2006-09-13 21:47 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andy Whitcroft, linux-mm
Now that I think about it, we should have room to encode that thing
inside of the section number on 32-bit platforms.
We have 32-bits of space, and we need to encode a number that is a
maximum of 4 bits in size. That leaves 28 bits minus the one that we
use for the section present bit. Our minimum section size on x86 is
something like 64 or 128MB. Let's say 64MB. So, on a 64GB system, we
only need 1k sections, and 10 bits.
So, the node number would almost certainly fit in the existing
mem_section. We'd just need to set it and mask it out.
Andy, what do you think?
-- Dave
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-13 21:47 ` Dave Hansen
@ 2006-09-13 21:54 ` Christoph Lameter
2006-09-13 21:58 ` Dave Hansen
2006-09-15 13:28 ` Andy Whitcroft
1 sibling, 1 reply; 14+ messages in thread
From: Christoph Lameter @ 2006-09-13 21:54 UTC (permalink / raw)
To: Dave Hansen; +Cc: Andy Whitcroft, linux-mm
On Wed, 13 Sep 2006, Dave Hansen wrote:
> Now that I think about it, we should have room to encode that thing
> inside of the section number on 32-bit platforms.
We already have 1k nodes on IA64 and you can expect 16k in the
near future. I think you need at least 16 bit.
Sorry I am a bit new to sparsemem but it seems that the mem sections are
arrays of pointers. You would like to store the node number in the lower
unused bits?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-13 21:54 ` Christoph Lameter
@ 2006-09-13 21:58 ` Dave Hansen
2006-09-13 22:02 ` Christoph Lameter
0 siblings, 1 reply; 14+ messages in thread
From: Dave Hansen @ 2006-09-13 21:58 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andy Whitcroft, linux-mm
On Wed, 2006-09-13 at 14:54 -0700, Christoph Lameter wrote:
> On Wed, 13 Sep 2006, Dave Hansen wrote:
> > Now that I think about it, we should have room to encode that thing
> > inside of the section number on 32-bit platforms.
>
> We already have 1k nodes on IA64 and you can expect 16k in the
> near future. I think you need at least 16 bit.
>
> Sorry I am a bit new to sparsemem but it seems that the mem sections are
> arrays of pointers. You would like to store the node number in the lower
> unused bits?
I thought this patch was only for 32-bit NUMA platforms that have run
out of bits in page->flags to encode the data. Does it apply to ia64 as
well somehow?
-- Dave
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-13 21:58 ` Dave Hansen
@ 2006-09-13 22:02 ` Christoph Lameter
0 siblings, 0 replies; 14+ messages in thread
From: Christoph Lameter @ 2006-09-13 22:02 UTC (permalink / raw)
To: Dave Hansen; +Cc: Andy Whitcroft, linux-mm
On Wed, 13 Sep 2006, Dave Hansen wrote:
> > Sorry I am a bit new to sparsemem but it seems that the mem sections are
> > arrays of pointers. You would like to store the node number in the lower
> > unused bits?
>
> I thought this patch was only for 32-bit NUMA platforms that have run
> out of bits in page->flags to encode the data. Does it apply to ia64 as
> well somehow?
Yes, the section_to_node_table is only for 32 bit NUMA platforms that ran
out of bits. Aha. Then you can work within the restrictions of that
environment and you do not have to be general.
If you only need 4 bits then you could take those from the first two
pointers of a memsection and maybe you could find them elsewhere.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-13 20:44 [PATCH] Get rid of zone_table Christoph Lameter
2006-09-13 20:53 ` Dave Hansen
@ 2006-09-14 10:33 ` Andy Whitcroft
2006-09-14 21:46 ` Christoph Lameter
1 sibling, 1 reply; 14+ messages in thread
From: Andy Whitcroft @ 2006-09-14 10:33 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Dave Hansen, linux-mm
[Apologies to Dave or yourself if what I am about to say has been
discussed already, but I wanted to read your intent and patch without
any preconceptions of it.]
First I wanted to be sure I have understood what you are proposing,
whilst trying to figure that out I came up with the following diagrams.
In the two diagrams below, we show the two main scenarios. First page
flags as they appear when NODE,ZONE will fit (FLATMEM and DISCONTIGMEM),
at the top of the diagram. Second page flags as they appear when
NODE,ZONE will not fit (SPARSEMEM on 32 bit), at the bottom of the
diagram. The boxes are intended to represent where there is an
indirection through a table/pointer.
Current implementation:
| Node | Zone | [Section] | xxxxx | Flags |
\___________/
|
|
v
+-----------+
| zonetable |-----> &zone
+-----------+
^
|
_____|__________________________
/ \
| Section | Zone | Flags |
Proposed implementation:
| Node | Zone | [Section] | xxxxx | Flags |
\____/ \____/
| |__________________
.- - -|- - - - - - - -. |
. v . v
. +-----------+ . +-----------+
. | node_data |--&node-->| NODE_DATA |----> &zone
. +-----------+ . +-----------+
. ^ . ^
- - -|- - - - - - - -A |
| |
+---------------+ |
| section_table | |
+---------------+ |
^ |
| |
__|_____________________ _|__
/ \ / \
| Section | Zone | Flags |
Christoph Lameter wrote:
> The zone table is mostly not needed. If we have a node in the page flags
> then we can get to the zone via NODE_DATA(). In case of SMP and UP
> NODE_DATA() is a constant pointer which allows us to access an exact
> replica of zonetable in the node_zones field. In all of the above cases
> there will be no need at all for the zone table.
Ok here we are talking about the segment of the second diagram ringed
and marked A. Yes the compiler/we should be able to optimise this case
to directly use the zonelist. However, this is also true of the current
scheme and would be a fairly trivial change in that framework.
Something like the below.
@@ -477,7 +477,10 @@ static inline int page_zone_id(struct pa
}
static inline struct zone *page_zone(struct page *page)
{
- return zone_table[page_zone_id(page)];
+ if (NODE_SHIFT)
+ return zone_table[page_zone_id(page)];
+ else
+ return NODE_DATA(0)->node_zones[page_zonenum(page)];
}
static inline unsigned long page_to_nid(struct page *page)
@@@
A similar thing could be done for page_to_nid which should always be zero.
> The only remaining case is if in a NUMA system the node numbers do not fit
> into the page flags. In that case we make sparse generate a table that
> maps sections to nodes and use that table to to figure out the node
> number.
>
> For sparsemem the zone table seems to be have been fairly large based on
> the maximum possible number of sections and the number of zones per node.
>
> The section_to_node table (if we still need it) is still the size of the
> number of sections but the individual elements are integers (which already
> saves 50% on 64 bit platforms) and we do not need to duplicate the entries
> per zone type. So even if we have to keep the table then we shrink it to
> 1/4th (32bit) or 1/8th )(64bit).
Ok, this is based on half for moving from a pointer to an integer. The
rest is based on the fact we have 4 zones. Given most sane
architectures only have ZONE_DMA we should be able to get a large
percentage of this saving just from knowing the highest 'valid' zone per
architecture.
If we consider the zone_table size for the easy case where NODE,ZONE is
in flags, this is of the order of (zones * nodes * sizeof(*)). As nodes
are numbered sequentially in most systems this has a cache foot print of
the order of (zones * active nodes * sizeof(*)). Worst case 4 * 1024 *
8 == 32KB, more typical usage of 4 * 8 * 8 = 256B.
Let us consider the likely sizes of the zone_table for a SPARSEMEM
configuration:
1) the 32bit case. Here we have a limitation of a maximum of 6 bits
worth of sections (64 of them). So the maximum zone_table size is 4 *
64 * 4 == 1024, so 1KB of zone_table.
2) the 64bit case. If we assume we have 1024 node limit plus 4 zones,
then we can have 256k sections before we will not be able to fit the
node in. So if we assume a 4MB section size (which is low) then we can
represent 1TB of ram before that occurs? As sections are intended to
represent the installable unit for a machine, that should tend to scale
with the overall memory size as machines tend to have a maximum physical
slots. So my expectations for a machine with 1TB of ram is that the
memory would come in 256MB or even 1GB increments. Thus we should be
able to represent 256TB without needing a zone_table. So any savings on
64bit feel illusory. All of these calculations are without the
optimisation of removing the zone when we only have 1 active zone, or
moving the zone down into the bottom half of the flags (logically at least).
>
> Tested on IA64(NUMA) and x86_64 (UP)
>
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
>
General comments. Although this may seem of the same order of
complexity and therefore a performance drop in, there does seem to be a
significant number of additional indirections on a NUMA system.
Particularly in 32 bit, for 64 bit we should never expect node/zone to
be absent from the flags. Of course there is a cache footprint trade
off here, which may make these additional indirections very cheap as
node_data may well be hot anyway so there is a case for comparative
benchmarks.
I can see a very valid case for optimising the UP/SMP case where
NODE_DATA is a constant. But that could be optimised as I indicate
above without a complete rewrite.
I guess this all means much more if you have a SPARSEMEM section/node
count configuration that significantly busts the 256TB/1024 node
combinations on 64bit.
Finally, if the change here was a valid one benchmark wise or whatever,
I think it would be nicer to push this in through the same interface we
currently have as that would allow other shaped zone_tables to be
brought back should a new memory layout come along.
-apw
> Index: linux-2.6.18-rc6-mm2/include/linux/mm.h
> ===================================================================
> --- linux-2.6.18-rc6-mm2.orig/include/linux/mm.h 2006-09-13 14:17:24.798144329 -0500
> +++ linux-2.6.18-rc6-mm2/include/linux/mm.h 2006-09-13 15:42:22.040414207 -0500
> @@ -395,7 +395,9 @@
> * We are going to use the flags for the page to node mapping if its in
> * there. This includes the case where there is no node, so it is implicit.
> */
> -#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0)
> +#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
> +#define NODE_NOT_IN_PAGE_FLAGS
> +#endif
>
> #ifndef PFN_SECTION_SHIFT
> #define PFN_SECTION_SHIFT 0
> @@ -410,13 +412,13 @@
> #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
> #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
>
> -/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
> -#if FLAGS_HAS_NODE
> -#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT)
> +/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allcator */
> +#ifdef NODE_NOT_IN_PAGE_FLAGS
> +#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
> #else
> -#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
> +#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT)
> #endif
> -#define ZONETABLE_PGSHIFT ZONES_PGSHIFT
> +#define ZONEID_PGSHIFT ZONES_PGSHIFT
>
> #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
> #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
> @@ -425,23 +427,24 @@
> #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
> #define NODES_MASK ((1UL << NODES_WIDTH) - 1)
> #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
> -#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1)
> +#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
>
> static inline enum zone_type page_zonenum(struct page *page)
> {
> return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
> }
>
> -struct zone;
> -extern struct zone *zone_table[];
> -
> +/*
> + * The identification function is only used by the buddy allocator for
> + * determining if two pages could be buddies. We are not really
> + * identify a zone since we could be using a the section number
> + * id if we have not node id available in page flags.
> + * We guarantee only that it will return the same value for two
> + * combinable pages in a zone.
> + */
> static inline int page_zone_id(struct page *page)
> {
> - return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
> -}
> -static inline struct zone *page_zone(struct page *page)
> -{
> - return zone_table[page_zone_id(page)];
> + return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
> }
>
> static inline unsigned long zone_to_nid(struct zone *zone)
> @@ -449,13 +452,20 @@
> return zone->zone_pgdat->node_id;
> }
>
> +#ifdef NODE_NOT_IN_PAGE_FLAGS
> +extern unsigned long page_to_nid(struct page *page);
> +#else
> static inline unsigned long page_to_nid(struct page *page)
> {
> - if (FLAGS_HAS_NODE)
> - return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> - else
> - return zone_to_nid(page_zone(page));
> + return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> }
> +#endif
> +
> +static inline struct zone *page_zone(struct page *page)
> +{
> + return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> +}
> +
> static inline unsigned long page_to_section(struct page *page)
> {
> return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
> @@ -472,6 +482,7 @@
> page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
> page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
> }
> +
> static inline void set_page_section(struct page *page, unsigned long section)
> {
> page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
> @@ -972,8 +983,6 @@
> extern void show_mem(void);
> extern void si_meminfo(struct sysinfo * val);
> extern void si_meminfo_node(struct sysinfo *val, int nid);
> -extern void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
> - unsigned long pfn, unsigned long size);
>
> #ifdef CONFIG_NUMA
> extern void setup_per_cpu_pageset(void);
> Index: linux-2.6.18-rc6-mm2/mm/sparse.c
> ===================================================================
> --- linux-2.6.18-rc6-mm2.orig/mm/sparse.c 2006-09-13 14:17:24.805957488 -0500
> +++ linux-2.6.18-rc6-mm2/mm/sparse.c 2006-09-13 15:10:24.845606274 -0500
> @@ -24,6 +24,21 @@
> #endif
> EXPORT_SYMBOL(mem_section);
>
> +#ifdef NODE_NOT_IN_PAGE_FLAGS
> +/*
> + * If we did not store the node number in the page then we have to
> + * do a lookup in the section_to_node_table in order to find which
> + * node the page belongs to.
> + */
> +static int section_to_node_table[NR_MEM_SECTIONS];
> +
> +extern unsigned long page_to_nid(struct page *page)
> +{
> + return section_to_node_table[page_to_section(page)];
> +}
> +EXPORT_SYMBOL(page_to_nid);
> +#endif
> +
> #ifdef CONFIG_SPARSEMEM_EXTREME
> static struct mem_section *sparse_index_alloc(int nid)
> {
> @@ -49,6 +64,10 @@
> struct mem_section *section;
> int ret = 0;
>
> +#ifdef NODE_NOT_IN_PAGE_FLAGS
> + section_to_node_table[section_nr] = nid;
> +#endif
> +
> if (mem_section[root])
> return -EEXIST;
>
> Index: linux-2.6.18-rc6-mm2/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.18-rc6-mm2.orig/mm/page_alloc.c 2006-09-13 14:17:24.812794002 -0500
> +++ linux-2.6.18-rc6-mm2/mm/page_alloc.c 2006-09-13 14:18:11.739602442 -0500
> @@ -82,13 +82,6 @@
>
> EXPORT_SYMBOL(totalram_pages);
>
> -/*
> - * Used by page_zone() to look up the address of the struct zone whose
> - * id is encoded in the upper bits of page->flags
> - */
> -struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
> -EXPORT_SYMBOL(zone_table);
> -
> static char *zone_names[MAX_NR_ZONES] = {
> "DMA",
> #ifdef CONFIG_ZONE_DMA32
> @@ -1808,20 +1801,6 @@
> }
> }
>
> -#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
> -void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
> - unsigned long pfn, unsigned long size)
> -{
> - unsigned long snum = pfn_to_section_nr(pfn);
> - unsigned long end = pfn_to_section_nr(pfn + size);
> -
> - if (FLAGS_HAS_NODE)
> - zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
> - else
> - for (; snum <= end; snum++)
> - zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
> -}
> -
> #ifndef __HAVE_ARCH_MEMMAP_INIT
> #define memmap_init(size, nid, zone, start_pfn) \
> memmap_init_zone((size), (nid), (zone), (start_pfn))
> @@ -2525,7 +2504,6 @@
> if (!size)
> continue;
>
> - zonetable_add(zone, nid, j, zone_start_pfn, size);
> ret = init_currently_empty_zone(zone, zone_start_pfn, size);
> BUG_ON(ret);
> zone_start_pfn += size;
> Index: linux-2.6.18-rc6-mm2/mm/memory_hotplug.c
> ===================================================================
> --- linux-2.6.18-rc6-mm2.orig/mm/memory_hotplug.c 2006-09-13 14:17:24.823537096 -0500
> +++ linux-2.6.18-rc6-mm2/mm/memory_hotplug.c 2006-09-13 14:18:11.750345535 -0500
> @@ -72,7 +72,6 @@
> return ret;
> }
> memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
> - zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
> return 0;
> }
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-14 10:33 ` Andy Whitcroft
@ 2006-09-14 21:46 ` Christoph Lameter
2006-09-15 13:07 ` Andy Whitcroft
0 siblings, 1 reply; 14+ messages in thread
From: Christoph Lameter @ 2006-09-14 21:46 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: Dave Hansen, linux-mm
On Thu, 14 Sep 2006, Andy Whitcroft wrote:
> Proposed implementation:
>
> | Node | Zone | [Section] | xxxxx | Flags |
> \____/ \____/
> | |__________________
> .- - -|- - - - - - - -. |
Right. There is one lookup here in the node_data array. The combination
with the zone is an address calculation and does not require a lookup.
> . v . v
> . +-----------+ . +-----------+
> . | node_data |--&node-->| NODE_DATA |----> &zone
> . +-----------+ . +-----------+
> . ^ . ^
> - - -|- - - - - - - -A |
> | |
> +---------------+ |
> | section_table | |
> +---------------+ |
Right here is the second lookup for the case in which the section does not
fit.
> ^ |
> | |
> __|_____________________ _|__
> / \ / \
> | Section | Zone | Flags |
>
>
> Christoph Lameter wrote:
> > The zone table is mostly not needed. If we have a node in the page flags
> > then we can get to the zone via NODE_DATA(). In case of SMP and UP
> > NODE_DATA() is a constant pointer which allows us to access an exact
> > replica of zonetable in the node_zones field. In all of the above cases
> > there will be no need at all for the zone table.
>
> Ok here we are talking about the segment of the second diagram ringed
> and marked A. Yes the compiler/we should be able to optimise this case
> to directly use the zonelist. However, this is also true of the current
> scheme and would be a fairly trivial change in that framework.
What would the compiler optimize? You mean the zonelist in the node
structure or the zonetable?
>
> Something like the below.
>
> @@ -477,7 +477,10 @@ static inline int page_zone_id(struct pa
> }
> static inline struct zone *page_zone(struct page *page)
> {
> - return zone_table[page_zone_id(page)];
> + if (NODE_SHIFT)
> + return zone_table[page_zone_id(page)];
> + else
> + return NODE_DATA(0)->node_zones[page_zonenum(page)];
> }
Yes that code was proposed in the RFC. See linux-mm. Dave suggested that
we can eliminate the zone_table or the section_to_node_table completely
because we can actually fit the node into the page flags with some
adjustments to sparsemem.
> A similar thing could be done for page_to_nid which should always be zero.
page_to_nid already uses page_zone in that case.
> > The section_to_node table (if we still need it) is still the size of the
> > number of sections but the individual elements are integers (which already
> > saves 50% on 64 bit platforms) and we do not need to duplicate the entries
> > per zone type. So even if we have to keep the table then we shrink it to
> > 1/4th (32bit) or 1/8th )(64bit).
>
> Ok, this is based on half for moving from a pointer to an integer. The
> rest is based on the fact we have 4 zones. Given most sane
> architectures only have ZONE_DMA we should be able to get a large
> percentage of this saving just from knowing the highest 'valid' zone per
> architecture.
NUMAQ only populates HIGHMEM on nodes other than zero. You will get
no benefit with such a scheme.
> Let us consider the likely sizes of the zone_table for a SPARSEMEM
> configuration:
>
> 1) the 32bit case. Here we have a limitation of a maximum of 6 bits
> worth of sections (64 of them). So the maximum zone_table size is 4 *
> 64 * 4 == 1024, so 1KB of zone_table.
Can we fit the node in there for all possible 32 bit NUMA machines?
> General comments. Although this may seem of the same order of
> complexity and therefore a performance drop in, there does seem to be a
> significant number of additional indirections on a NUMA system.
Could you tell me where the "indirections" come from? AFAIK there is only
one additional indirection that is offset by the NODE_DATA array being
cache hot. page_to_nid goes from 3 indirections to one with this scheme.
> I can see a very valid case for optimising the UP/SMP case where
> NODE_DATA is a constant. But that could be optimised as I indicate
> above without a complete rewrite.
Could you have a look at the RFC which does exactly that?
> Finally, if the change here was a valid one benchmark wise or whatever,
> I think it would be nicer to push this in through the same interface we
> currently have as that would allow other shaped zone_tables to be
> brought back should a new memory layout come along.
It would be best to eliminate the zone_table or my section_to_node_table
completely. The section_to_node_table does not require maintenance
in the page allocator as the zone_table does.
> > Index: linux-2.6.18-rc6-mm2/include/linux/mm.h
> > ===================================================================
I could not find any comments in here. Please cut down emails as much as
possible.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-14 21:46 ` Christoph Lameter
@ 2006-09-15 13:07 ` Andy Whitcroft
0 siblings, 0 replies; 14+ messages in thread
From: Andy Whitcroft @ 2006-09-15 13:07 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Dave Hansen, linux-mm
Christoph Lameter wrote:
> On Thu, 14 Sep 2006, Andy Whitcroft wrote:
>
>> Proposed implementation:
>>
>> | Node | Zone | [Section] | xxxxx | Flags |
>> \____/ \____/
>> | |__________________
>> .- - -|- - - - - - - -. |
>
> Right. There is one lookup here in the node_data array. The combination
> with the zone is an address calculation and does not require a lookup.
>
Yes, you are correct, the zones are in the node, so my diagram is
misleading; it should be:
| Node | Zone | [Section] | xxxxx | Flags |
\____/ \____/
| |__________________
.- - -|- - - - - - - -. |
. v . |
. +-----------+ . |
. | node_data |--&node----------+----> &zone
. +-----------+ . ^
. ^ . |
- - -|- - - - - - - -A |
| |
+---------------+ |
| section_table | |
+---------------+ |
^ |
| |
__|_____________________ _|__
/ \ / \
| Section | Zone | Flags |
Now as you say (in the non-SPARSEMEM case) this is equivalent to the
zone_table lookup, but has the advantage of the node_data already likely
to be hot. If we can say that node_data is sufficiently hot, then the
shrink in the size of the section_table and that together should help
amortise the additional lookup.
I think we do need to benchmark this comparatively to see.
I've also tried to respond to your comments below, but the key message
here is that this is an interesting change that we need to perf. test to
see its impact. If it's not measurable then it seems reasonable.
>> . v . v
>> . +-----------+ . +-----------+
>> . | node_data |--&node-->| NODE_DATA |----> &zone
>> . +-----------+ . +-----------+
>> . ^ . ^
>> - - -|- - - - - - - -A |
>> | |
>> +---------------+ |
>> | section_table | |
>> +---------------+ |
>
> Right here is the second lookup for the case in which the section does not
> fit.
>
>> ^ |
>> | |
>> __|_____________________ _|__
>> / \ / \
>> | Section | Zone | Flags |
>>
>>
>> Christoph Lameter wrote:
>>> The zone table is mostly not needed. If we have a node in the page flags
>>> then we can get to the zone via NODE_DATA(). In case of SMP and UP
>>> NODE_DATA() is a constant pointer which allows us to access an exact
>>> replica of zonetable in the node_zones field. In all of the above cases
>>> there will be no need at all for the zone table.
>> Ok here we are talking about the segment of the second diagram ringed
>> and marked A. Yes the compiler/we should be able to optimise this case
>> to directly use the zonelist. However, this is also true of the current
>> scheme and would be a fairly trivial change in that framework.
>
> What would the compiler optimize? You mean the zonelist in the node
> structure or the zonetable?
>
I am saying that when you express the zone lookup in terms of the
NODE_DATA(nid) in the non-NUMA case the compiler/optimiser has
sufficient information to make collapse the whole thing into a constant.
>> Something like the below.
>>
>> @@ -477,7 +477,10 @@ static inline int page_zone_id(struct pa
>> }
>> static inline struct zone *page_zone(struct page *page)
>> {
>> - return zone_table[page_zone_id(page)];
>> + if (NODE_SHIFT)
>> + return zone_table[page_zone_id(page)];
>> + else
>> + return NODE_DATA(0)->node_zones[page_zonenum(page)];
>> }
>
> Yes that code was proposed in the RFC. See linux-mm. Dave suggested that
> we can eliminate the zone_table or the section_to_node_table completely
> because we can actually fit the node into the page flags with some
> adjustments to sparsemem.
This patch fragment, implements the optimisation in your code (I've
boxed A) for the non-NUMA case, but does it within the old code
framework, i.e. just this change should be enough to get all the benefits
you indicate for UP and SMP. I am wondering if all the other change is
needed to get the benefit.
>> A similar thing could be done for page_to_nid which should always be zero.
>
> page_to_nid already uses page_zone in that case.
>
>>> The section_to_node table (if we still need it) is still the size of the
>>> number of sections but the individual elements are integers (which already
>>> saves 50% on 64 bit platforms) and we do not need to duplicate the entries
>>> per zone type. So even if we have to keep the table then we shrink it to
>>> 1/4th (32bit) or 1/8th )(64bit).
>> Ok, this is based on half for moving from a pointer to an integer. The
>> rest is based on the fact we have 4 zones. Given most sane
>> architectures only have ZONE_DMA we should be able to get a large
>> percentage of this saving just from knowing the highest 'valid' zone per
>> architecture.
>
> NUMAQ only populates HIGHMEM on nodes other than zero. You will get
> no benefit with such a scheme.
>
Yes, but NUMA-Q is not a sane architecture :). I am talking about the
places we more care, like 64bit. Though I tried to say later that I
don't think we ever hit this scenario for 64 bit systems.
>> Let us consider the likely sizes of the zone_table for a SPARSEMEM
>> configuration:
>>
>> 1) the 32bit case. Here we have a limitation of a maximum of 6 bits
>> worth of sections (64 of them). So the maximum zone_table size is 4 *
>> 64 * 4 == 1024, so 1KB of zone_table.
>
> Can we fit the node in there for all possible 32 bit NUMA machines?
No, it never fits in there when sparsemem is enabled on 32 bit, as we use
the same bits normally reserved for the node. The zone_table is
actually the same size either way in the current code.
>> General comments. Although this may seem of the same order of
>> complexity and therefore a performance drop in, there does seem to be a
>> significant number of additional indirections on a NUMA system.
>
> Could you tell me wher the "indirections" come from? AFAIK there is only
> one additional indirection that is offset by the NODE_DATA array being
> cache hot. page_to_nid goes from 3 indirections to one with this scheme.
With the clarification to the zone lookup we are indeed at one
additional indirection. The key decision when forming this layout was
that page_zone was used heavily on hot paths, page_to_nid was not used
very often at all. So an optimisation there was not as valuable; this
may of course no longer be the case.
>
>> I can see a very valid case for optimising the UP/SMP case where
>> NODE_DATA is a constant. But that could be optimised as I indicate
>> above without a complete rewrite.
>
> Could you have a look at the RFC wich does exactly that?
I did read the RFC, my point here was that you make two assertions, and
I was saying the first was clearly right and could be implemented
separately. The second seemed to need validating.
1) that in the UP/SMP case we don't need the zone_table at all its just
dumb having it -- this seems very valid, but can be implemented in the
current framework as above with just those three lines of change.
2) that the zone_table is huge on 64 bit systems with lots of nodes --
which could be true, but I have yet to be convinced. The key here is
that we need to force the NODE out of the flags for this code to strike
and I conjecture there isn't a 64 bit system that does it. Now your
changes for this add an additional indirection to shrink the table, but
the table is pretty small in the use cases where I'd expect to see it
used (32 bit sparsemem).
>> Finally, if the change here was a valid one benchmark wise or whatever,
>> I think it would be nicer to push this in through the same interface we
>> currently have as that would allow other shaped zone_tables to be
>> brought back should a new memory layout come along.
>
> It would be best to eliminate the zone_table or my section_to_node_table
> completely. The section_to_node_table does not require maintanance
> in the page allocator as the zone_table does.
>
>>> Index: linux-2.6.18-rc6-mm2/include/linux/mm.h
>>> ===================================================================
>
> I could not find any comments in here. Please cut down emails as much as
> possible.
Soz. Had conflicting requests on this one. My preference is for shortness.
-apw
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-13 21:47 ` Dave Hansen
2006-09-13 21:54 ` Christoph Lameter
@ 2006-09-15 13:28 ` Andy Whitcroft
2006-09-15 16:32 ` Dave Hansen
2006-09-15 17:13 ` Christoph Lameter
1 sibling, 2 replies; 14+ messages in thread
From: Andy Whitcroft @ 2006-09-15 13:28 UTC (permalink / raw)
To: Dave Hansen; +Cc: Christoph Lameter, linux-mm
Dave Hansen wrote:
> Now that I think about it, we should have room to encode that thing
> inside of the section number on 32-bit platforms.
>
> We have 32-bits of space, and we need to encode a number that is a
> maximum of 4 bits in size. That leaves 28 bits minus the one that we
> use for the section present bit. Our minimum section size on x86 is
> something like 64 or 128MB. Let's say 64MB. So, on a 64GB system, we
> only need 1k sections, and 10 bits.
>
> So, the node number would almost certainly fit in the existing
> mem_section. We'd just need to set it and mask it out.
>
> Andy, what do you think?
The flags field only has a 9 bit space for these value fields. Into
which we normally shove NODE,ZONE. With SPARSEMEM that is SECTION,ZONE
and so there is only room for 6-7 bits of information in this field.
The section table only contains an adjusted pointer to the mem_map for
that section? We use the bottom two bits of that pointer for a couple
of flags. I don't think there is any space in it.
Are you thinking of somewhere else?
-apw
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-15 13:28 ` Andy Whitcroft
@ 2006-09-15 16:32 ` Dave Hansen
2006-09-15 17:13 ` Christoph Lameter
1 sibling, 0 replies; 14+ messages in thread
From: Dave Hansen @ 2006-09-15 16:32 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: Christoph Lameter, linux-mm
On Fri, 2006-09-15 at 14:28 +0100, Andy Whitcroft wrote:
> The section table only contains an adjusted pointer to the mem_map for
> that section? We use the bottom two bits of that pointer for a couple
> of flags. I don't think there is any space in it.
For x86, we don't need very many bits. Maybe four. We also don't use a
very large number of sections on x86. That should leave space in the
mem_section[] pointer.
-- Dave
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-15 13:28 ` Andy Whitcroft
2006-09-15 16:32 ` Dave Hansen
@ 2006-09-15 17:13 ` Christoph Lameter
2006-09-15 17:51 ` Christoph Lameter
1 sibling, 1 reply; 14+ messages in thread
From: Christoph Lameter @ 2006-09-15 17:13 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: Dave Hansen, linux-mm
On Fri, 15 Sep 2006, Andy Whitcroft wrote:
> The flags field only has a 9 bit space for these value fields. Into
> which we normally shove NODE,ZONE. With SPARSEMEM that is SECTION,ZONE
> and so there is only room for 6-7 bits of information in this field.
>
> The section table only contains an adjusted pointer to the mem_map for
> that section? We use the bottom two bits of that pointer for a couple
> of flags. I don't think there is any space in it.
Great! If we only have 6-7 bits that means a max of 128 sections, right?
And you have always less than 256 nodes? How about making the
section_to_nid array a byte vector? It will then fit into one cacheline
and be only little less hot than NODE_DATA() so we should be even faster
than before. The zone_table is currently certainly much larger than a
single cacheline.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Get rid of zone_table
2006-09-15 17:13 ` Christoph Lameter
@ 2006-09-15 17:51 ` Christoph Lameter
0 siblings, 0 replies; 14+ messages in thread
From: Christoph Lameter @ 2006-09-15 17:51 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: Dave Hansen, linux-mm
Optimize section_to_node_table so that it fits in a cacheline
We change the type of the elements in the section to node table
to u8 if we have less than 256 nodes in the system. That way
we can have up to 128 sections in one cacheline which is all
that is necessary for some 32 bit NUMA platforms like NUMAQ to
keep section_to_node_table in a single cacheline and thus
make page_to_zone as fast or faster than before.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.18-rc6-mm2/mm/sparse.c
===================================================================
--- linux-2.6.18-rc6-mm2.orig/mm/sparse.c 2006-09-15 12:43:12.000000000 -0500
+++ linux-2.6.18-rc6-mm2/mm/sparse.c 2006-09-15 12:50:20.857430106 -0500
@@ -30,7 +30,11 @@ EXPORT_SYMBOL(mem_section);
* do a lookup in the section_to_node_table in order to find which
* node the page belongs to.
*/
-static int section_to_node_table[NR_MEM_SECTIONS];
+#if MAX_NUMNODES <= 256
+static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#else
+static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
+#endif
extern unsigned long page_to_nid(struct page *page)
{
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2006-09-15 17:51 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-09-13 20:44 [PATCH] Get rid of zone_table Christoph Lameter
2006-09-13 20:53 ` Dave Hansen
2006-09-13 21:40 ` Christoph Lameter
2006-09-13 21:47 ` Dave Hansen
2006-09-13 21:54 ` Christoph Lameter
2006-09-13 21:58 ` Dave Hansen
2006-09-13 22:02 ` Christoph Lameter
2006-09-15 13:28 ` Andy Whitcroft
2006-09-15 16:32 ` Dave Hansen
2006-09-15 17:13 ` Christoph Lameter
2006-09-15 17:51 ` Christoph Lameter
2006-09-14 10:33 ` Andy Whitcroft
2006-09-14 21:46 ` Christoph Lameter
2006-09-15 13:07 ` Andy Whitcroft
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox