* [PATCH 1/8] sparsemem: clean up spelling error in comments
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
@ 2007-05-22 22:58 ` Andy Whitcroft
2007-05-22 22:58 ` [PATCH 2/8] sparsemem: record when a section has a valid mem_map Andy Whitcroft
` (10 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Andy Whitcroft @ 2007-05-22 22:58 UTC (permalink / raw)
To: linux-mm
Cc: linux-arch, Nick Piggin, Christoph Lameter, Mel Gorman, Andy Whitcroft
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
---
diff --git a/mm/sparse.c b/mm/sparse.c
index cb105a6..caa7e1b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -101,7 +101,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
/*
* Although written for the SPARSEMEM_EXTREME case, this happens
- * to also work for the flat array case becase
+ * to also work for the flat array case because
* NR_SECTION_ROOTS==NR_MEM_SECTIONS.
*/
int __section_nr(struct mem_section* ms)
* [PATCH 2/8] sparsemem: record when a section has a valid mem_map
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
2007-05-22 22:58 ` [PATCH 1/8] sparsemem: clean up spelling error in comments Andy Whitcroft
@ 2007-05-22 22:58 ` Andy Whitcroft
2007-05-22 22:59 ` [PATCH 3/8] Generic Virtual Memmap support for SPARSEMEM V4 Andy Whitcroft
` (9 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Andy Whitcroft @ 2007-05-22 22:58 UTC (permalink / raw)
To: linux-mm
Cc: linux-arch, Nick Piggin, Christoph Lameter, Mel Gorman, Andy Whitcroft
We have a flag to indicate whether a section actually has a valid
mem_map associated with it, but it is never set and we rely solely
on the present bit to indicate that a section is valid. By definition
a section is not valid if it has no mem_map, and there is a window
during init where the present bit is set but there is no mem_map,
during which pfn_valid() will incorrectly return true.
Use the existing SECTION_HAS_MEM_MAP flag to indicate the presence
of a valid mem_map. Switch valid_section{,_nr} and pfn_valid()
to this bit. Add new present_section{,_nr} and pfn_present()
interfaces for those users who care to know that a section is
going to be valid.
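As a rough illustration (a sketch only, not part of the patch; the flag
values shown match include/linux/mmzone.h of this era), both states are
encoded in the low bits of section_mem_map:

	/* Section state bits assumed by this change. */
	#define SECTION_MARKED_PRESENT	(1UL<<0)	/* section exists */
	#define SECTION_HAS_MEM_MAP	(1UL<<1)	/* mem_map allocated */

	/*
	 * A section is only usable once both steps have happened; between
	 * memory_present() setting the present bit and the mem_map being
	 * instantiated only SECTION_MARKED_PRESENT is set.
	 */
	static inline int section_is_usable(unsigned long encoded)
	{
		return (encoded & SECTION_MARKED_PRESENT) &&
		       (encoded & SECTION_HAS_MEM_MAP);
	}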
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
---
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 74b9679..f1f0af8 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -239,7 +239,7 @@ store_mem_state(struct sys_device *dev, const char *buf, size_t count)
mem = container_of(dev, struct memory_block, sysdev);
phys_section_nr = mem->phys_index;
- if (!valid_section_nr(phys_section_nr))
+ if (!present_section_nr(phys_section_nr))
goto out;
if (!strncmp(buf, "online", min((int)count, 6)))
@@ -419,7 +419,7 @@ int register_new_memory(struct mem_section *section)
int unregister_memory_section(struct mem_section *section)
{
- if (!valid_section(section))
+ if (!present_section(section))
return -EINVAL;
return remove_memory_block(0, section, 0);
@@ -444,7 +444,7 @@ int __init memory_dev_init(void)
* during boot and have been initialized
*/
for (i = 0; i < NR_MEM_SECTIONS; i++) {
- if (!valid_section_nr(i))
+ if (!present_section_nr(i))
continue;
err = add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0);
if (!ret)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6609481..5a3ea4d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -781,12 +781,17 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section)
return (struct page *)map;
}
-static inline int valid_section(struct mem_section *section)
+static inline int present_section(struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
}
-static inline int section_has_mem_map(struct mem_section *section)
+static inline int present_section_nr(unsigned long nr)
+{
+ return present_section(__nr_to_section(nr));
+}
+
+static inline int valid_section(struct mem_section *section)
{
return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
}
@@ -808,6 +813,13 @@ static inline int pfn_valid(unsigned long pfn)
return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
}
+static inline int pfn_present(unsigned long pfn)
+{
+ if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
+ return 0;
+ return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
+}
+
/*
* These are _only_ used during initialisation, therefore they
* can use __initdata ... They could have names to indicate
diff --git a/mm/sparse.c b/mm/sparse.c
index caa7e1b..d64e628 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -170,7 +170,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
if (nid != early_pfn_to_nid(pfn))
continue;
- if (pfn_valid(pfn))
+ if (pfn_present(pfn))
nr_pages += PAGES_PER_SECTION;
}
@@ -201,11 +201,12 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
unsigned long *pageblock_bitmap)
{
- if (!valid_section(ms))
+ if (!present_section(ms))
return -EINVAL;
ms->section_mem_map &= ~SECTION_MAP_MASK;
- ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
+ ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
+ SECTION_HAS_MEM_MAP;
ms->pageblock_flags = pageblock_bitmap;
return 1;
@@ -308,7 +309,7 @@ void __init sparse_init(void)
unsigned long *usemap;
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
- if (!valid_section_nr(pnum))
+ if (!present_section_nr(pnum))
continue;
map = sparse_early_mem_map_alloc(pnum);
* [PATCH 3/8] Generic Virtual Memmap support for SPARSEMEM V4
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
2007-05-22 22:58 ` [PATCH 1/8] sparsemem: clean up spelling error in comments Andy Whitcroft
2007-05-22 22:58 ` [PATCH 2/8] sparsemem: record when a section has a valid mem_map Andy Whitcroft
@ 2007-05-22 22:59 ` Andy Whitcroft
2007-05-23 5:15 ` Christoph Lameter
2007-05-22 23:00 ` [PATCH 4/8] x86_64: SPARSEMEM_VMEMMAP 2M page size support Andy Whitcroft
` (8 subsequent siblings)
11 siblings, 1 reply; 16+ messages in thread
From: Andy Whitcroft @ 2007-05-22 22:59 UTC (permalink / raw)
To: linux-mm
Cc: linux-arch, Nick Piggin, Christoph Lameter, Mel Gorman, Andy Whitcroft
SPARSEMEM is a pretty nice framework that unifies quite a bit of
code over all the arches. It would be great if it could be the
default so that we can get rid of various forms of DISCONTIG and
other variations on memory maps. So far what has hindered this are
the additional lookups that SPARSEMEM introduces for virt_to_page
and page_address. This goes so far that the code to do this has to
be kept in a separate function and cannot be used inline.
This patch introduces a virtual memmap mode for SPARSEMEM, in which
the memmap is mapped into a virtually contiguous area and only the
active sections are physically backed. This allows virt_to_page,
page_address and cohorts to become simple shift/add operations.
No page flag fields, no table lookups, nothing involving memory
is required.
The two key operations pfn_to_page and page_to_pfn become:
#define __pfn_to_page(pfn) (vmemmap + (pfn))
#define __page_to_pfn(page) ((page) - vmemmap)
By having a virtual mapping for the memmap we allow simple access
without wasting physical memory. As kernel memory is typically
already mapped 1:1 this introduces no additional overhead.
The virtual mapping must be big enough to allow a struct page to
be allocated and mapped for all valid physical pages. This will
make a virtual memmap difficult to use on 32 bit platforms that
support 36 address bits.
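(As a rough worked example, not from the patch: 36 physical address bits
with 4K pages give up to 2^24 struct pages; at roughly 32-64 bytes each
that is 512MB-1GB of virtual address space for the memmap alone, a very
large slice of a 32 bit kernel's address space.)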
However, if there is enough virtual space available and the arch
already maps its 1-1 kernel space using TLBs (e.g. true of IA64
and x86_64) then this technique makes SPARSEMEM lookups even more
efficient than CONFIG_FLATMEM. FLATMEM needs to read the contents
of the mem_map variable to get the start of the memmap and then add
the offset to the required entry. vmemmap is a constant to which
we can simply add the offset.
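To make the difference concrete, a side-by-side of the two lookups (the
FLATMEM line is the existing asm-generic definition, reproduced here only
for comparison):

	/* FLATMEM: must first load the mem_map pointer from memory. */
	#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))

	/* SPARSEMEM_VMEMMAP: vmemmap is a compile time constant. */
	#define __pfn_to_page(pfn)	(vmemmap + (pfn))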
This patch has the potential to allow us to make SPARSEMEM the default
(and even the only) option for most systems. It should be optimal
on UP, SMP and NUMA on most platforms. Then we may even be able
to remove the other memory models: FLATMEM, DISCONTIG etc.
[apw@shadowen.org: config cleanups, resplit code etc]
From: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
---
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index 30d8d33..52226e1 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -46,6 +46,12 @@
__pgdat->node_start_pfn; \
})
+#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
+
+/* memmap is virtually contiguous. */
+#define __pfn_to_page(pfn) (vmemmap + (pfn))
+#define __page_to_pfn(page) ((page) - vmemmap)
+
#elif defined(CONFIG_SPARSEMEM)
/*
* Note: section's mem_map is encorded to reflect its start_pfn.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8c7b7d3..d440ed5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1254,5 +1254,10 @@ extern int randomize_va_space;
__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma);
+int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
+int vmemmap_populate_pmd(pud_t *, unsigned long, unsigned long, int);
+void *vmemmap_alloc_block(unsigned long size, int node);
+void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/mm/sparse.c b/mm/sparse.c
index d64e628..3b75166 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -9,6 +9,8 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <asm/dma.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
/*
* Permanent SPARSEMEM data:
@@ -212,6 +214,192 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
return 1;
}
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Virtual Memory Map support
+ *
+ * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
+ *
+ * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
+ * virt_to_page, page_address() to be implemented as a base offset
+ * calculation without memory access.
+ *
+ * However, virtual mappings need a page table and TLBs. Many Linux
+ * architectures already map their physical space using 1-1 mappings
+ * via TLBs. For those arches the virtual memory map is essentially
+ * for free if we use the same page size as the 1-1 mappings. In that
+ * case the overhead consists of a few additional pages that are
+ * allocated to create a view of memory for vmemmap.
+ *
+ * Special Kconfig settings:
+ *
+ * CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
+ *
+ * The architecture has its own functions to populate the memory
+ * map and provides a vmemmap_populate function.
+ *
+ * CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP_PMD
+ *
+ * The architecture provides functions to populate the pmd level
+ * of the vmemmap mappings. Allowing mappings using large pages
+ * where available.
+ *
+ * If neither are set then PAGE_SIZE mappings are generated which
+ * require one PTE/TLB per PAGE_SIZE chunk of the virtual memory map.
+ */
+
+/*
+ * Allocate a block of memory to be used to back the virtual memory map
+ * or to back the page tables that are used to create the mapping.
+ * Uses the main allocators if they are available, else bootmem.
+ */
+void * __meminit vmemmap_alloc_block(unsigned long size, int node)
+{
+ /* If the main allocator is up use that, fallback to bootmem. */
+ if (slab_is_available()) {
+ struct page *page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO, get_order(size));
+ if (page)
+ return page_address(page);
+ return NULL;
+ } else
+ return __alloc_bootmem_node(NODE_DATA(node), size, size,
+ __pa(MAX_DMA_ADDRESS));
+}
+
+#ifndef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
+void __meminit vmemmap_verify(pte_t *pte, int node,
+ unsigned long start, unsigned long end)
+{
+ unsigned long pfn = pte_pfn(*pte);
+ int actual_node = early_pfn_to_nid(pfn);
+
+ if (actual_node != node)
+ printk(KERN_WARNING "[%lx-%lx] potential offnode "
+ "page_structs\n", start, end - 1);
+}
+
+#ifndef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP_PMD
+static int __meminit vmemmap_populate_pte(pmd_t *pmd, unsigned long addr,
+ unsigned long end, int node)
+{
+ pte_t *pte;
+
+ for (pte = pte_offset_map(pmd, addr); addr < end;
+ pte++, addr += PAGE_SIZE)
+ if (pte_none(*pte)) {
+ pte_t entry;
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ set_pte(pte, entry);
+
+ printk(KERN_DEBUG "[%lx-%lx] PTE ->%p on node %d\n",
+ addr, addr + PAGE_SIZE - 1, p, node);
+
+ } else
+ vmemmap_verify(pte, node, addr + PAGE_SIZE, end);
+
+ return 0;
+}
+
+int __meminit vmemmap_populate_pmd(pud_t *pud, unsigned long addr,
+ unsigned long end, int node)
+{
+ pmd_t *pmd;
+ int error = 0;
+
+ for (pmd = pmd_offset(pud, addr); addr < end && !error;
+ pmd++, addr += PMD_SIZE) {
+ if (pmd_none(*pmd)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, pmd, p);
+ } else
+ vmemmap_verify((pte_t *)pmd, node,
+ pmd_addr_end(addr, end), end);
+
+ error = vmemmap_populate_pte(pmd, addr,
+ pmd_addr_end(addr, end), node);
+ }
+ return error;
+}
+#endif /* CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP_PMD */
+
+static int __meminit vmemmap_populate_pud(pgd_t *pgd, unsigned long addr,
+ unsigned long end, int node)
+{
+ pud_t *pud;
+ int error = 0;
+
+ for (pud = pud_offset(pgd, addr); addr < end && !error;
+ pud++, addr += PUD_SIZE) {
+ if (pud_none(*pud)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ pud_populate(&init_mm, pud, p);
+ }
+ error = vmemmap_populate_pmd(pud, addr,
+ pud_addr_end(addr, end), node);
+ }
+ return error;
+}
+
+int __meminit vmemmap_populate(struct page *start_page,
+ unsigned long nr, int node)
+{
+ pgd_t *pgd;
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + nr);
+ int error = 0;
+
+ printk(KERN_DEBUG "[%lx-%lx] Virtual memory section"
+ " (%ld pages) node %d\n", addr, end - 1, nr, node);
+
+ for (pgd = pgd_offset_k(addr); addr < end && !error;
+ pgd++, addr += PGDIR_SIZE) {
+ if (pgd_none(*pgd)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ pgd_populate(&init_mm, pgd, p);
+ }
+ error = vmemmap_populate_pud(pgd, addr,
+ pgd_addr_end(addr, end), node);
+ }
+ return error;
+}
+#endif /* !CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP */
+
+static struct page * __init sparse_early_mem_map_alloc(unsigned long pnum)
+{
+ struct page *map;
+ struct mem_section *ms = __nr_to_section(pnum);
+ int nid = sparse_early_nid(ms);
+ int error;
+
+ map = pfn_to_page(pnum * PAGES_PER_SECTION);
+ error = vmemmap_populate(map, PAGES_PER_SECTION, nid);
+ if (error) {
+ printk(KERN_ERR "%s: allocation failed. Error=%d\n",
+ __FUNCTION__, error);
+ printk(KERN_ERR "%s: virtual memory map backing failed "
+ "some memory will not be available.\n", __FUNCTION__);
+ ms->section_mem_map = 0;
+ return NULL;
+ }
+ return map;
+}
+
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+
static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
@@ -231,6 +419,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
ms->section_mem_map = 0;
return NULL;
}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
{
* [PATCH 4/8] x86_64: SPARSEMEM_VMEMMAP 2M page size support
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (2 preceding siblings ...)
2007-05-22 22:59 ` [PATCH 3/8] Generic Virtual Memmap support for SPARSEMEM V4 Andy Whitcroft
@ 2007-05-22 23:00 ` Andy Whitcroft
2007-05-22 23:00 ` [PATCH 5/8] IA64: SPARSEMEM_VMEMMAP 16K " Andy Whitcroft
` (7 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Andy Whitcroft @ 2007-05-22 23:00 UTC (permalink / raw)
To: linux-mm
Cc: linux-arch, Nick Piggin, Christoph Lameter, Mel Gorman, Andy Whitcroft
x86_64 uses 2M page table entries to map its 1-1 kernel space.
We also implement the virtual memmap using 2M page table entries. So
there is no additional runtime overhead over FLATMEM, although
initialisation is slightly more complex. As FLATMEM still references memory to
obtain the mem_map pointer and SPARSEMEM_VMEMMAP uses a compile
time constant, SPARSEMEM_VMEMMAP should be superior.
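For a sense of scale (a back of the envelope estimate assuming a 64 byte
struct page): each 2MB vmemmap mapping holds 2^21 / 2^6 = 32768 page
structs, so a single PMD entry covers the memmap for 32768 * 4KB = 128MB
of physical memory.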
With this SPARSEMEM becomes the most efficient way of handling
virt_to_page, pfn_to_page and friends for UP, SMP and NUMA on x86_64.
[apw@shadowen.org: code resplit, style fixups]
From: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
---
diff --git a/Documentation/x86_64/mm.txt b/Documentation/x86_64/mm.txt
index f42798e..b89b6d2 100644
--- a/Documentation/x86_64/mm.txt
+++ b/Documentation/x86_64/mm.txt
@@ -9,6 +9,7 @@ ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
ffff810000000000 - ffffc0ffffffffff (=46 bits) direct mapping of all phys. memory
ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
+ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0
... unused hole ...
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 2926670..029815a 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -400,6 +400,14 @@ config ARCH_SPARSEMEM_ENABLE
def_bool y
depends on (NUMA || EXPERIMENTAL)
+config SPARSEMEM_VMEMMAP
+ def_bool y
+ depends on SPARSEMEM
+
+config ARCH_POPULATES_SPARSEMEM_VMEMMAP_PMD
+ def_bool y
+ depends on SPARSEMEM_VMEMMAP
+
config ARCH_MEMORY_PROBE
def_bool y
depends on MEMORY_HOTPLUG
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 58db7af..a7591d3 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -770,3 +770,33 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return "[vsyscall]";
return NULL;
}
+
+#ifdef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP_PMD
+/*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
+int __meminit vmemmap_populate_pmd(pud_t *pud, unsigned long addr,
+ unsigned long end, int node)
+{
+ pmd_t *pmd;
+
+ for (pmd = pmd_offset(pud, addr); addr < end;
+ pmd++, addr += PMD_SIZE)
+ if (pmd_none(*pmd)) {
+ pte_t entry;
+ void *p = vmemmap_alloc_block(PMD_SIZE, node);
+ if (!p)
+ return -ENOMEM;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+ mk_pte_huge(entry);
+ set_pmd(pmd, __pmd(pte_val(entry)));
+
+ printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
+ addr, addr + PMD_SIZE - 1, p, node);
+ } else
+ vmemmap_verify((pte_t *)pmd, node,
+ pmd_addr_end(addr, end), end);
+ return 0;
+}
+#endif
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 88adf1a..c3b52bc 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -134,6 +134,7 @@ extern unsigned long __phys_addr(unsigned long);
VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define __HAVE_ARCH_GATE_AREA 1
+#define vmemmap ((struct page *)VMEMMAP_START)
#include <asm-generic/memory_model.h>
#include <asm-generic/page.h>
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 44d07bf..15f0003 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -137,6 +137,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
#define MAXMEM _AC(0x3fffffffffff, UL)
#define VMALLOC_START _AC(0xffffc20000000000, UL)
#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffe20000000000, UL)
#define MODULES_VADDR _AC(0xffffffff88000000, UL)
#define MODULES_END _AC(0xfffffffffff00000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
* [PATCH 5/8] IA64: SPARSEMEM_VMEMMAP 16K page size support
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (3 preceding siblings ...)
2007-05-22 23:00 ` [PATCH 4/8] x86_64: SPARSEMEM_VMEMMAP 2M page size support Andy Whitcroft
@ 2007-05-22 23:00 ` Andy Whitcroft
2007-05-23 5:12 ` Christoph Lameter
2007-05-22 23:01 ` [PATCH 6/8] IA64: SPARSEMEM_VMEMMAP 16M " Andy Whitcroft
` (6 subsequent siblings)
11 siblings, 1 reply; 16+ messages in thread
From: Andy Whitcroft @ 2007-05-22 23:00 UTC (permalink / raw)
To: linux-mm
Cc: linux-arch, Nick Piggin, Christoph Lameter, Mel Gorman, Andy Whitcroft
Equip IA64 sparsemem with a virtual memmap. This is similar to the
existing CONFIG_VIRTUAL_MEM_MAP functionality for DISCONTIGMEM.
It uses a PAGE_SIZE mapping.
This is provided as a minimally intrusive solution. We split the
128TB VMALLOC area into two 64TB areas and use one for the virtual
memmap.
This should replace CONFIG_VIRTUAL_MEM_MAP long term.
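For reference, with the default 16K pages (PAGE_SHIFT = 14) the arithmetic
is: the old VMALLOC area is 1UL << (4*PAGE_SHIFT - 9) = 2^47 bytes = 128TB;
the shift becomes (4*PAGE_SHIFT - 10), giving 2^46 = 64TB for vmalloc and
leaving the 64TB above VMALLOC_END for the vmemmap.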
From: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
---
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index de1bff6..9d0d101 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -355,6 +355,10 @@ config ARCH_SPARSEMEM_ENABLE
def_bool y
depends on ARCH_DISCONTIGMEM_ENABLE
+config SPARSEMEM_VMEMMAP
+ def_bool y
+ depends on SPARSEMEM
+
config ARCH_DISCONTIGMEM_DEFAULT
def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB)
depends on ARCH_DISCONTIGMEM_ENABLE
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index 670b706..366c34b 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -236,8 +236,14 @@ ia64_phys_addr_valid (unsigned long addr)
# define VMALLOC_END vmalloc_end
extern unsigned long vmalloc_end;
#else
+#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+/* SPARSEMEM_VMEMMAP uses half of vmalloc... */
+# define VMALLOC_END (RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 10)))
+# define vmemmap ((struct page *)VMALLOC_END)
+#else
# define VMALLOC_END (RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
#endif
+#endif
/* fs/proc/kcore.c */
#define kc_vaddr_to_offset(v) ((v) - RGN_BASE(RGN_GATE))
* [PATCH 6/8] IA64: SPARSEMEM_VMEMMAP 16M page size support
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (4 preceding siblings ...)
2007-05-22 23:00 ` [PATCH 5/8] IA64: SPARSEMEM_VMEMMAP 16K " Andy Whitcroft
@ 2007-05-22 23:01 ` Andy Whitcroft
2007-05-22 23:01 ` [PATCH 7/8] SPARC64: SPARSEMEM_VMEMMAP support Andy Whitcroft
` (5 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Andy Whitcroft @ 2007-05-22 23:01 UTC (permalink / raw)
To: linux-mm
Cc: linux-arch, Nick Piggin, Christoph Lameter, Mel Gorman, Andy Whitcroft
This implements granule page sized vmemmap support for IA64. This is
important because the traditional vmemmap on IA64 uses PAGE_SIZE
mappings in the TLB. For a typical 8GB node on IA64 we need about
2^(33 - 14 + 6) = 2^25 = 32MB of page structs.
Using PAGE_SIZE mappings we will end up with 2^(25 - 14) = 2^11 = 2048
page table entries. This patch reduces that to two 16MB TLB entries,
a factor of roughly 1000 fewer TLB entries for the virtual memory map.
We modify the alt_dtlb_miss handler to branch to a vmemmap TLB lookup
function if bit 60 is set. The vmemmap will start with 0xF000xxx so it
is going to be very distinctive in dumps and can be distinguished easily
from 0xE000xxx (kernel 1-1 area) and 0xA000xxx (kernel text, data and
vmalloc).
We use a one-level page table to do lookups for the vmemmap TLBs. Since
we need to cover 1 Petabyte we need to reserve 1 megabyte just for
the table, but we can statically allocate it in the data segment. This
simplifies lookups and handling. The fault handler only has to do
a single lookup in contrast to 4 for the current vmalloc/vmemmap
implementation.
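In C terms the fast path added to alt_dtlb_miss does roughly the following
(a simplified sketch only; the real lookup is the assembly in the patch
and runs with data translation disabled. faulting_addr, pte_control_bits
and itc_d() are placeholders for the handler's registers and the itc.d
instruction):

	/* Index the one-level table by granule number of the fault address. */
	u32 desc = vmemmap_table[(faulting_addr >> IA64_GRANULE_SHIFT)
						& 0xffffffffUL];
	if (!(desc & VMEMMAP_PRESENT))
		goto page_fault;		/* granule not backed yet */
	/*
	 * The low 31 bits are the granule frame number; shift it back into
	 * place, add the PTE control bits and insert the TLB entry.
	 */
	pte = ((unsigned long)(desc & 0x7fffffffUL) << IA64_GRANULE_SHIFT)
						| pte_control_bits;
	itc_d(pte);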
Problems with this patchset are:
1. Large 1M array required to cover all possible memory (1 Petabyte).
Maybe reduce this to actually supported HW sizes? 16TB or 64TB?
2. For systems with small nodes there is a significant chance of
large overlaps. We could dynamically determine the TLB size
but that would make the code more complex.
[apw@shadowen.org: style fixups]
From: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
---
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 9d0d101..e8fc8e3 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -359,6 +359,18 @@ config SPARSEMEM_VMEMMAP
def_bool y
depends on SPARSEMEM
+config ARCH_POPULATES_SPARSEMEM_VMEMMAP
+ bool "Use 16M TLB for virtual memory map"
+ default y
+ depends on SPARSEMEM_VMEMMAP
+ help
+ Enables large page virtual memmap support. Each virtual memmap
+ page will be 16MB in size. That size of vmemmap can cover 4GB
+ of memory. We only use a single TLB per node. However, if nodes
+ are small and the distance between the memory of the nodes is
+ < 4GB then the page struct for some of the early pages in the
+ node may end up on the prior node.
+
config ARCH_DISCONTIGMEM_DEFAULT
def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB)
depends on ARCH_DISCONTIGMEM_ENABLE
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index 34f44d8..b6deaf7 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -391,9 +391,11 @@ ENTRY(alt_dtlb_miss)
tbit.z p12,p0=r16,61 // access to region 6?
mov r25=PERCPU_PAGE_SHIFT << 2
mov r26=PERCPU_PAGE_SIZE
- nop.m 0
- nop.b 0
+ tbit.nz p6,p0=r16,60 // Access to VMEMMAP?
+(p6) br.cond.dptk vmemmap
;;
+dtlb_continue:
+ .pred.rel "mutex", p11, p10
(p10) mov r19=IA64_KR(PER_CPU_DATA)
(p11) and r19=r19,r16 // clear non-ppn fields
extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl
@@ -416,6 +418,37 @@ ENTRY(alt_dtlb_miss)
(p7) itc.d r19 // insert the TLB entry
mov pr=r31,-1
rfi
+
+vmemmap:
+ //
+ // Granule lookup via vmemmap_table for
+ // the virtual memory map.
+ //
+ tbit.nz p6,p0=r16,59 // more top bits set?
+(p6) br.cond.spnt dtlb_continue // then its mmu bootstrap
+ ;;
+ rsm psr.dt // switch to using physical data addressing
+ extr.u r25=r16, IA64_GRANULE_SHIFT, 32
+ ;;
+ srlz.d
+ LOAD_PHYSICAL(p0, r26, vmemmap_table)
+ shl r25=r25,2
+ ;;
+ add r26=r26,r25 // Index into vmemmap table
+ ;;
+ ld4 r25=[r26] // Get 32 bit descriptor */
+ ;;
+ dep.z r19=r25, 0, 31 // Isolate ppn
+ tbit.z p6,p0=r25, 31 // Present bit set?
+(p6) br.cond.spnt page_fault // Page not present
+ ;;
+ shl r19=r19, IA64_GRANULE_SHIFT // Shift ppn in place
+ ;;
+ or r19=r19,r17 // insert PTE control bits into r19
+ ;;
+ itc.d r19 // insert the TLB entry
+ mov pr=r31,-1
+ rfi
END(alt_dtlb_miss)
.org ia64_ivt+0x1400
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index e14916b..7c38908 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -8,6 +8,8 @@
* Russ Anderson <rja@sgi.com>
* Jesse Barnes <jbarnes@sgi.com>
* Jack Steiner <steiner@sgi.com>
+ * Copyright (C) 2007 sgi
+ * Christoph Lameter <clameter@sgi.com>
*/
/*
@@ -44,6 +46,77 @@ struct early_node_data {
unsigned long max_pfn;
};
+#ifdef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
+/*
+ * The vmemmap_table contains the number of the granule used to map
+ * that section of the virtual memmap.
+ *
+ * We support 50 address bits, 14 bits are used for the page size. This
+ * leaves 36 bits (64G) for the pfn. Using page structs the memmap is going
+ * to take up a bit less than 4TB of virtual space.
+ *
+ * We are mapping these 4TB using 16M granule size which makes us end up
+ * with a bit less than 256k entries.
+ *
+ * Thus the common size of the needed vmemmap_table will be less than 1M.
+ */
+
+#define VMEMMAP_SIZE GRANULEROUNDUP((1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT)) \
+ * sizeof(struct page))
+
+/*
+ * Each vmemmap_table entry describes a 16M block of memory. We have
+ * 32 bit here and use one bit to indicate that a page is present.
+ * 31 bit physical page number + 24 bit index within the page = 55 bits
+ * which is larger than the current maximum of memory (1 Petabyte)
+ * supported by IA64.
+ */
+
+#define VMEMMAP_PRESENT (1UL << 31)
+
+u32 vmemmap_table[VMEMMAP_SIZE >> IA64_GRANULE_SHIFT];
+
+int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
+{
+ unsigned long phys_start = __pa(start) & ~VMEMMAP_FLAG;
+ unsigned long phys_end = __pa(start + nr) & ~VMEMMAP_FLAG;
+ unsigned long addr = GRANULEROUNDDOWN(phys_start);
+ unsigned long end = GRANULEROUNDUP(phys_end);
+
+ for (; addr < end; addr += IA64_GRANULE_SIZE) {
+ u32 *vmem_pp = vmemmap_table + (addr >> IA64_GRANULE_SHIFT);
+ void *block;
+
+ if (*vmem_pp & VMEMMAP_PRESENT) {
+ unsigned long addr = *vmem_pp & ~VMEMMAP_PRESENT;
+ int actual_node;
+
+ actual_node = early_pfn_to_nid(addr >> PAGE_SHIFT);
+ if (actual_node != node)
+ printk(KERN_WARNING "Virtual memory segments "
+ "on node %d instead of %d",
+ actual_node, node);
+ } else {
+ block = vmemmap_alloc_block(IA64_GRANULE_SIZE, node);
+ if (!block)
+ return -ENOMEM;
+
+ *vmem_pp = VMEMMAP_PRESENT |
+ (__pa(block) >> IA64_GRANULE_SHIFT);
+
+ printk(KERN_INFO "[%p-%p] page_structs=%lu "
+ "node=%d entry=%lu/%lu\n", start, block, nr,
+ node, addr >> IA64_GRANULE_SHIFT,
+ VMEMMAP_SIZE >> IA64_GRANULE_SHIFT);
+ }
+ }
+ return 0;
+}
+#else
+/* Satisfy reference in arch/ia64/kernel/ivt.S */
+u32 vmemmap_table[0];
+#endif
+
static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
static nodemask_t memory_less_mask __initdata;
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index 366c34b..f4aab5d 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -236,7 +236,8 @@ ia64_phys_addr_valid (unsigned long addr)
# define VMALLOC_END vmalloc_end
extern unsigned long vmalloc_end;
#else
-#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_SPARSEMEM_VMEMMAP) && \
+ !defined(CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP)
/* SPARSEMEM_VMEMMAP uses half of vmalloc... */
# define VMALLOC_END (RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 10)))
# define vmemmap ((struct page *)VMALLOC_END)
@@ -245,6 +246,11 @@ ia64_phys_addr_valid (unsigned long addr)
#endif
#endif
+#ifdef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
+# define VMEMMAP_FLAG (1UL << 60)
+# define vmemmap ((struct page *)(RGN_BASE(RGN_KERNEL) | VMEMMAP_FLAG))
+#endif
+
/* fs/proc/kcore.c */
#define kc_vaddr_to_offset(v) ((v) - RGN_BASE(RGN_GATE))
#define kc_offset_to_vaddr(o) ((o) + RGN_BASE(RGN_GATE))
* [PATCH 7/8] SPARC64: SPARSEMEM_VMEMMAP support
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (5 preceding siblings ...)
2007-05-22 23:01 ` [PATCH 6/8] IA64: SPARSEMEM_VMEMMAP 16M " Andy Whitcroft
@ 2007-05-22 23:01 ` Andy Whitcroft
2007-05-22 23:02 ` [PATCH 8/8] ppc64: " Andy Whitcroft
` (4 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Andy Whitcroft @ 2007-05-22 23:01 UTC (permalink / raw)
To: linux-mm
Cc: linux-arch, Nick Piggin, Christoph Lameter, Mel Gorman, Andy Whitcroft
Hey Christoph, here is sparc64 support for this stuff.
After implementing this and seeing more and more how it works, I
really like it :-)
Thanks a lot for doing this work Christoph!
[apw@shadowen.org: style fixups]
From: David Miller <davem@davemloft.net>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
---
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 831781c..a81979d 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -224,10 +224,17 @@ config ARCH_SPARSEMEM_ENABLE
config ARCH_SPARSEMEM_DEFAULT
def_bool y
- select SPARSEMEM_STATIC
source "mm/Kconfig"
+config SPARSEMEM_VMEMMAP
+ def_bool y
+ depends on SPARSEMEM
+
+config ARCH_POPULATES_SPARSEMEM_VMEMMAP
+ def_bool y
+ depends on SPARSEMEM_VMEMMAP
+
config ISA
bool
help
diff --git a/arch/sparc64/kernel/ktlb.S b/arch/sparc64/kernel/ktlb.S
index d4024ac..964527d 100644
--- a/arch/sparc64/kernel/ktlb.S
+++ b/arch/sparc64/kernel/ktlb.S
@@ -226,6 +226,15 @@ kvmap_dtlb_load:
ba,pt %xcc, sun4v_dtlb_load
mov %g5, %g3
+kvmap_vmemmap:
+ sub %g4, %g5, %g5
+ srlx %g5, 22, %g5
+ sethi %hi(vmemmap_table), %g1
+ sllx %g5, 3, %g5
+ or %g1, %lo(vmemmap_table), %g1
+ ba,pt %xcc, kvmap_dtlb_load
+ ldx [%g1 + %g5], %g5
+
kvmap_dtlb_nonlinear:
/* Catch kernel NULL pointer derefs. */
sethi %hi(PAGE_SIZE), %g5
@@ -233,6 +242,13 @@ kvmap_dtlb_nonlinear:
bleu,pn %xcc, kvmap_dtlb_longpath
nop
+ /* Do not use the TSB for vmemmap. */
+ mov (VMEMMAP_BASE >> 24), %g5
+ sllx %g5, 24, %g5
+ cmp %g4,%g5
+ bgeu,pn %xcc, kvmap_vmemmap
+ nop
+
KERN_TSB_LOOKUP_TL1(%g4, %g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load)
kvmap_dtlb_tsbmiss:
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 6e5b01d..6ee87cf 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -1639,6 +1639,58 @@ EXPORT_SYMBOL(_PAGE_E);
unsigned long _PAGE_CACHE __read_mostly;
EXPORT_SYMBOL(_PAGE_CACHE);
+#ifdef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
+
+#define VMEMMAP_CHUNK_SHIFT 22
+#define VMEMMAP_CHUNK (1UL << VMEMMAP_CHUNK_SHIFT)
+#define VMEMMAP_CHUNK_MASK ~(VMEMMAP_CHUNK - 1UL)
+#define VMEMMAP_ALIGN(x) (((x)+VMEMMAP_CHUNK-1UL)&VMEMMAP_CHUNK_MASK)
+
+#define VMEMMAP_SIZE ((((1UL << MAX_PHYSADDR_BITS) >> PAGE_SHIFT) * \
+ sizeof(struct page *)) >> VMEMMAP_CHUNK_SHIFT)
+unsigned long vmemmap_table[VMEMMAP_SIZE];
+
+int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
+{
+ unsigned long vstart = (unsigned long) start;
+ unsigned long vend = (unsigned long) (start + nr);
+ unsigned long phys_start = (vstart - VMEMMAP_BASE);
+ unsigned long phys_end = (vend - VMEMMAP_BASE);
+ unsigned long addr = phys_start & VMEMMAP_CHUNK_MASK;
+ unsigned long end = VMEMMAP_ALIGN(phys_end);
+ unsigned long pte_base;
+
+ pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4U |
+ _PAGE_CP_4U | _PAGE_CV_4U |
+ _PAGE_P_4U | _PAGE_W_4U);
+ if (tlb_type == hypervisor)
+ pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
+ _PAGE_CP_4V | _PAGE_CV_4V |
+ _PAGE_P_4V | _PAGE_W_4V);
+
+ for (; addr < end; addr += VMEMMAP_CHUNK) {
+ unsigned long *vmem_pp =
+ vmemmap_table + (addr >> VMEMMAP_CHUNK_SHIFT);
+ void *block;
+
+ if (!(*vmem_pp & _PAGE_VALID)) {
+ block = vmemmap_alloc_block(1UL << 22, node);
+ if (!block)
+ return -ENOMEM;
+
+ *vmem_pp = pte_base | __pa(block);
+
+ printk(KERN_INFO "[%p-%p] page_structs=%lu "
+ "node=%d entry=%lu/%lu\n", start, block, nr,
+ node,
+ addr >> VMEMMAP_CHUNK_SHIFT,
+ VMEMMAP_SIZE >> VMEMMAP_CHUNK_SHIFT);
+ }
+ }
+ return 0;
+}
+#endif /* CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP */
+
static void prot_init_common(unsigned long page_none,
unsigned long page_shared,
unsigned long page_copy,
diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h
index 9e80ad4..5f5d9fb 100644
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -42,6 +42,9 @@
#define HI_OBP_ADDRESS _AC(0x0000000100000000,UL)
#define VMALLOC_START _AC(0x0000000100000000,UL)
#define VMALLOC_END _AC(0x0000000200000000,UL)
+#define VMEMMAP_BASE _AC(0x0000000200000000,UL)
+
+#define vmemmap ((struct page *)VMEMMAP_BASE)
/* XXX All of this needs to be rethought so we can take advantage
* XXX cheetah's full 64-bit virtual address space, ie. no more hole
* [PATCH 8/8] ppc64: SPARSEMEM_VMEMMAP support
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (6 preceding siblings ...)
2007-05-22 23:01 ` [PATCH 7/8] SPARC64: SPARSEMEM_VMEMMAP support Andy Whitcroft
@ 2007-05-22 23:02 ` Andy Whitcroft
2007-05-22 23:52 ` [PATCH 0/8] Sparsemem Virtual Memmap V4 Christoph Lameter
` (3 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Andy Whitcroft @ 2007-05-22 23:02 UTC (permalink / raw)
To: linux-mm
Cc: linux-arch, Nick Piggin, Christoph Lameter, Mel Gorman, Andy Whitcroft
Enable virtual memmap support for SPARSEMEM on PPC64 systems.
Slice a 16th off the end of the linear mapping space and use that
to hold the vmemmap. It uses the same mapping size as is used in the
linear 1:1 kernel mapping.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
---
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 56d3c0d..282838c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -523,6 +523,14 @@ config ARCH_POPULATES_NODE_MAP
source "mm/Kconfig"
+config SPARSEMEM_VMEMMAP
+ def_bool y
+ depends on SPARSEMEM
+
+config ARCH_POPULATES_SPARSEMEM_VMEMMAP
+ def_bool y
+ depends on SPARSEMEM_VMEMMAP
+
config ARCH_MEMORY_PROBE
def_bool y
depends on MEMORY_HOTPLUG
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 7312a26..2e38a43 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -183,3 +183,67 @@ void pgtable_cache_init(void)
NULL);
}
}
+
+#ifdef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
+
+/*
+ * Convert an address within the vmemmap into a pfn. Note that we have
+ * to do this by hand as the proffered address may not be correctly aligned.
+ * Subtraction of non-aligned pointers produces undefined results.
+ */
+#define VMM_SECTION(addr) \
+ (((((unsigned long)(addr)) - ((unsigned long)(vmemmap))) / \
+ sizeof(struct page)) >> PFN_SECTION_SHIFT)
+#define VMM_SECTION_PAGE(addr) (VMM_SECTION(addr) << PFN_SECTION_SHIFT)
+
+/*
+ * Check if this vmemmap page is already initialised. If any section
+ * which overlaps this vmemmap page is initialised then this page is
+ * initialised already.
+ */
+int __meminit vmemmap_populated(unsigned long start, int page_size)
+{
+ unsigned long end = start + page_size;
+
+ for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
+ if (pfn_valid(VMM_SECTION_PAGE(start)))
+ return 1;
+
+ return 0;
+}
+
+int __meminit vmemmap_populate(struct page *start_page,
+ unsigned long nr_pages, int node)
+{
+ unsigned long mode_rw;
+ unsigned long start = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + nr_pages);
+ unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift;
+
+ mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+
+ /* Align to the page size of the linear mapping. */
+ start = _ALIGN_DOWN(start, page_size);
+
+ for (; start < end; start += page_size) {
+ int mapped;
+ void *p;
+
+ if (vmemmap_populated(start, page_size))
+ continue;
+
+ p = vmemmap_alloc_block(page_size, node);
+ if (!p)
+ return -ENOMEM;
+
+ printk(KERN_WARNING "vmemmap %08lx allocated at %p, "
+ "physical %p.\n", start, p, __pa(p));
+
+ mapped = htab_bolt_mapping(start, start + page_size,
+ __pa(p), mode_rw, mmu_linear_psize);
+ BUG_ON(mapped < 0);
+ }
+
+ return 0;
+}
+#endif
diff --git a/include/asm-powerpc/pgtable-ppc64.h b/include/asm-powerpc/pgtable-ppc64.h
index 704c4e6..5943378 100644
--- a/include/asm-powerpc/pgtable-ppc64.h
+++ b/include/asm-powerpc/pgtable-ppc64.h
@@ -63,6 +63,14 @@ struct mm_struct;
#define USER_REGION_ID (0UL)
/*
* Defines the address of the vmemmap area, in the top 16th of the
+ * kernel region.
+ */
+#define VMEMMAP_BASE (ASM_CONST(CONFIG_KERNEL_START) + \
+ (0xfUL << (REGION_SHIFT - 4)))
+#define vmemmap ((struct page *)VMEMMAP_BASE)
+
+/*
* Common bits in a linux-style PTE. These match the bits in the
* (hardware-defined) PowerPC PTE as closely as possible. Additional
* bits may be defined in pgtable-*.h
* Re: [PATCH 0/8] Sparsemem Virtual Memmap V4
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (7 preceding siblings ...)
2007-05-22 23:02 ` [PATCH 8/8] ppc64: " Andy Whitcroft
@ 2007-05-22 23:52 ` Christoph Lameter
2007-05-23 0:00 ` Christoph Lameter
` (2 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Christoph Lameter @ 2007-05-22 23:52 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: linux-mm, linux-arch, Nick Piggin, Mel Gorman
On Tue, 22 May 2007, Andy Whitcroft wrote:
> It is worth noting that the ia64 support exposes an essentially
> private Kconfig option to allow selection of the two implementations.
> Once the 16Mb support is complete it should become the one and only
> implementation and that this option would no longer be exposed.
Right. You can omit 16MB support for the next round. We agreed with the
other IA64 people that 16MB is too large and want to shoot for 4MB.
The 16k support should be sufficient for this patchset.
* Re: [PATCH 0/8] Sparsemem Virtual Memmap V4
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (8 preceding siblings ...)
2007-05-22 23:52 ` [PATCH 0/8] Sparsemem Virtual Memmap V4 Christoph Lameter
@ 2007-05-23 0:00 ` Christoph Lameter
2007-05-23 0:42 ` Christoph Lameter
2007-05-23 4:16 ` David Miller, Andy Whitcroft
11 siblings, 0 replies; 16+ messages in thread
From: Christoph Lameter @ 2007-05-23 0:00 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: linux-mm, linux-arch, Nick Piggin, Mel Gorman
On Tue, 22 May 2007, Andy Whitcroft wrote:
> I do not have performance data on this round of patches yet, but
> measurements on the initial PPC64 implementation showed a small
> but measurable improvement.
Well the performance tests that I did on x86_64 showed the cost of
virt_to_page dropping from 18us to 9us. So I think we are fine.
> This stack is against v2.6.22-rc1-mm1. It has been compile, boot
> and lightly tested on x86_64, ia64 and PPC64. Sparc64 has been
> compiled but not booted.
Can we get that into mm soon? There are potentially other arches that also
may want to run their own vmemmap functions for this and I would like to
have an easy way to tinker around with a 4M vmemmap size for IA64.
* Re: [PATCH 0/8] Sparsemem Virtual Memmap V4
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (9 preceding siblings ...)
2007-05-23 0:00 ` Christoph Lameter
@ 2007-05-23 0:42 ` Christoph Lameter
2007-05-23 4:16 ` David Miller, Andy Whitcroft
11 siblings, 0 replies; 16+ messages in thread
From: Christoph Lameter @ 2007-05-23 0:42 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: linux-mm, linux-ia64, linux-arch, Nick Piggin, Mel Gorman
We will need this fix to sparsemem on IA64. I hope this will not cause
other issues in sparsemem?
IA64: Increase maximum physmem size to cover 8 petabyte
We currently can support these large configurations only with Discontigmem.
Increase sparsemem's max physmem bits so that it can also handle 8 petabytes.
Discontigmem supports up to 16 petabytes but I will need to use bit 53 to
flag vmemmap addresses for the TLB handler. It seems that the currently
used bit 60 for the 16M configuration is not supported by a certain
virtualization technique.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.22-rc2/include/asm-ia64/sparsemem.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-ia64/sparsemem.h 2007-05-22 17:28:04.000000000 -0700
+++ linux-2.6.22-rc2/include/asm-ia64/sparsemem.h 2007-05-22 17:28:37.000000000 -0700
@@ -8,7 +8,7 @@
*/
#define SECTION_SIZE_BITS (30)
-#define MAX_PHYSMEM_BITS (50)
+#define MAX_PHYSMEM_BITS (53)
#ifdef CONFIG_FORCE_MAX_ZONEORDER
#if ((CONFIG_FORCE_MAX_ZONEORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS)
#undef SECTION_SIZE_BITS
* Re: [PATCH 0/8] Sparsemem Virtual Memmap V4
2007-05-22 22:57 [PATCH 0/8] Sparsemem Virtual Memmap V4 Andy Whitcroft
` (10 preceding siblings ...)
2007-05-23 0:42 ` Christoph Lameter
@ 2007-05-23 4:16 ` David Miller, Andy Whitcroft
11 siblings, 0 replies; 16+ messages in thread
From: David Miller, Andy Whitcroft @ 2007-05-23 4:16 UTC (permalink / raw)
To: apw; +Cc: linux-mm, linux-arch, npiggin, clameter, mel
> This stack is against v2.6.22-rc1-mm1. It has been compile, boot
> and lightly tested on x86_64, ia64 and PPC64. Sparc64 has been
> compiled but not booted.
Sparc64 boot tested successfully on Niagara t1000 with 26 cpus.