* [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM
@ 2007-04-01 7:10 Christoph Lameter
2007-04-01 7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter
2007-04-02 20:50 ` [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Dave Hansen
0 siblings, 2 replies; 46+ messages in thread
From: Christoph Lameter @ 2007-04-01 7:10 UTC (permalink / raw)
To: linux-kernel
Cc: linux-arch, Martin Bligh, linux-mm, Christoph Lameter,
Andi Kleen, KAMEZAWA Hiroyuki
Sparse Virtual: Virtual Memmap support for SPARSEMEM
SPARSEMEM is a pretty nice framework that unifies quite a bit of
code over all the arches. It would be great if it could be the default
so that we can get rid of various forms of DISCONTIG and other variations
on memory maps. So far what has hindered this are the additional lookups
that SPARSEMEM introduces for virt_to_page and page_address. This goes
so far that the code to do this has to be kept in a separate function
and cannot be used inline.
This patch introduces virtual memmap support for sparsemem. virt_to_page
page_address and consorts become simple shift/add operations. No page flag
fields, no table lookups nothing involving memory is required.
The two key operations pfn_to_page and page_to_pfn become:
#define pfn_to_page(pfn) (vmemmap + (pfn))
#define page_to_pfn(page) ((page) - vmemmap)
In order for this to work we will have to use a virtual mapping.
These are usually for free since kernel memory is already mapped
via a 1-1 mapping requiring a page table. The virtual mapping must
be big enough to span all of memory that an arch supports, which may
make a virtual memmap difficult to use on funky 32 bit platforms
that support 36 address bits.
However, if there is enough virtual space available and the arch
already maps its 1-1 kernel space using TLBs (f.e. true of IA64
and x86_64) then this technique makes sparsemem lookups as efficient
as CONFIG_FLATMEM.
Maybe this patch will allow us to make SPARSEMEM the default
configuration that will work on UP, SMP and NUMA on most platforms?
Then we may hopefully be able to remove the various forms of support
for FLATMEM, DISCONTIG etc etc.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Index: linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h
===================================================================
--- linux-2.6.21-rc5-mm2.orig/include/asm-generic/memory_model.h 2007-03-31 22:47:14.000000000 -0700
+++ linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h 2007-03-31 22:59:35.000000000 -0700
@@ -47,6 +47,13 @@
})
#elif defined(CONFIG_SPARSEMEM)
+#ifdef CONFIG_SPARSE_VIRTUAL
+/*
+ * We have a virtual memmap that makes lookups very simple
+ */
+#define __pfn_to_page(pfn) (vmemmap + (pfn))
+#define __page_to_pfn(page) ((page) - vmemmap)
+#else
/*
* Note: section's mem_map is encorded to reflect its start_pfn.
* section[i].section_mem_map == mem_map's address - start_pfn;
@@ -62,6 +69,7 @@
struct mem_section *__sec = __pfn_to_section(__pfn); \
__section_mem_map_addr(__sec) + __pfn; \
})
+#endif
#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
Index: linux-2.6.21-rc5-mm2/mm/sparse.c
===================================================================
--- linux-2.6.21-rc5-mm2.orig/mm/sparse.c 2007-03-31 22:47:14.000000000 -0700
+++ linux-2.6.21-rc5-mm2/mm/sparse.c 2007-03-31 22:59:35.000000000 -0700
@@ -9,6 +9,7 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <asm/dma.h>
+#include <asm/pgalloc.h>
/*
* Permanent SPARSEMEM data:
@@ -101,7 +102,7 @@ static inline int sparse_index_init(unsi
/*
* Although written for the SPARSEMEM_EXTREME case, this happens
- * to also work for the flat array case becase
+ * to also work for the flat array case because
* NR_SECTION_ROOTS==NR_MEM_SECTIONS.
*/
int __section_nr(struct mem_section* ms)
@@ -211,6 +212,90 @@ static int sparse_init_one_section(struc
return 1;
}
+#ifdef CONFIG_SPARSE_VIRTUAL
+
+void *vmemmap_alloc_block(unsigned long size, int node)
+{
+ if (slab_is_available()) {
+ struct page *page =
+ alloc_pages_node(node, GFP_KERNEL,
+ get_order(size));
+
+ BUG_ON(!page);
+ return page_address(page);
+ } else
+ return __alloc_bootmem_node(NODE_DATA(node), size, size,
+ __pa(MAX_DMA_ADDRESS));
+}
+
+
+#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP
+/*
+ * Virtual memmap populate functionality for architectures that support
+ * PMDs for huge pages like i386, x86_64 etc.
+ */
+static void vmemmap_pop_pmd(pud_t *pud, unsigned long addr,
+ unsigned long end, int node)
+{
+ pmd_t *pmd;
+
+ end = pmd_addr_end(addr, end);
+
+ for (pmd = pmd_offset(pud, addr); addr < end;
+ pmd++, addr += PMD_SIZE) {
+ if (pmd_none(*pmd)) {
+ void *block;
+ pte_t pte;
+
+ block = vmemmap_alloc_block(PMD_SIZE, node);
+ pte = pfn_pte(__pa(block) >> PAGE_SHIFT,
+ PAGE_KERNEL);
+ pte_mkdirty(pte);
+ pte_mkwrite(pte);
+ pte_mkyoung(pte);
+ mk_pte_huge(pte);
+ set_pmd(pmd, __pmd(pte_val(pte)));
+ }
+ }
+}
+
+static void vmemmap_pop_pud(pgd_t *pgd, unsigned long addr,
+ unsigned long end, int node)
+{
+ pud_t *pud;
+
+ end = pud_addr_end(addr, end);
+ for (pud = pud_offset(pgd, addr); addr < end;
+ pud++, addr += PUD_SIZE) {
+
+ if (pud_none(*pud))
+ pud_populate(&init_mm, pud,
+ vmemmap_alloc_block(PAGE_SIZE, node));
+
+ vmemmap_pop_pmd(pud, addr, end, node);
+ }
+}
+
+static void vmemmap_populate(struct page *start_page, unsigned long nr,
+ int node)
+{
+ pgd_t *pgd;
+ unsigned long addr = (unsigned long)(start_page);
+ unsigned long end = pgd_addr_end(addr,
+ (unsigned long)((start_page + nr)));
+
+ for (pgd = pgd_offset_k(addr); addr < end;
+ pgd++, addr += PGDIR_SIZE) {
+
+ if (pgd_none(*pgd))
+ pgd_populate(&init_mm, pgd,
+ vmemmap_alloc_block(PAGE_SIZE, node));
+ vmemmap_pop_pud(pgd, addr, end, node);
+ }
+}
+#endif
+#endif /* CONFIG_SPARSE_VIRTUAL */
+
static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
@@ -221,8 +306,13 @@ static struct page *sparse_early_mem_map
if (map)
return map;
+#ifdef CONFIG_SPARSE_VIRTUAL
+ map = pfn_to_page(pnum * PAGES_PER_SECTION);
+ vmemmap_populate(map, PAGES_PER_SECTION, nid);
+#else
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
+#endif
if (map)
return map;
Index: linux-2.6.21-rc5-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.21-rc5-mm2.orig/include/linux/mmzone.h 2007-03-31 22:47:14.000000000 -0700
+++ linux-2.6.21-rc5-mm2/include/linux/mmzone.h 2007-03-31 23:01:03.000000000 -0700
@@ -836,6 +836,8 @@ void sparse_init(void);
void memory_present(int nid, unsigned long start, unsigned long end);
unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
+void vmemmap_populate(struct page *start_page, unsigned long pages, int node);
+void *vmemmap_alloc_block(unsigned long size, int node);
/*
* If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 46+ messages in thread* [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-01 7:10 [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Christoph Lameter @ 2007-04-01 7:10 ` Christoph Lameter 2007-04-01 7:11 ` Christoph Lameter 2007-04-01 10:46 ` Andi Kleen 2007-04-02 20:50 ` [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Dave Hansen 1 sibling, 2 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-01 7:10 UTC (permalink / raw) To: linux-kernel Cc: linux-arch, Martin Bligh, linux-mm, Andi Kleen, Christoph Lameter, KAMEZAWA Hiroyuki x86_64 make SPARSE_VIRTUAL the default x86_64 is using 2M page table entries to map its 1-1 kernel space. We implement the virtual memmap also using 2M page table entries. So there is no difference at all to FLATMEM. Both schemes require a page table and a TLB. Thus the SPARSEMEM becomes the most efficient way of handling virt_to_page, pfn_to_page and friends for UP, SMP and NUMA. So change the Kconfig for x86_64 to make SPARSE_VIRTUAL the default and switch off all other memory models. Oh. And PFN_TO_PAGE used to be out of line. Since it is now so simple switch it back to inline. Signed-off-by: Christoph Lameter <clameter@sgi.com> Index: linux-2.6.21-rc5-mm2/arch/x86_64/Kconfig =================================================================== --- linux-2.6.21-rc5-mm2.orig/arch/x86_64/Kconfig 2007-03-31 23:47:58.000000000 -0700 +++ linux-2.6.21-rc5-mm2/arch/x86_64/Kconfig 2007-03-31 23:48:41.000000000 -0700 @@ -380,25 +380,29 @@ config NUMA_EMU number of nodes. This is only useful for debugging. 
config ARCH_DISCONTIGMEM_ENABLE - bool - depends on NUMA - default y + def_bool n config ARCH_DISCONTIGMEM_DEFAULT - def_bool y - depends on NUMA + def_bool n config ARCH_SPARSEMEM_ENABLE def_bool y - depends on (NUMA || EXPERIMENTAL) + +config SPARSEMEM_MANUAL + def_bool y + +config ARCH_SPARSE_VIRTUAL + def_bool y + +config SELECT_MEMORY_MODEL + def_bool n config ARCH_MEMORY_PROBE def_bool y depends on MEMORY_HOTPLUG config ARCH_FLATMEM_ENABLE - def_bool y - depends on !NUMA + def_bool n source "mm/Kconfig" @@ -411,8 +415,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID depends on NUMA config OUT_OF_LINE_PFN_TO_PAGE - def_bool y - depends on DISCONTIGMEM + def_bool n config NR_CPUS int "Maximum number of CPUs (2-255)" Index: linux-2.6.21-rc5-mm2/include/asm-x86_64/page.h =================================================================== --- linux-2.6.21-rc5-mm2.orig/include/asm-x86_64/page.h 2007-03-31 23:47:58.000000000 -0700 +++ linux-2.6.21-rc5-mm2/include/asm-x86_64/page.h 2007-03-31 23:48:41.000000000 -0700 @@ -135,6 +135,7 @@ typedef struct { unsigned long pgprot; } VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define __HAVE_ARCH_GATE_AREA 1 +#define vmemmap ((struct page *)0xffffe20000000000UL) #include <asm-generic/memory_model.h> #include <asm-generic/page.h> -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-01 7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter @ 2007-04-01 7:11 ` Christoph Lameter 2007-04-01 10:46 ` Andi Kleen 1 sibling, 0 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-01 7:11 UTC (permalink / raw) To: linux-kernel Cc: linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki Argh. This should have been Patch 2/2. There is nothing in between. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-01 7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter 2007-04-01 7:11 ` Christoph Lameter @ 2007-04-01 10:46 ` Andi Kleen 2007-04-02 15:37 ` Christoph Lameter ` (2 more replies) 1 sibling, 3 replies; 46+ messages in thread From: Andi Kleen @ 2007-04-01 10:46 UTC (permalink / raw) To: Christoph Lameter Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Sunday 01 April 2007 09:10, Christoph Lameter wrote: > x86_64 make SPARSE_VIRTUAL the default > > x86_64 is using 2M page table entries to map its 1-1 kernel space. > We implement the virtual memmap also using 2M page table entries. > So there is no difference at all to FLATMEM. Both schemes require > a page table and a TLB. Hmm, this means there is at least 2MB worth of struct page on every node? Or do you have overlaps with other memory (I think you have) In that case you have to handle the overlap in change_page_attr() Also your "generic" vmemmap code doesn't look very generic, but rather x86 specific. I didn't think huge pages could be easily set up this way in many other architectures. And when you reserve virtual space somewhere you should update Documentation/x86_64/mm.txt. Also you didn't adjust the end of the vmalloc area so in theory vmalloc could run into your vmemmap. > Thus the SPARSEMEM becomes the most efficient way of handling > virt_to_page, pfn_to_page and friends for UP, SMP and NUMA. Do you have any benchmarks numbers to prove it? There seem to be a few benchmarks where the discontig virt_to_page is a problem (although I know ways to make it more efficient), and sparsemem is normally slower. Still some numbers would be good. -Andi -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-01 10:46 ` Andi Kleen @ 2007-04-02 15:37 ` Christoph Lameter 2007-04-02 15:44 ` Andi Kleen ` (2 more replies) 2007-04-02 15:44 ` Christoph Lameter 2007-04-02 16:18 ` Christoph Lameter 2 siblings, 3 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 15:37 UTC (permalink / raw) To: Andi Kleen Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Sun, 1 Apr 2007, Andi Kleen wrote: > Hmm, this means there is at least 2MB worth of struct page on every node? > Or do you have overlaps with other memory (I think you have) > In that case you have to handle the overlap in change_page_attr() Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes with smaller amounts of memory? Note also that the default sparsemem section size is (include/asm-x86_64/sparsemem.h) #define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ 128MB .... So you currently cannot have smaller sections of memory anyways. > Also your "generic" vmemmap code doesn't look very generic, but > rather x86 specific. I didn't think huge pages could be easily > set up this way in many other architectures. We do this pmd special casing in other parts of the core VM. I have also a patch for IA64 that workks with this. > Do you have any benchmarks numbers to prove it? There seem to be a few > benchmarks where the discontig virt_to_page is a problem > (although I know ways to make it more efficient), and sparsemem > is normally slower. Still some numbers would be good. You want a benchmark to prove that the removal of memory references and code improves performance? -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 15:37 ` Christoph Lameter @ 2007-04-02 15:44 ` Andi Kleen 2007-04-02 15:51 ` Martin J. Bligh 2007-04-02 15:54 ` Christoph Lameter 2007-04-02 20:13 ` Dave Hansen 2007-04-02 23:03 ` Bjorn Helgaas 2 siblings, 2 replies; 46+ messages in thread From: Andi Kleen @ 2007-04-02 15:44 UTC (permalink / raw) To: Christoph Lameter Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Monday 02 April 2007 17:37, Christoph Lameter wrote: > On Sun, 1 Apr 2007, Andi Kleen wrote: > > > Hmm, this means there is at least 2MB worth of struct page on every node? > > Or do you have overlaps with other memory (I think you have) > > In that case you have to handle the overlap in change_page_attr() > > Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes > with smaller amounts of memory? Yes the discontigmem minimum is 64MB and there are some setups (mostly with numa emulation) where you end up with nodes that small. BTW there is no guarantee the node size is a multiple of 128MB so you likely need to handle the overlap case. Otherwise we can get cache corruptions > Note also that the default sparsemem > section size is (include/asm-x86_64/sparsemem.h) > > #define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ > > 128MB .... > > So you currently cannot have smaller sections of memory anyways. Sparsemem is still quite experimental; discontigmem is the default on x86-64. > > > Also your "generic" vmemmap code doesn't look very generic, but > > rather x86 specific. I didn't think huge pages could be easily > > set up this way in many other architectures. > > We do this pmd special casing in other parts of the core VM. I have also a > patch for IA64 that workks with this. > > > Do you have any benchmarks numbers to prove it? 
There seem to be a few > > benchmarks where the discontig virt_to_page is a problem > > (although I know ways to make it more efficient), and sparsemem > > is normally slower. Still some numbers would be good. > > You want a benchmark to prove that the removal of memory references and > code improves performance? You're just moving them into MMU, not really removing it. And need more TLB entries. It might be faster or it might not. There are some unexpected issues, like most x86-64 CPUs have a quite small number of large TLBs so you can get thrashing etc. So numbers with TLB intensive workloads would be good. -Andi -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 15:44 ` Andi Kleen @ 2007-04-02 15:51 ` Martin J. Bligh 2007-04-02 15:54 ` Christoph Lameter 1 sibling, 0 replies; 46+ messages in thread From: Martin J. Bligh @ 2007-04-02 15:51 UTC (permalink / raw) To: Andi Kleen Cc: Christoph Lameter, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki, Andy Whitcroft him on this stuff ? Andi Kleen wrote: > On Monday 02 April 2007 17:37, Christoph Lameter wrote: >> On Sun, 1 Apr 2007, Andi Kleen wrote: >> >>> Hmm, this means there is at least 2MB worth of struct page on every node? >>> Or do you have overlaps with other memory (I think you have) >>> In that case you have to handle the overlap in change_page_attr() >> Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes >> with smaller amounts of memory? > > Yes the discontigmem minimum is 64MB and there are some setups > (mostly with numa emulation) where you end up with nodes that small. We're actually using numa emulation to do real (container) things with. However, 128MB is still pretty small for that ... and worst case, we just waste 1MB for a 64MB node, right? Which isn't beautiful, but doesn't seem like the end of the world for an obscure corner case. >>> Do you have any benchmarks numbers to prove it? There seem to be a few >>> benchmarks where the discontig virt_to_page is a problem >>> (although I know ways to make it more efficient), and sparsemem >>> is normally slower. Still some numbers would be good. >> You want a benchmark to prove that the removal of memory references and >> code improves performance? > > You're just moving them into MMU, not really removing it. And need more TLB entries. > It might be faster or it might not. There are some unexpected issues, like most x86-64 > CPUs have a quite small number of large TLBs so you can get thrashing etc. > > So numbers with TLB intensive workloads would be good. 
There's also the possibility it just doesn't make enough difference to affect a real benchmark ... M. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 15:44 ` Andi Kleen 2007-04-02 15:51 ` Martin J. Bligh @ 2007-04-02 15:54 ` Christoph Lameter 2007-04-02 17:14 ` Andi Kleen 2007-04-02 19:54 ` Dave Hansen 1 sibling, 2 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 15:54 UTC (permalink / raw) To: Andi Kleen Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Andi Kleen wrote: > > Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes > > with smaller amounts of memory? > > Yes the discontigmem minimum is 64MB and there are some setups > (mostly with numa emulation) where you end up with nodes that small. Ok. Then I guess we cannot remove discontigmem. > BTW there is no guarantee the node size is a multiple of 128MB so > you likely need to handle the overlap case. Otherwise we can > get cache corruptions How does sparsemem handle that? > Sparsemem is still quite experimental; discontigmem is the default > on x86-64. Ok more fodder for preserving the choice. > > > Do you have any benchmarks numbers to prove it? There seem to be a few > > > benchmarks where the discontig virt_to_page is a problem > > > (although I know ways to make it more efficient), and sparsemem > > > is normally slower. Still some numbers would be good. > > > > You want a benchmark to prove that the removal of memory references and > > code improves performance? > > You're just moving them into MMU, not really removing it. And need more TLB entries. No no no. For the gazillions time: All of 1-1 mapped kernel memory on x86_64 needs a 2 MB page table entry. The virtual memmap uses the same. There are *no* additional TLBs used. > It might be faster or it might not. There are some unexpected issues, like most x86-64 > CPUs have a quite small number of large TLBs so you can get thrashing etc. > > So numbers with TLB intensive workloads would be good. You did not read the descriptions. Sigh. 
-- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 15:54 ` Christoph Lameter @ 2007-04-02 17:14 ` Andi Kleen 2007-04-02 17:34 ` Christoph Lameter 2007-04-02 19:54 ` Dave Hansen 1 sibling, 1 reply; 46+ messages in thread From: Andi Kleen @ 2007-04-02 17:14 UTC (permalink / raw) To: Christoph Lameter Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki > No no no. For the gazillions time: All of 1-1 mapped kernel memory on > x86_64 needs a 2 MB page table entry. The virtual memmap uses the same. > There are *no* additional TLBs used. But why do you reserve an own virtual area then if you claim to not use any additional mappings? -Andi -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 17:14 ` Andi Kleen @ 2007-04-02 17:34 ` Christoph Lameter 0 siblings, 0 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 17:34 UTC (permalink / raw) To: Andi Kleen Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Andi Kleen wrote: > > No no no. For the gazillions time: All of 1-1 mapped kernel memory on > > x86_64 needs a 2 MB page table entry. The virtual memmap uses the same. > > There are *no* additional TLBs used. > > But why do you reserve an own virtual area then if you claim to not use any > additional mappings? The 1-1 area using mappings for 2MB pages right? So it uses a virtual 1-1 area. It already has a virtual mapping. What we do for virtual memmap here is also use 2MB pages but order the pages a bit different so that they provide a linear memory map. So the number of TLBs in use stays the same. There are a few additional higher level page table pages that are needed to provide the alternate view that generates the linear mapping but that is just a couple of pages. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 15:54 ` Christoph Lameter 2007-04-02 17:14 ` Andi Kleen @ 2007-04-02 19:54 ` Dave Hansen 2007-04-02 20:11 ` Christoph Lameter 1 sibling, 1 reply; 46+ messages in thread From: Dave Hansen @ 2007-04-02 19:54 UTC (permalink / raw) To: Christoph Lameter Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2007-04-02 at 08:54 -0700, Christoph Lameter wrote: > > BTW there is no guarantee the node size is a multiple of 128MB so > > you likely need to handle the overlap case. Otherwise we can > > get cache corruptions > > How does sparsemem handle that? It doesn't. :) In practice, this situation never happens because we don't have any actual architectures that have any node boundaries on less than MAX_ORDER, and the section size is at least MAX_ORDER. If we *did* have this, then the page allocator would already be broken for these nodes. ;) So, this SPARSE_VIRTUAL does introduce a new dependency, which Andi calculated above. But, in reality, I don't think it's a big deal. Just to spell it out a bit more, if this: VMEMMAP_MAPPING_SIZE/sizeof(struct page) * PAGE_SIZE (where VMEMMAP_MAPPING_SIZE is PMD_SIZE in your case) is any larger than the granularity on which your NUMA nodes are divided, then you might have a problem with mem_map for one NUMA node getting allocated on another. It might be worth a comment, or at least some kind of WARN_ON(). Perhaps we can stick something in online_page() to check if: page_to_nid(page) == page_to_nid(virt_to_page(page)) -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 19:54 ` Dave Hansen @ 2007-04-02 20:11 ` Christoph Lameter 0 siblings, 0 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 20:11 UTC (permalink / raw) To: Dave Hansen Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Dave Hansen wrote: > MAX_ORDER, and the section size is at least MAX_ORDER. If we *did* have > this, then the page allocator would already be broken for these > nodes. ;) Ahh... Ok. > So, this SPARSE_VIRTUAL does introduce a new dependency, which Andi > calculated above. But, in reality, I don't think it's a big deal. Just > to spell it out a bit more, if this: > > VMEMMAP_MAPPING_SIZE/sizeof(struct page) * PAGE_SIZE > > (where VMEMMAP_MAPPING_SIZE is PMD_SIZE in your case) is any larger than > the granularity on which your NUMA nodes are divided, then you might > have a problem with mem_map for one NUMA node getting allocated on > another. This is only a problem if 1. We are not on NUMA emulation. In that case: Who cares. The SPARSEMEM sections make sure that the MAX_ORDER blocks do not overlap. 2. There is a hole less than 128 MB between the nodes. 3. The maximum overlap can then be theoretically less than 2M in terms of page structs. That is less than 128MB can overlap at the beginning of a node. Typically the start of a node gets used for allocation system control areas. I.e. node data, vmemmap 2M blocks etc. For those we only use the page structs during bootstrap. They are not performance critical. We can just ignore the problem for those. In order for this to become a problem the overlap would need to more than the management data at the front of a node. The larger the zone is the more 2M blocks will be allocated from the beginning a node and the less we can actually get into this situation. 
If this is an actual problem then we could take out this particular 2M page and replace it with single 4K pages that can be individually placed. Yaw.... Too complex. I think we can ignore this. The only problem could be reduced performance accessing page structs of some small portion of a node. > It might be worth a comment, or at least some kind of WARN_ON(). > Perhaps we can stick something in online_page() to check if: > > page_to_nid(page) == page_to_nid(virt_to_page(page)) Could do that but the check is going to be too agressive. Check would have to be done after all control information has been allocated. WARN_ON would be sufficient since this is not going to impact functionality. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 15:37 ` Christoph Lameter 2007-04-02 15:44 ` Andi Kleen @ 2007-04-02 20:13 ` Dave Hansen 2007-04-02 20:30 ` Christoph Lameter 2007-04-02 23:03 ` Bjorn Helgaas 2 siblings, 1 reply; 46+ messages in thread From: Dave Hansen @ 2007-04-02 20:13 UTC (permalink / raw) To: Christoph Lameter Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2007-04-02 at 08:37 -0700, Christoph Lameter wrote: > You want a benchmark to prove that the removal of memory references and > code improves performance? Yes, please. ;) I completely agree, it looks like it should be faster. The code certainly has potential benefits. But, to add this neato, apparently more performant feature, we unfortunately have to add code. Adding the code has a cost: code maintenance. This isn't a runtime cost, but it is a real, honest to goodness tradeoff. So, let's get some kind of concrete idea what the tradeoffs are. Is it, 400 lines of code gets us a 10% performance boost across the board, or that 400,000 lines gets us 0.1% on one specialized benchmark? BTW, I like the patches. Very nice and clean. -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 20:13 ` Dave Hansen @ 2007-04-02 20:30 ` Christoph Lameter 2007-04-02 20:38 ` Martin Bligh 2007-04-02 21:08 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Dave Hansen 0 siblings, 2 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 20:30 UTC (permalink / raw) To: Dave Hansen Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Dave Hansen wrote: > I completely agree, it looks like it should be faster. The code > certainly has potential benefits. But, to add this neato, apparently > more performant feature, we unfortunately have to add code. Adding the > code has a cost: code maintenance. This isn't a runtime cost, but it is > a real, honest to goodness tradeoff. Its just the opposite. The vmemmap code is so efficient that we can remove lots of other code and gops of these alternate implementations. On x86_64 its even superior to FLATMEM since FLATMEM still needs a memory reference for the mem_map area. So if we make SPARSE standard for all configurations then there is no need anymore for FLATMEM DISCONTIG etc etc. Can we not cleanup all this mess? Get rid of all the gazillions of #ifdefs please? This would ease code maintenance significantly. I hate having to constantly navigate my way through all the alternatives. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 20:30 ` Christoph Lameter @ 2007-04-02 20:38 ` Martin Bligh 2007-04-02 20:51 ` Christoph Lameter 2007-04-02 21:08 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Dave Hansen 1 sibling, 1 reply; 46+ messages in thread From: Martin Bligh @ 2007-04-02 20:38 UTC (permalink / raw) To: Christoph Lameter Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki, Andy Whitcroft Christoph Lameter wrote: > On Mon, 2 Apr 2007, Dave Hansen wrote: > >> I completely agree, it looks like it should be faster. The code >> certainly has potential benefits. But, to add this neato, apparently >> more performant feature, we unfortunately have to add code. Adding the >> code has a cost: code maintenance. This isn't a runtime cost, but it is >> a real, honest to goodness tradeoff. > > Its just the opposite. The vmemmap code is so efficient that we can remove > lots of other code and gops of these alternate implementations. On x86_64 > its even superior to FLATMEM since FLATMEM still needs a memory reference > for the mem_map area. So if we make SPARSE standard for all > configurations then there is no need anymore for FLATMEM DISCONTIG etc > etc. Can we not cleanup all this mess? Get rid of all the gazillions > of #ifdefs please? This would ease code maintenance significantly. I hate > having to constantly navigate my way through all the alternatives. The original plan when this was first merged was pretty much that - for sparsemem to replace discontigmem once it was well tested. Seems to have got stalled halfway through ;-( Not sure we'll get away with replacing flatmem for all arches, but we could at least get rid of discontigmem, it seems. M. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 20:38 ` Martin Bligh @ 2007-04-02 20:51 ` Christoph Lameter 2007-04-05 11:50 ` Andy Whitcroft 0 siblings, 1 reply; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 20:51 UTC (permalink / raw) To: Martin Bligh Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki, Andy Whitcroft On Mon, 2 Apr 2007, Martin Bligh wrote: > > Its just the opposite. The vmemmap code is so efficient that we can remove > > lots of other code and gops of these alternate implementations. On x86_64 > > its even superior to FLATMEM since FLATMEM still needs a memory reference > > for the mem_map area. So if we make SPARSE standard for all configurations > > then there is no need anymore for FLATMEM DISCONTIG etc etc. Can we not > > cleanup all this mess? Get rid of all the gazillions of #ifdefs please? This > > would ease code maintenance significantly. I hate having to constantly > > navigate my way through all the alternatives. > > The original plan when this was first merged was pretty much that - > for sparsemem to replace discontigmem once it was well tested. Seems > to have got stalled halfway through ;-( But you made big boboo in SPARSEMEM. Virtual memmap is a textbook case that was not covered. Instead this horrible stuff that involves calling functions in VM primitives. We could have been there years ago... > Not sure we'll get away with replacing flatmem for all arches, but > we could at least get rid of discontigmem, it seems. I think we could start with x86_64 and ia64. Both will work fine with SPARSE VIRTUAL (and SGIs concerns about performance are addressed) and we could remove the other alternatives. That is going to throw out lots of stuff. Then proceed to other arches Could the SPARSEMEM folks take this over this patch? You have more resources and I am all alone on this. I will post another patchset today that also includes an IA64 implementation. 
-- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 20:51 ` Christoph Lameter @ 2007-04-05 11:50 ` Andy Whitcroft 2007-04-05 18:27 ` Christoph Lameter 0 siblings, 1 reply; 46+ messages in thread From: Andy Whitcroft @ 2007-04-05 11:50 UTC (permalink / raw) To: Christoph Lameter Cc: Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki Christoph Lameter wrote: > On Mon, 2 Apr 2007, Martin Bligh wrote: > >>> Its just the opposite. The vmemmap code is so efficient that we can remove >>> lots of other code and gops of these alternate implementations. On x86_64 >>> its even superior to FLATMEM since FLATMEM still needs a memory reference >>> for the mem_map area. So if we make SPARSE standard for all configurations >>> then there is no need anymore for FLATMEM DISCONTIG etc etc. Can we not >>> cleanup all this mess? Get rid of all the gazillions of #ifdefs please? This >>> would ease code maintenance significantly. I hate having to constantly >>> navigate my way through all the alternatives. >> The original plan when this was first merged was pretty much that - >> for sparsemem to replace discontigmem once it was well tested. Seems >> to have got stalled halfway through ;-( > > But you made big boboo in SPARSEMEM. Virtual memmap is a textbook case > that was not covered. Instead this horrible stuff that involves calling > functions in VM primitives. We could have been there years ago... > >> Not sure we'll get away with replacing flatmem for all arches, but >> we could at least get rid of discontigmem, it seems. > > I think we could start with x86_64 and ia64. Both will work fine with > SPARSE VIRTUAL (and SGIs concerns about performance are addressed) and we > could remove the other alternatives. That is going to throw out lots of > stuff. Then proceed to other arches > > Could the SPARSEMEM folks take this over this patch? You have more > resources and I am all alone on this. 
I will post another patchset today > that also includes an IA64 implementation. That would be me. I have been offline writing for OLS and did not get to respond before this. When we first saw these patches the reaction was a general positive despite skepticism on the general utility of vmemmap. The patches appear to provide an architecture neutral implementation. At that time s390 was just starting to use vmemmap bringing 2x implementations into the kernel. Now we have three. Clearly, even if vmemmap was a net performance loss having only one implementation has to be a good thing for maintainability/coverage. Without some benchmarking and some general testing it certainly is premature to be removing the other memory models, but for sure that was the original plan. The original assumption that as time went on the other models would wither on the vine, this has only happened on powerpc so far. So I will go and: 1) talk to s390 and find out if they can use this same form, as one implementation of vmemmap only should be the goal, and 2) take the current patches and try and get some benchmarks for them against other memory models Christoph if you could let us know which benchmarks you are seeing gains with that would be a help. -apw -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-05 11:50 ` Andy Whitcroft @ 2007-04-05 18:27 ` Christoph Lameter 2007-04-07 22:06 ` [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) Christoph Lameter 0 siblings, 1 reply; 46+ messages in thread From: Christoph Lameter @ 2007-04-05 18:27 UTC (permalink / raw) To: Andy Whitcroft Cc: Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki On Thu, 5 Apr 2007, Andy Whitcroft wrote: > Christoph if you could let us know which benchmarks you are seeing gains > with that would be a help. You saw the numbers that Ken got with the pipe test right? Then there are some minor improvements if you run AIM7. I got into this because I saw the code that was generated by sparsemem and discontig. Sparsemem was doing a series of lookups and so did pure discontig on x86_64. Both were really too complex for inlining. Discontig/Vmemmap on IA64 was sort of cleaner but there was still a reference to a variable plus it was creating the needs for lots of TLBs. These VM primitives are pretty critical and the code for those needs to be minimal and inline. So I removed the variable ref from discontig/Vmemmap and reduced the wokr to be done to the simple formula (that I also found in textbooks on memmap management). Avoided the TLB pressure by using the 1-1 mappings page size. I could get real some performance numbers for this by sticking in a performance counter before and after virt_to_page and page_address. But I am pretty sure about the result just looking at the code. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) 2007-04-05 18:27 ` Christoph Lameter @ 2007-04-07 22:06 ` Christoph Lameter 2007-04-07 22:18 ` Andi Kleen 2007-04-09 16:40 ` William Lee Irwin III 0 siblings, 2 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-07 22:06 UTC (permalink / raw) To: Andy Whitcroft Cc: Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki On Thu, 5 Apr 2007, Christoph Lameter wrote: > On Thu, 5 Apr 2007, Andy Whitcroft wrote: > > Christoph if you could let us know which benchmarks you are seeing gains > > with that would be a help. > > You saw the numbers that Ken got with the pipe test right? > > Then there are some minor improvements if you run AIM7. > > I could get real some performance numbers for this by sticking in a > performance counter before and after virt_to_page and page_address. But I > am pretty sure about the result just looking at the code. Ok. Since I keep being asked, I stuck a performance counter in kfree on x86_64 to see the difference in performance: Results: x86_64 boot with virtual memmap Format: #events totaltime (min/avg/max) kfree_virt_to_page 598430 5.6ms(3ns/9ns/322ns) x86_64 boot regular sparsemem kfree_virt_to_page 596360 10.5ms(4ns/18ns/28.7us) On average sparsemem virtual takes half the time than of sparsemem. Note that the maximum time for regular sparsemem is way higher than sparse virtual. This reflects the possibility that regular sparsemem may once in a while have to deal with a cache miss whereas sparsemem virtual has no memory reference. Thus the numbers stay consistently low. Patch that was used to get these results (this is not very clean sorry but it should be enough to verify the results): Simple Performance Counters This patch allows the use of simple performance counters to measure time intervals in the kernel source code. 
This allows a detailed analysis of the time spend and the amount of data processed in specific code sections of the kernel. Time is measured using the cycle counter (TSC on IA32, ITC on IA64) which has a very low latency. To use add #include <linux/perf.h> to the header of the file where the measurement needs to take place. Then add the folowing to the code: To declare a time stamp do struct pc pc; To mark the beginning of the time measurement do pc_start(&pc, <counter>) (If measurement from the beginning of a function is desired one may use INIT_PC(xx) instead). To mark the end of the time frame do: pc_stop(&pc); or if the amount of data transferred needs to be measured as well: pc_throughput(&pc, number-of-bytes); The measurements will show up in /proc/perf/all. Processor specific statistics may be obtained via /proc/perf/<nr-of-processor>. Writing to /proc/perf/reset will reset all counters. F.e. echo >/proc/perf/reset The first counter is the number of times that the time measurement was performed. (+ xx) is the number of samples that were thrown away since the processor on which the process is running changed. Cycle counters may not be consistent across different processors. Then follows the sum of the time spend in the code segment followed in parentheses by the minimum / average / maximum time spent there. The second block are the sizes of data processed. 
Signed-off-by: Christoph Lameter <clameter@sgi.com> Index: linux-2.6.21-rc5-mm4/kernel/Makefile =================================================================== --- linux-2.6.21-rc5-mm4.orig/kernel/Makefile 2007-04-07 14:04:32.000000000 -0700 +++ linux-2.6.21-rc5-mm4/kernel/Makefile 2007-04-07 14:05:12.000000000 -0700 @@ -55,6 +55,7 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_UTRACE) += utrace.o obj-$(CONFIG_PTRACE) += ptrace.o +obj-y += perf.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is Index: linux-2.6.21-rc5-mm4/include/linux/perf.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.21-rc5-mm4/include/linux/perf.h 2007-04-07 14:05:12.000000000 -0700 @@ -0,0 +1,49 @@ +/* + * Performance Counters and Measurement macros + * (C) 2005 Silicon Graphics Incorporated + * by Christoph Lameter <clameter@sgi.com>, April 2005 + * + * Counters are calculated using the cycle counter. If a process + * is migrated to another cpu during the measurement then the measurement + * is invalid. + * + * We cannot disable preemption during measurement since that may interfere + * with other things in the kernel and limit the usefulness of the counters. 
+ */ + +enum pc_item { + PC_KFREE_VIRT_TO_PAGE, + PC_PTE_ALLOC, + PC_PTE_FREE, + PC_PMD_ALLOC, + PC_PMD_FREE, + PC_PUD_ALLOC, + PC_PUD_FREE, + PC_PGD_ALLOC, + PC_PGD_FREE, + NR_PC_ITEMS +}; + +/* + * Information about the start of the measurement + */ +struct pc { + unsigned long time; + int processor; + enum pc_item item; +}; + +static inline void pc_start(struct pc *pc, enum pc_item nr) +{ + pc->item = nr; + pc->processor = smp_processor_id(); + pc->time = get_cycles(); +} + +#define INIT_PC(__var, __item) struct pc __var = \ + { get_cycles(), smp_processor_id(), __item } + +void pc_throughput(struct pc *pc, unsigned long bytes); + +#define pc_stop(__pc) pc_throughput(__pc, 0) + Index: linux-2.6.21-rc5-mm4/kernel/perf.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.21-rc5-mm4/kernel/perf.c 2007-04-07 14:05:12.000000000 -0700 @@ -0,0 +1,345 @@ +/* + * Simple Performance Counter subsystem + * + * (C) 2007 sgi. 
+ * + * April 2007, Christoph Lameter <clameter@sgi.com> + */ + +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/cpumask.h> +#include <linux/perf.h> +/* For the hash function */ +#include <linux/dcache.h> + +#ifdef CONFIG_IA64 +#define cycles_to_ns(x) (((x) * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT) +#elif defined(CONFIG_X86_64) +#define cycles_to_ns(x) cycles_2_ns(x) +#else +#error "cycles_to_ns not defined for this architecture" +#endif + +const char *var_id[NR_PC_ITEMS] = { + "kfree_virt_to_page", + "pte_alloc", + "pte_free", + "pmd_alloc", + "pmd_free", + "pud_alloc", + "pud_free", + "pgd_alloc", + "pgd_free" +}; + +struct perf_counter { + u32 events; + u32 mintime; + u32 maxtime; + u32 minbytes; + u32 maxbytes; + u32 skipped; + u64 time; + u64 bytes; +}; + +static DEFINE_PER_CPU(struct perf_counter, perf_counters)[NR_PC_ITEMS]; + +void pc_throughput(struct pc *pc, unsigned long bytes) +{ + unsigned long time = get_cycles(); + unsigned long ns; + int cpu = smp_processor_id(); + struct perf_counter *p = &get_cpu_var(perf_counters)[pc->item]; + + if (unlikely(pc->item >= NR_PC_ITEMS)) { + printk(KERN_CRIT "pc_throughput: item (%d) out of range\n", + pc->item); + dump_stack(); + goto out; + } + + if (unlikely(pc->processor != cpu)) { + /* On different processor. TSC measurement not possible. 
*/ + p->skipped++; + goto out; + } + + ns = cycles_to_ns(time - pc->time); + if (unlikely(ns > (1UL << (BITS_PER_LONG - 2)))) { + printk(KERN_ERR "perfcount %s: invalid time difference.\n", + var_id[pc->item]); + goto out; + } + + p->time += ns; + p->events++; + + if (ns > p->maxtime) + p->maxtime = ns; + + if (p->mintime == 0 || ns < p->mintime) + p->mintime = ns; + + if (bytes) { + p->bytes += bytes; + if (bytes > p->maxbytes) + p->maxbytes = bytes; + if (p->minbytes == 0 || bytes < p->minbytes) + p->minbytes = bytes; + } +out: + put_cpu_var(); + return; +} +EXPORT_SYMBOL(pc_throughput); + +static void reset_perfcount_item(struct perf_counter *c) +{ + c->events =0; + c->time =0; + c->maxtime =0; + c->mintime =0; + c->bytes =0; + c->minbytes =0; + c->maxbytes =0; +} + +static void perfcount_reset(void) { + int cpu; + enum pc_item i; + + for_each_online_cpu(cpu) + for (i = 0; i < NR_PC_ITEMS; i++) + reset_perfcount_item( + &per_cpu(perf_counters, cpu)[i]); +} + +struct unit { + unsigned int n; + const char * s; +}; + +static const struct unit event_units[] = { + { 1000, "" }, + { 1000, "K" }, + { 1000, "M" }, + { 1000, "G" }, + { 1000, "T" }, + { 1000, "P" }, + { 1000, "XX" }, +}; + + +static const struct unit time_units[] = { + { 1000, "ns" }, + { 1000, "us" }, + { 1000, "ms" }, + { 60, "s" }, + { 60, "m" }, + { 24, "h" }, + { 365, "d" }, + { 1000, "y" }, +}; + +static const struct unit byte_units[] = { + { 1000, "b" }, + { 1000, "kb" }, + { 1000, "mb" }, + { 1000, "gb" }, + { 1000, "tb" }, + { 1000, "pb" }, + { 1000, "xb" } +}; + +/* Print a value using the given array of units and scale it properly */ +static void pval(struct seq_file *s, unsigned long x, const struct unit *u) +{ + unsigned n = 0; + unsigned rem = 0; + unsigned last_divisor = 0; + + while (x >= u[n].n) { + last_divisor = u[n].n; + rem = x % last_divisor; + x = x / last_divisor; + n++; + } + + if (last_divisor) + rem = (rem * 10 + last_divisor / 2) / last_divisor; + else + rem = 0; + + /* + * 
Rounding may have resulted in the need to go + * to the next number + */ + if (rem == 10) { + x++; + rem = 0; + }; + + seq_printf(s, "%lu", x); + if (rem) { + seq_putc(s, '.'); + seq_putc(s, '0' + rem); + } + seq_puts(s, u[n].s); +} + +/* Print a set of statistical values in the form sum(max/avg/min) */ +static void pc_print(struct seq_file *s, const struct unit *u, + unsigned long count, unsigned long sum, + unsigned long min, unsigned long max) +{ + pval(s, sum, u); + seq_putc(s,'('); + pval(s, min, u); + seq_putc(s,'/'); + if (count) + pval(s, (sum + count / 2 ) / count, u); + else + pval(s, 0, u); + seq_putc(s,'/'); + pval(s, max, u); + seq_putc(s,')'); +} + + +static int perf_show(struct seq_file *s, void *v) +{ + int cpu = (unsigned long)s->private; + enum pc_item counter = (unsigned long)v - 1; + struct perf_counter summary, *x; + + if (cpu >= 0) + x = &per_cpu(perf_counters, cpu)[counter]; + else { + memcpy(&summary, &per_cpu(perf_counters, 0)[counter], + sizeof(summary)); + for_each_online_cpu(cpu) { + struct perf_counter *c = + &per_cpu(perf_counters, 0)[counter]; + + summary.events += c->events; + summary.skipped += c->skipped; + summary.time += c->time; + summary.bytes += c->bytes; + + if (summary.maxtime < c->maxtime) + summary.maxtime = c->maxtime; + + if (summary.mintime == 0 || + (c->mintime != 0 && + summary.mintime > c->mintime)) + summary.mintime = c->mintime; + + if (summary.maxbytes < c->maxbytes) + summary.maxbytes = c->maxbytes; + + if (summary.minbytes == 0 || + (c->minbytes != 0 && + summary.minbytes > c->minbytes)) + summary.minbytes = c->minbytes; + + } + x = &summary; + } + + seq_printf(s, "%-20s %10u ", var_id[counter], x->events); + if (x->skipped) + seq_printf(s, "(+%3u) ", x->skipped); + pc_print(s, time_units, x->events, x->time, x->mintime, x->maxtime); + if (x->bytes) { + seq_putc(s,' '); + pc_print(s, byte_units, x->events, x->bytes, x->minbytes, x->maxbytes); + } + seq_putc(s, '\n'); + return 0; +} + +static void 
*perf_start(struct seq_file *m, loff_t *pos) +{ + return (*pos < NR_PC_ITEMS) ? (void *)(*pos +1) : NULL; +} + +static void *perf_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return perf_start(m, pos); +} + +static void perf_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations perf_data_ops = { + .start = perf_start, + .next = perf_next, + .stop = perf_stop, + .show = perf_show, +}; + +static int perf_data_open(struct inode *inode, struct file *file) +{ + int res; + + res = seq_open(file, &perf_data_ops); + if (!res) + ((struct seq_file *)file->private_data)->private = PDE(inode)->data; + + return res; +} + +static struct file_operations perf_data_fops = { + .open = perf_data_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int perf_reset_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + perfcount_reset(); + return count; +} + +static __init int init_perfcounter(void) { + int cpu; + + struct proc_dir_entry *proc_perf, *perf_reset, *perf_all; + + proc_perf = proc_mkdir("perf", NULL); + if (!proc_perf) + return -ENOMEM; + + perf_reset = create_proc_entry("reset", S_IWUGO, proc_perf); + perf_reset->write_proc = perf_reset_write; + + perf_all = create_proc_entry("all", S_IRUGO, proc_perf); + perf_all->proc_fops = &perf_data_fops; + perf_all->data = (void *)-1; + + for_each_possible_cpu(cpu) { + char name[20]; + struct proc_dir_entry *p; + + sprintf(name, "%d", cpu); + p = create_proc_entry(name, S_IRUGO, proc_perf); + + p->proc_fops = &perf_data_fops; + p->data = (void *)(unsigned long)cpu; + } + + perfcount_reset(); + return 0; +} + +__initcall(init_perfcounter); + Index: linux-2.6.21-rc5-mm4/arch/x86_64/kernel/tsc.c =================================================================== --- linux-2.6.21-rc5-mm4.orig/arch/x86_64/kernel/tsc.c 2007-04-07 14:04:32.000000000 -0700 +++ linux-2.6.21-rc5-mm4/arch/x86_64/kernel/tsc.c 2007-04-07 14:05:12.000000000 -0700 
@@ -23,7 +23,7 @@ cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; } -static unsigned long long cycles_2_ns(unsigned long long cyc) +unsigned long long cycles_2_ns(unsigned long long cyc) { return (cyc * cyc2ns_scale) >> NS_SCALE; } Index: linux-2.6.21-rc5-mm4/include/asm-x86_64/timex.h =================================================================== --- linux-2.6.21-rc5-mm4.orig/include/asm-x86_64/timex.h 2007-04-07 14:04:32.000000000 -0700 +++ linux-2.6.21-rc5-mm4/include/asm-x86_64/timex.h 2007-04-07 14:05:12.000000000 -0700 @@ -29,4 +29,5 @@ extern void mark_tsc_unstable(void); extern void set_cyc2ns_scale(unsigned long khz); +unsigned long long cycles_2_ns(unsigned long long cyc); #endif Index: linux-2.6.21-rc5-mm4/mm/slub.c =================================================================== --- linux-2.6.21-rc5-mm4.orig/mm/slub.c 2007-04-07 14:05:11.000000000 -0700 +++ linux-2.6.21-rc5-mm4/mm/slub.c 2007-04-07 14:05:15.000000000 -0700 @@ -20,6 +20,7 @@ #include <linux/mempolicy.h> #include <linux/ctype.h> #include <linux/kallsyms.h> +#include <linux/perf.h> /* * Lock order: @@ -1299,8 +1302,11 @@ void kmem_cache_free(struct kmem_cache *s, void *x) { struct page * page; + struct pc pc; + pc_start(&pc, PC_KFREE_VIRT_TO_PAGE); page = virt_to_page(x); + pc_stop(&pc); page = compound_head(page); @@ -1917,11 +1923,16 @@ { struct kmem_cache *s; struct page * page; + struct pc pc; if (!x) return; - page = compound_head(virt_to_page(x)); + pc_start(&pc, PC_KFREE_VIRT_TO_PAGE); + page = virt_to_page(x); + pc_stop(&pc); + + page = compound_head(page); s = page->slab; Index: linux-2.6.21-rc5-mm4/kernel/fork.c =================================================================== --- linux-2.6.21-rc5-mm4.orig/kernel/fork.c 2007-04-07 14:04:32.000000000 -0700 +++ linux-2.6.21-rc5-mm4/kernel/fork.c 2007-04-07 14:05:12.000000000 -0700 @@ -58,6 +58,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <linux/perf.h> + /* * Protected counters by 
write_lock_irq(&tasklist_lock) */ @@ -304,7 +306,10 @@ static inline int mm_alloc_pgd(struct mm_struct * mm) { + INIT_PC(pc, PC_PGD_ALLOC); + mm->pgd = pgd_alloc(mm); + pc_stop(&pc); if (unlikely(!mm->pgd)) return -ENOMEM; return 0; @@ -312,7 +317,9 @@ static inline void mm_free_pgd(struct mm_struct * mm) { + INIT_PC(pc, PC_PGD_FREE); pgd_free(mm->pgd); + pc_stop(&pc); } #else #define dup_mmap(mm, oldmm) (0) Index: linux-2.6.21-rc5-mm4/mm/memory.c =================================================================== --- linux-2.6.21-rc5-mm4.orig/mm/memory.c 2007-04-07 14:04:32.000000000 -0700 +++ linux-2.6.21-rc5-mm4/mm/memory.c 2007-04-07 14:05:12.000000000 -0700 @@ -60,6 +60,8 @@ #include <linux/swapops.h> #include <linux/elf.h> +#include <linux/perf.h> + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -302,15 +304,22 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { + INIT_PC(pc, PC_PTE_ALLOC); struct page *new = pte_alloc_one(mm, address); + + pc_stop(&pc); if (!new) return -ENOMEM; pte_lock_init(new); spin_lock(&mm->page_table_lock); if (pmd_present(*pmd)) { /* Another has populated it */ + struct pc pc; + pte_lock_deinit(new); + pc_start(&pc, PC_PTE_FREE); pte_free(new); + pc_stop(&pc); } else { mm->nr_ptes++; inc_zone_page_state(new, NR_PAGETABLE); @@ -2509,14 +2518,20 @@ */ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { + INIT_PC(pc, PC_PUD_ALLOC); pud_t *new = pud_alloc_one(mm, address); + + pc_stop(&pc); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ + { + INIT_PC(pc, PC_PUD_FREE); pud_free(new); - else + pc_stop(&pc); + } else pgd_populate(mm, pgd, new); spin_unlock(&mm->page_table_lock); return 0; @@ -2530,20 +2545,29 @@ */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { + INIT_PC(pc, PC_PMD_ALLOC); pmd_t *new = pmd_alloc_one(mm, 
address); + + pc_stop(&pc); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) /* Another has populated it */ + { + INIT_PC(pc, PC_PMD_FREE); pmd_free(new); - else + pc_stop(&pc); + } else pud_populate(mm, pud, new); #else if (pgd_present(*pud)) /* Another has populated it */ + { + INIT_PC(pc, PC_PMD_FREE); pmd_free(new); - else + pc_stop(&pc); + } else pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ spin_unlock(&mm->page_table_lock); Index: linux-2.6.21-rc5-mm4/mm/slab.c =================================================================== --- linux-2.6.21-rc5-mm4.orig/mm/slab.c 2007-04-07 14:22:11.000000000 -0700 +++ linux-2.6.21-rc5-mm4/mm/slab.c 2007-04-07 14:24:14.000000000 -0700 @@ -621,7 +621,10 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) { + INIT_PC(PC_KFREE_VIRT_TO_PAGE); struct page *page = virt_to_page(obj); + + pc_stop(&pc); return page_get_cache(page); } -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) 2007-04-07 22:06 ` [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) Christoph Lameter @ 2007-04-07 22:18 ` Andi Kleen 2007-04-09 16:40 ` William Lee Irwin III 1 sibling, 0 replies; 46+ messages in thread From: Andi Kleen @ 2007-04-07 22:18 UTC (permalink / raw) To: Christoph Lameter Cc: Andy Whitcroft, Martin Bligh, Dave Hansen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki On Sunday 08 April 2007 00:06:13 Christoph Lameter wrote: > Results: > > x86_64 boot with virtual memmap > > Format: #events totaltime (min/avg/max) > > kfree_virt_to_page 598430 5.6ms(3ns/9ns/322ns) > > x86_64 boot regular sparsemem > > kfree_virt_to_page 596360 10.5ms(4ns/18ns/28.7us) > > > On average sparsemem virtual takes half the time of regular sparsemem. Nice. But on what workloads? Anyways it looks promising. I hope we can just replace old style sparsemem support with this for x86-64. > Time is measured using the cycle counter (TSC on IA32, ITC on IA64) which has > a very low latency. Sorry that triggered my usual RDTSC rant... Not on NetBurst (hundreds of cycles) And on the others (C2,K8) it is a bit dangerous to measure short code blocks because RDTSC is not guaranteed ordered with the surrounding instructions. -Andi -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) 2007-04-07 22:06 ` [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) Christoph Lameter 2007-04-07 22:18 ` Andi Kleen @ 2007-04-09 16:40 ` William Lee Irwin III 2007-04-09 17:16 ` Christoph Lameter 1 sibling, 1 reply; 46+ messages in thread From: William Lee Irwin III @ 2007-04-09 16:40 UTC (permalink / raw) To: Christoph Lameter Cc: Andy Whitcroft, Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki On Sat, Apr 07, 2007 at 03:06:13PM -0700, Christoph Lameter wrote: > +/* > + * Performance Counters and Measurement macros > + * (C) 2005 Silicon Graphics Incorporated > + * by Christoph Lameter <clameter@sgi.com>, April 2005 > + * > + * Counters are calculated using the cycle counter. If a process > + * is migrated to another cpu during the measurement then the measurement > + * is invalid. > + * > + * We cannot disable preemption during measurement since that may interfere > + * with other things in the kernel and limit the usefulness of the counters. > + */ Whatever's going on with the rest of this, I really like this instrumentation patch. It may be worthwhile to allow pc_start() to be overridden so things like performance counter MSR's are usable, but the framework looks very useful. -- wli -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) 2007-04-09 16:40 ` William Lee Irwin III @ 2007-04-09 17:16 ` Christoph Lameter 2007-04-09 18:20 ` William Lee Irwin III 0 siblings, 1 reply; 46+ messages in thread From: Christoph Lameter @ 2007-04-09 17:16 UTC (permalink / raw) To: William Lee Irwin III Cc: Andy Whitcroft, Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki On Mon, 9 Apr 2007, William Lee Irwin III wrote: > Whatever's going on with the rest of this, I really like this > instrumentation patch. It may be worthwhile to allow pc_start() to be > overridden so things like performance counter MSR's are usable, but > the framework looks very useful. Yeah. I also did some measurements on quicklists on x86_64 and it seems that caching page table pages is also useful: no quicklist pte_alloc 1569048 4.3s(401ns/2.7us/179.7us) pmd_alloc 780988 2.1s(337ns/2.7us/86.1us) pud_alloc 780072 2.2s(424ns/2.8us/300.6us) pgd_alloc 260022 1s(920ns/4us/263.1us) quicklist: pte_alloc 452436 573.4ms(8ns/1.3us/121.1us) pmd_alloc 196204 174.5ms(7ns/889ns/46.1us) pud_alloc 195688 172.4ms(7ns/881ns/151.3us) pgd_alloc 65228 9.8ms(8ns/150ns/6.1us) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) 2007-04-09 17:16 ` Christoph Lameter @ 2007-04-09 18:20 ` William Lee Irwin III 0 siblings, 0 replies; 46+ messages in thread From: William Lee Irwin III @ 2007-04-09 18:20 UTC (permalink / raw) To: Christoph Lameter Cc: Andy Whitcroft, Martin Bligh, Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki On Mon, 9 Apr 2007, William Lee Irwin III wrote: >> Whatever's going on with the rest of this, I really like this >> instrumentation patch. It may be worthwhile to allow pc_start() to be >> overridden so things like performance counter MSR's are usable, but >> the framework looks very useful. On Mon, Apr 09, 2007 at 10:16:06AM -0700, Christoph Lameter wrote: > Yeah. I also did some measurements on quicklists on x86_64 and it seems > that caching page table pages is also useful: [...] Sadly these timings will not address the arguments made on behalf of eager zeroing, which are essentially that "precharging the cache" will benefit fork(). I personally still find them meaningful. I think a demonstration that will deal with more of others' concerns might be instrumenting fault latency after a fresh execve() and comparing fork() timings. This is clearly awkward given the scheduling opportunities in these areas, but a virtual time measurement may suffice to cope with those. The fault latency after a fresh execve() is particularly relevant because it's a scenario where incremental pagetable construction occurs in "real life." It's actually less common for processes to merely fork() than to fork() then execve() and then perform most of their work in the context set up in execve(). 
The pagetables set up for fork() are most commonly short-lived and the utility of caching them or even constructing them so questionable that pagetable sharing and other methods of avoiding the copy are quite plausible, though perhaps in need of some heuristics to avoid new faults for the purposes of merely copying pagetables where possible (AIUI such are considered or implemented in pagetable sharing patches; of course, Dave McCracken has far more detailed knowledge of the performance considerations there). I daresay it's highly dubious to consider pagetable construction for fork() in isolation when the common case is to almost immediately flush all those pagetables down the toilet via execve(). Basically, if we can establish that (1) pagetable caching doesn't hurt fork() or maybe even speeds it up then (2) it speeds up post-execve() faults, then things look good. Raw timings on the pagetable allocation primitives are unfortunately too micro to adequately make our case. -- wli -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 20:30 ` Christoph Lameter 2007-04-02 20:38 ` Martin Bligh @ 2007-04-02 21:08 ` Dave Hansen 2007-04-02 21:28 ` Christoph Lameter 1 sibling, 1 reply; 46+ messages in thread From: Dave Hansen @ 2007-04-02 21:08 UTC (permalink / raw) To: Christoph Lameter Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2007-04-02 at 13:30 -0700, Christoph Lameter wrote: > On Mon, 2 Apr 2007, Dave Hansen wrote: > > I completely agree, it looks like it should be faster. The code > > certainly has potential benefits. But, to add this neato, apparently > > more performant feature, we unfortunately have to add code. Adding the > > code has a cost: code maintenance. This isn't a runtime cost, but it is > > a real, honest to goodness tradeoff. > > Its just the opposite. The vmemmap code is so efficient that we can remove > lots of other code and gops of these alternate implementations. We do want to make sure that there isn't anyone relying on these. Are you thinking of simple sparsemem vs. extreme vs. sparsemem vmemmap? Or, are you thinking of sparsemem vs. discontig? > On x86_64 > its even superior to FLATMEM since FLATMEM still needs a memory reference > for the mem_map area. So if we make SPARSE standard for all > configurations then there is no need anymore for FLATMEM DISCONTIG etc > etc. Can we not cleanup all this mess? Get rid of all the gazillions > of #ifdefs please? This would ease code maintenance significantly. I hate > having to constantly navigate my way through all the alternatives. Amen, brother. I'd love to see DISCONTIG die, with sufficient testing, of course. Andi, do you have any ideas on how to get sparsemem out of the 'experimental' phase? I have noticed before that sparsemem should be able to cover the flatmem case if we make MAX_PHYSMEM_BITS == SECTION_SIZE_BITS and massage from there. 
-- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 21:08 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Dave Hansen @ 2007-04-02 21:28 ` Christoph Lameter 2007-04-02 21:43 ` Martin Bligh ` (2 more replies) 0 siblings, 3 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 21:28 UTC (permalink / raw) To: Dave Hansen Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Dave Hansen wrote: > On Mon, 2007-04-02 at 13:30 -0700, Christoph Lameter wrote: > > On Mon, 2 Apr 2007, Dave Hansen wrote: > > > I completely agree, it looks like it should be faster. The code > > > certainly has potential benefits. But, to add this neato, apparently > > > more performant feature, we unfortunately have to add code. Adding the > > > code has a cost: code maintenance. This isn't a runtime cost, but it is > > > a real, honest to goodness tradeoff. > > > > Its just the opposite. The vmemmap code is so efficient that we can remove > > lots of other code and gops of these alternate implementations. > > We do want to make sure that there isn't anyone relying on these. Are > you thinking of simple sparsemem vs. extreme vs. sparsemem vmemmap? Or, > are you thinking of sparsemem vs. discontig? I am thinking sparsemem default and then get rid discontig, flatmem etc. On many platforms this will work. Flatmem for embedded could just be a variation on sparse_virtual. > Amen, brother. I'd love to see DISCONTIG die, with sufficient testing, > of course. Andi, do you have any ideas on how to get sparsemem out of > the 'experimental' phase? Note that these arguments on DISCONTIG are flame bait for many SGIers. We usually see this as an attack on DISCONTIG/VMEMMAP which is the existing best performing implementation for page_to_pfn and vice versa. Please lets stop the polarization. We want one consistent scheme to manage memory everywhere. 
I do not care what it's called as long as it covers all the bases and is not a glaring performance regression (like SPARSEMEM so far). > I have noticed before that sparsemem should be able to cover the flatmem > case if we make MAX_PHYSMEM_BITS == SECTION_SIZE_BITS and massage from > there. Right. But for embedded the memory map base cannot be constant because they may not be able to have a fixed address in memory. So memory map needs to become a variable. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 21:28 ` Christoph Lameter @ 2007-04-02 21:43 ` Martin Bligh 2007-04-02 21:56 ` Dave Hansen 2007-04-04 21:27 ` Bob Picco 2 siblings, 0 replies; 46+ messages in thread From: Martin Bligh @ 2007-04-02 21:43 UTC (permalink / raw) To: Christoph Lameter Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki, Andy Whitcroft > Note that these arguments on DISCONTIG are flame bait for many SGIers. > We usually see this as an attack on DISCONTIG/VMEMMAP which is the > existing best performing implementation for page_to_pfn and vice > versa. Please lets stop the polarization. We want one consistent scheme > to manage memory everywhere. I do not care what its called as long as it > covers all the bases and is not a glaring performance regresssion (like > SPARSEMEM so far). The main conceptual difference (in my mind) was not having one bastardized data structure (pg_data_t) that meant different things in different situations (is it a node, or a section of discontig mem?). Also we didn't support discontig mem within a node (at least with the old discontigmem), which was partly the result of that hybridization. Beyond that, it's just naming really. M. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 21:28 ` Christoph Lameter 2007-04-02 21:43 ` Martin Bligh @ 2007-04-02 21:56 ` Dave Hansen 2007-04-02 22:29 ` Christoph Lameter 2007-04-02 22:31 ` Andi Kleen 2007-04-04 21:27 ` Bob Picco 2 siblings, 2 replies; 46+ messages in thread From: Dave Hansen @ 2007-04-02 21:56 UTC (permalink / raw) To: Christoph Lameter Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2007-04-02 at 14:28 -0700, Christoph Lameter wrote: > I do not care what its called as long as it > covers all the bases and is not a glaring performance regresssion (like > SPARSEMEM so far). I honestly don't doubt that there are regressions, somewhere. Could you elaborate, and perhaps actually show us some numbers on this? Perhaps instead of adding a completely new model, we can adapt the existing ones somehow. But, without some cold, hard, data, we mere mortals without the 1024-way machines can only guess. ;) -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 21:56 ` Dave Hansen @ 2007-04-02 22:29 ` Christoph Lameter 2007-04-02 22:31 ` Andi Kleen 1 sibling, 0 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 22:29 UTC (permalink / raw) To: Dave Hansen Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Dave Hansen wrote: > On Mon, 2007-04-02 at 14:28 -0700, Christoph Lameter wrote: > > I do not care what its called as long as it > > covers all the bases and is not a glaring performance regresssion (like > > SPARSEMEM so far). > > I honestly don't doubt that there are regressions, somewhere. Could you > elaborate, and perhaps actually show us some numbers on this? Perhaps > instead of adding a completely new model, we can adapt the existing ones > somehow. Just look at pfn_to_page and friends on ia64 and see what atrocities sparsemem does with those if you enable it. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 21:56 ` Dave Hansen 2007-04-02 22:29 ` Christoph Lameter @ 2007-04-02 22:31 ` Andi Kleen 2007-04-02 22:37 ` Christoph Lameter 2007-04-05 12:07 ` Andy Whitcroft 1 sibling, 2 replies; 46+ messages in thread From: Andi Kleen @ 2007-04-02 22:31 UTC (permalink / raw) To: Dave Hansen Cc: Christoph Lameter, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Monday 02 April 2007 23:56:08 Dave Hansen wrote: > On Mon, 2007-04-02 at 14:28 -0700, Christoph Lameter wrote: > > I do not care what its called as long as it > > covers all the bases and is not a glaring performance regresssion (like > > SPARSEMEM so far). > > I honestly don't doubt that there are regressions, somewhere. Could you > elaborate, and perhaps actually show us some numbers on this? Perhaps > instead of adding a completely new model, we can adapt the existing ones > somehow. If it works I would be inclined to replaced old sparsemem with Christoph's new one on x86-64. Perhaps that could cut down the bewildering sparsemem ifdef jungle that is there currently. But I presume it won't work on 32bit because of the limited address space? > But, without some cold, hard, data, we mere mortals without the 1024-way > machines can only guess. ;) Yep. -Andi (who would be scared of a 1024 way x86 machine) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 22:31 ` Andi Kleen @ 2007-04-02 22:37 ` Christoph Lameter 2007-04-02 22:41 ` Martin Bligh 2007-04-05 12:07 ` Andy Whitcroft 1 sibling, 1 reply; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 22:37 UTC (permalink / raw) To: Andi Kleen Cc: Dave Hansen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Tue, 3 Apr 2007, Andi Kleen wrote: > If it works I would be inclined to replaced old sparsemem with Christoph's > new one on x86-64. Perhaps that could cut down the bewildering sparsemem > ifdef jungle that is there currently. > > But I presume it won't work on 32bit because of the limited address space? Not in general but it will work in non PAE mode. 4GB need 2^(32-21+4) = 16MB pages. This would require the mapping on 4 4MB pages. For 64GB you'd need 256M which would be a quarter of low mem. Probably takes up too much of low mem. > > But, without some cold, hard, data, we mere mortals without the 1024-way > > machines can only guess. ;) > > Yep. > > -Andi (who would be scared of a 1024 way x86 machine) - Christoph (who does not believe that Andi has to wait long for something bigger) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 22:37 ` Christoph Lameter @ 2007-04-02 22:41 ` Martin Bligh 2007-04-02 22:49 ` Christoph Lameter 0 siblings, 1 reply; 46+ messages in thread From: Martin Bligh @ 2007-04-02 22:41 UTC (permalink / raw) To: Christoph Lameter Cc: Andi Kleen, Dave Hansen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki Christoph Lameter wrote: > On Tue, 3 Apr 2007, Andi Kleen wrote: > >> If it works I would be inclined to replaced old sparsemem with Christoph's >> new one on x86-64. Perhaps that could cut down the bewildering sparsemem >> ifdef jungle that is there currently. >> >> But I presume it won't work on 32bit because of the limited address space? > > Not in general but it will work in non PAE mode. 4GB need 2^(32-21+4) = > 16MB pages. This would require the mapping on 4 4MB pages. IIRC NUMA isn't supported (or useful anyway) without PAE. > For 64GB you'd need 256M which would be a quarter of low mem. Probably > takes up too much of low mem. Yup. M. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 22:41 ` Martin Bligh @ 2007-04-02 22:49 ` Christoph Lameter 2007-04-02 22:52 ` Martin Bligh 0 siblings, 1 reply; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 22:49 UTC (permalink / raw) To: Martin Bligh Cc: Andi Kleen, Dave Hansen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Martin Bligh wrote: > > For 64GB you'd need 256M which would be a quarter of low mem. Probably takes > > up too much of low mem. > > Yup. We could move whatever you currently use to handle that into i386 arch code. Or are there other platforms that do similar tricks with highmem? We already have special hooks for node lookups in sparsemem. Move all of that off into some arch dir? -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 22:49 ` Christoph Lameter @ 2007-04-02 22:52 ` Martin Bligh 0 siblings, 0 replies; 46+ messages in thread From: Martin Bligh @ 2007-04-02 22:52 UTC (permalink / raw) To: Christoph Lameter Cc: Andi Kleen, Dave Hansen, linux-kernel, linux-arch, linux-mm, KAMEZAWA Hiroyuki Christoph Lameter wrote: > On Mon, 2 Apr 2007, Martin Bligh wrote: > >>> For 64GB you'd need 256M which would be a quarter of low mem. Probably takes >>> up too much of low mem. >> Yup. > > We could move whatever you currently use to handle that into i386 arch > code. Or are there other platforms that do similar tricks with highmem? > > We already have special hooks for node lookups in sparsemem. Move all of > that off into some arch dir? Well, all I did was basically an early vmalloc kind of thing. You only need to allocate enough virtual space for how much memory you actually *have*, not the full set. The problem on i386 is that you just need to reserve that space early, in order to shuffle everything else into fit. It's messy, but not hard. M. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 22:31 ` Andi Kleen 2007-04-02 22:37 ` Christoph Lameter @ 2007-04-05 12:07 ` Andy Whitcroft 1 sibling, 0 replies; 46+ messages in thread From: Andy Whitcroft @ 2007-04-05 12:07 UTC (permalink / raw) To: Andi Kleen Cc: Dave Hansen, Christoph Lameter, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki Andi Kleen wrote: > On Monday 02 April 2007 23:56:08 Dave Hansen wrote: >> On Mon, 2007-04-02 at 14:28 -0700, Christoph Lameter wrote: >>> I do not care what its called as long as it >>> covers all the bases and is not a glaring performance regresssion (like >>> SPARSEMEM so far). >> I honestly don't doubt that there are regressions, somewhere. Could you >> elaborate, and perhaps actually show us some numbers on this? Perhaps >> instead of adding a completely new model, we can adapt the existing ones >> somehow. > > If it works I would be inclined to replaced old sparsemem with Christoph's > new one on x86-64. Perhaps that could cut down the bewildering sparsemem > ifdef jungle that is there currently. > > But I presume it won't work on 32bit because of the limited address space? Right. But we might be able to do switch SPARSEMEM_EXTREME users here if performance is better and no other regressions are detected. There seems to be a theme, we need to get some numbers. I will try and get what I can with the hardware I have and see whats missing. > >> But, without some cold, hard, data, we mere mortals without the 1024-way >> machines can only guess. ;) -apw -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 21:28 ` Christoph Lameter 2007-04-02 21:43 ` Martin Bligh 2007-04-02 21:56 ` Dave Hansen @ 2007-04-04 21:27 ` Bob Picco 2007-04-04 22:38 ` Christoph Lameter 2 siblings, 1 reply; 46+ messages in thread From: Bob Picco @ 2007-04-04 21:27 UTC (permalink / raw) To: Christoph Lameter Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki Christoph Lameter wrote: [Mon Apr 02 2007, 05:28:30PM EDT] > On Mon, 2 Apr 2007, Dave Hansen wrote: > > > On Mon, 2007-04-02 at 13:30 -0700, Christoph Lameter wrote: > > > On Mon, 2 Apr 2007, Dave Hansen wrote: > > > > I completely agree, it looks like it should be faster. The code > > > > certainly has potential benefits. But, to add this neato, apparently > > > > more performant feature, we unfortunately have to add code. Adding the > > > > code has a cost: code maintenance. This isn't a runtime cost, but it is > > > > a real, honest to goodness tradeoff. > > > > > > Its just the opposite. The vmemmap code is so efficient that we can remove > > > lots of other code and gops of these alternate implementations. > > > > We do want to make sure that there isn't anyone relying on these. Are > > you thinking of simple sparsemem vs. extreme vs. sparsemem vmemmap? Or, > > are you thinking of sparsemem vs. discontig? > > I am thinking sparsemem default and then get rid discontig, flatmem etc. > On many platforms this will work. Flatmem for embedded could just be a > variation on sparse_virtual. > > > Amen, brother. I'd love to see DISCONTIG die, with sufficient testing, > > of course. Andi, do you have any ideas on how to get sparsemem out of > > the 'experimental' phase? > > Note that these arguments on DISCONTIG are flame bait for many SGIers. > We usually see this as an attack on DISCONTIG/VMEMMAP which is the > existing best performing implementation for page_to_pfn and vice > versa. Please lets stop the polarization. 
We want one consistent scheme > to manage memory everywhere. I do not care what its called as long as it > covers all the bases and is not a glaring performance regresssion (like > SPARSEMEM so far). Well you must have forgotten about these two postings in regards to performance numbers: http://marc.info/?l=linux-ia64&m=111990276501051&w=2 and http://marc.info/?l=linux-kernel&m=116664638611634&w=2 . I took your first patchset and ran some numbers on an amd64 machine with two dual core sockets and 4Gb of memory. More iterations should be done and perhaps larger number of tasks. The aim7 numbers are below. bob 2.6.21-rc5+sparsemem Benchmark Version Machine Run Date AIM Multiuser Benchmark - Suite VII "1.1" rcc5 Apr 2 05:04:33 2007 Tasks Jobs/Min JTI Real CPU Jobs/sec/task 1 13.8 100 421.3 2.2 0.2303 101 527.8 97 1113.8 111.5 0.0871 201 565.0 97 2070.6 222.7 0.0468 301 570.9 96 3068.7 334.7 0.0316 401 573.0 97 4072.7 445.6 0.0238 501 583.3 99 4998.5 558.6 0.0194 601 583.8 99 5991.1 672.9 0.0162 2.6.21-rc5+sparsemem+patchset Benchmark Version Machine Run Date AIM Multiuser Benchmark - Suite VII "1.1" vmem Apr 4 02:22:24 2007 Tasks Jobs/Min JTI Real CPU Jobs/sec/task 1 13.7 100 424.0 2.1 0.2288 101 500.3 97 1175.0 112.0 0.0826 201 554.2 97 2111.0 223.6 0.0460 301 578.5 97 3028.3 334.9 0.0320 401 586.2 97 3981.3 448.1 0.0244 501 584.2 99 4990.8 561.8 0.0194 601 584.4 98 5985.2 675.5 0.0162 > > > I have noticed before that sparsemem should be able to cover the flatmem > > case if we make MAX_PHYSMEM_BITS == SECTION_SIZE_BITS and massage from > > there. > > Right. But for embedded the memorymap base cannot be constant because > they may not be able to have a fixed address in memory. So memory map > needs to become a variable. > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . 
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-04 21:27 ` Bob Picco @ 2007-04-04 22:38 ` Christoph Lameter 0 siblings, 0 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-04 22:38 UTC (permalink / raw) To: Bob Picco Cc: Dave Hansen, Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Wed, 4 Apr 2007, Bob Picco wrote: > Well you must have forgotten about these two postings in regards to > performance numbers: > http://marc.info/?l=linux-ia64&m=111990276501051&w=2 > and > http://marc.info/?l=linux-kernel&m=116664638611634&w=2 I am well aware of those but those were done with a PAGE_SIZE vmemmap which is particularly bad on IA64 given the TLB fault overhead. You eliminated the TLB fault overhead. Virtual Memmaps need to be designed in such a way that they do not create additional overhead. The x86_64 version here has no such overhead that you could eliminate with lookup tables. The bad thing is that this benchmark then was used to justify sparsemem on other platforms where such overhead does not exist. One needs to be careful with benchmarks..... Its better to review the code. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-02 15:37 ` Christoph Lameter 2007-04-02 15:44 ` Andi Kleen 2007-04-02 20:13 ` Dave Hansen @ 2007-04-02 23:03 ` Bjorn Helgaas 2 siblings, 0 replies; 46+ messages in thread From: Bjorn Helgaas @ 2007-04-02 23:03 UTC (permalink / raw) To: Christoph Lameter Cc: Andi Kleen, linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Monday 02 April 2007 09:37, Christoph Lameter wrote: > On Sun, 1 Apr 2007, Andi Kleen wrote: > > Hmm, this means there is at least 2MB worth of struct page on every node? > > Or do you have overlaps with other memory (I think you have) > > In that case you have to handle the overlap in change_page_attr() > > Correct. 2MB worth of struct page is 128 mb of memory. Are there nodes > with smaller amounts of memory? Do you deal with max_addr= and mem=? RHEL4 (2.6.9) blows up if max_addr= happens to leave you with CPU-only nodes. So hopefully you can deal with arbitrary-sized nodes caused by max_addr= or mem=. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-01 10:46 ` Andi Kleen 2007-04-02 15:37 ` Christoph Lameter @ 2007-04-02 15:44 ` Christoph Lameter 2007-04-02 16:18 ` Christoph Lameter 2 siblings, 0 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 15:44 UTC (permalink / raw) To: Andi Kleen Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Sun, 1 Apr 2007, Andi Kleen wrote: > And when you reserve virtual space somewhere you should > update Documentation/x86_64/mm.txt. Also you didn't adjust > the end of the vmalloc area so in theory vmalloc could run > into your vmemmap. Ok. will add to the doc in the next release. No need to adjust the end of the vmalloc area because the vmemmap starts at the end of it: include/asm-x86_64/pgtable.h: #define VMALLOC_START 0xffffc20000000000UL #define VMALLOC_END 0xffffe1ffffffffffUL Index: linux-2.6.21-rc5-mm2/include/asm-x86_64/page.h #define vmemmap ((struct page *)0xffffe20000000000UL) According to Documentation/x86_64/mm.txt this is an unused hole: ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space ... unused hole ... ffffffff80000000 - ffffffff82800000 (=40 MB) kernel text mapping, from phys 0 ... unused hole ... -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL 2007-04-01 10:46 ` Andi Kleen 2007-04-02 15:37 ` Christoph Lameter 2007-04-02 15:44 ` Christoph Lameter @ 2007-04-02 16:18 ` Christoph Lameter 2 siblings, 0 replies; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 16:18 UTC (permalink / raw) To: Andi Kleen Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, KAMEZAWA Hiroyuki On Sun, 1 Apr 2007, Andi Kleen wrote: > Or do you have overlaps with other memory (I think you have) We may get into a case where some page structs are physically located on other nodes if there are no holes between nodes. This would be particularly significant for 64MB node sizes using numa emulation. But there it really does not matter where the real memory is located. (this is really inherent in sparsemems way of mapping memory). > In that case you have to handle the overlap in change_page_attr() Why would we use change_page_attr on page structs? I can see that the pages themselves should be subject to change_page_attr but why the page_structs? I think the system would panic if one would set page structs to read only. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM 2007-04-01 7:10 [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Christoph Lameter 2007-04-01 7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter @ 2007-04-02 20:50 ` Dave Hansen 2007-04-02 21:00 ` Christoph Lameter 1 sibling, 1 reply; 46+ messages in thread From: Dave Hansen @ 2007-04-02 20:50 UTC (permalink / raw) To: Christoph Lameter Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki First of all, nice set of patches. On Sat, 2007-03-31 at 23:10 -0800, Christoph Lameter wrote: > --- linux-2.6.21-rc5-mm2.orig/include/asm-generic/memory_model.h 2007-03-31 22:47:14.000000000 -0700 > +++ linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h 2007-03-31 22:59:35.000000000 -0700 > @@ -47,6 +47,13 @@ > }) > > #elif defined(CONFIG_SPARSEMEM) > +#ifdef CONFIG_SPARSE_VIRTUAL > +/* > + * We have a virtual memmap that makes lookups very simple > + */ > +#define __pfn_to_page(pfn) (vmemmap + (pfn)) > +#define __page_to_pfn(page) ((page) - vmemmap) > +#else > /* > * Note: section's mem_map is encorded to reflect its start_pfn. > * section[i].section_mem_map == mem_map's address - start_pfn; > @@ -62,6 +69,7 @@ > struct mem_section *__sec = __pfn_to_section(__pfn); \ > __section_mem_map_addr(__sec) + __pfn; \ > }) > +#endif > #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ Any chance this can be done without embedding this inside another #ifdef? I really hate untangling the mess when an #endif goes missing. Any reason this can't just be another #elif? 
> #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE > Index: linux-2.6.21-rc5-mm2/mm/sparse.c > =================================================================== > --- linux-2.6.21-rc5-mm2.orig/mm/sparse.c 2007-03-31 22:47:14.000000000 -0700 > +++ linux-2.6.21-rc5-mm2/mm/sparse.c 2007-03-31 22:59:35.000000000 -0700 > @@ -9,6 +9,7 @@ > #include <linux/spinlock.h> > #include <linux/vmalloc.h> > #include <asm/dma.h> > +#include <asm/pgalloc.h> > > /* > * Permanent SPARSEMEM data: > @@ -101,7 +102,7 @@ static inline int sparse_index_init(unsi > > /* > * Although written for the SPARSEMEM_EXTREME case, this happens > - * to also work for the flat array case becase > + * to also work for the flat array case because > * NR_SECTION_ROOTS==NR_MEM_SECTIONS. > */ > int __section_nr(struct mem_section* ms) > @@ -211,6 +212,90 @@ static int sparse_init_one_section(struc > return 1; > } > > +#ifdef CONFIG_SPARSE_VIRTUAL > + > +void *vmemmap_alloc_block(unsigned long size, int node) > +{ > + if (slab_is_available()) { > + struct page *page = > + alloc_pages_node(node, GFP_KERNEL, > + get_order(size)); > + > + BUG_ON(!page); > + return page_address(page); > + } else > + return __alloc_bootmem_node(NODE_DATA(node), size, size, > + __pa(MAX_DMA_ADDRESS)); > +} Hmmmmmmm. Can we combine this with sparse_index_alloc()? Also, why not just use the slab for this? Let's get rid of the _block() part, too. I'm not sure it does any good. At least make it _bytes() so that we know what the units are. Also, if you're just going to round up internally and _not_ use the slab, can you just make the argument in pages, or even order? Can you think of any times when we'd want that BUG_ON() to be a WARN_ON(), instead? I can see preferring having my mem_map[] on the wrong node than hitting a BUG(). > +#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP > +/* > + * Virtual memmap populate functionality for architectures that support > + * PMDs for huge pages like i386, x86_64 etc. 
> + */ How about: /* * Virtual memmap support for architectures that use Linux pagetables * natively in hardware, and support mapping huge pages with PMD * entries. */ It wouldn't make sense to map the vmemmap area with Linux pagetables on an arch that didn't use them in hardware, right? So, perhaps this doesn't quite belong in mm/sparse.c. Perhaps we need arch/x86/sparse.c. ;) > +static void vmemmap_pop_pmd(pud_t *pud, unsigned long addr, > + unsigned long end, int node) > +{ > + pmd_t *pmd; > + > + end = pmd_addr_end(addr, end); > + > + for (pmd = pmd_offset(pud, addr); addr < end; > + pmd++, addr += PMD_SIZE) { > + if (pmd_none(*pmd)) { > + void *block; > + pte_t pte; > + > + block = vmemmap_alloc_block(PMD_SIZE, node); > + pte = pfn_pte(__pa(block) >> PAGE_SHIFT, > + PAGE_KERNEL); > + pte_mkdirty(pte); > + pte_mkwrite(pte); > + pte_mkyoung(pte); > + mk_pte_huge(pte); > + set_pmd(pmd, __pmd(pte_val(pte))); > + } > + } > +} Nitpick: I think this would look quite a bit neater with a little less indentation. How about making the loop start with if (!pmd_none(*pmd)) continue; It should bring the rest of the code in a bit and make that long line more readable. > +static void vmemmap_pop_pud(pgd_t *pgd, unsigned long addr, > + unsigned long end, int node) > +{ > + pud_t *pud; > + > + end = pud_addr_end(addr, end); > + for (pud = pud_offset(pgd, addr); addr < end; > + pud++, addr += PUD_SIZE) { > + > + if (pud_none(*pud)) > + pud_populate(&init_mm, pud, > + vmemmap_alloc_block(PAGE_SIZE, node)); > + > + vmemmap_pop_pmd(pud, addr, end, node); > + } > +} > + > +static void vmemmap_populate(struct page *start_page, unsigned long nr, > + int node) > +{ > + pgd_t *pgd; > + unsigned long addr = (unsigned long)(start_page); > + unsigned long end = pgd_addr_end(addr, > + (unsigned long)((start_page + nr))); There appear to be a few extra parentheses on these lines. 
> + for (pgd = pgd_offset_k(addr); addr < end; > + pgd++, addr += PGDIR_SIZE) { > + > + if (pgd_none(*pgd)) > + pgd_populate(&init_mm, pgd, > + vmemmap_alloc_block(PAGE_SIZE, node)); > + vmemmap_pop_pud(pgd, addr, end, node); > + } > +} > +#endif > +#endif /* CONFIG_SPARSE_VIRTUAL */ We don't really need these #ifdefs embedded inside of each other, either, right? Kconfig should take care of enforcing the dependency. > static struct page *sparse_early_mem_map_alloc(unsigned long pnum) > { > struct page *map; > @@ -221,8 +306,13 @@ static struct page *sparse_early_mem_map > if (map) > return map; > > +#ifdef CONFIG_SPARSE_VIRTUAL > + map = pfn_to_page(pnum * PAGES_PER_SECTION); > + vmemmap_populate(map, PAGES_PER_SECTION, nid); > +#else > map = alloc_bootmem_node(NODE_DATA(nid), > sizeof(struct page) * PAGES_PER_SECTION); > +#endif We really worked hard to keep #ifdefs out of the code flow in that file and keep it as clean as possible. Could we hide this behind a helper? map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) return map; + map = alloc_vmemmap(map, PAGES_PER_SECTION, nid); + if (map) + return map; + map = alloc_bootmem_node(NODE_DATA(nid), sizeof(struct page) * PAGES_PER_SECTION); if (map) return map; Then, do whatever magic you want in alloc_vmemmap(). -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM 2007-04-02 20:50 ` [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Dave Hansen @ 2007-04-02 21:00 ` Christoph Lameter 2007-04-02 21:22 ` Dave Hansen 0 siblings, 1 reply; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 21:00 UTC (permalink / raw) To: Dave Hansen Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Dave Hansen wrote: > First of all, nice set of patches. > > On Sat, 2007-03-31 at 23:10 -0800, Christoph Lameter wrote: > > --- linux-2.6.21-rc5-mm2.orig/include/asm-generic/memory_model.h 2007-03-31 22:47:14.000000000 -0700 > > +++ linux-2.6.21-rc5-mm2/include/asm-generic/memory_model.h 2007-03-31 22:59:35.000000000 -0700 > > @@ -47,6 +47,13 @@ > > }) > > > > #elif defined(CONFIG_SPARSEMEM) > > +#ifdef CONFIG_SPARSE_VIRTUAL > > +/* > > + * We have a virtual memmap that makes lookups very simple > > + */ > > +#define __pfn_to_page(pfn) (vmemmap + (pfn)) > > +#define __page_to_pfn(page) ((page) - vmemmap) > > +#else > > /* > > * Note: section's mem_map is encorded to reflect its start_pfn. > > * section[i].section_mem_map == mem_map's address - start_pfn; > > @@ -62,6 +69,7 @@ > > struct mem_section *__sec = __pfn_to_section(__pfn); \ > > __section_mem_map_addr(__sec) + __pfn; \ > > }) > > +#endif > > #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ > > Any chance this can be done without embedding this inside another > #ifdef? I really hate untangling the mess when an #endif goes > missing. > > Any reason this can't just be another #elif? Sure. > > + } else > > + return __alloc_bootmem_node(NODE_DATA(node), size, size, > > + __pa(MAX_DMA_ADDRESS)); > > +} > > Hmmmmmmm. Can we combine this with sparse_index_alloc()? Also, why not > just use the slab for this? Use a slab for page sized allocations? No. > Let's get rid of the _block() part, too. I'm not sure it does any good. 
> At least make it _bytes() so that we know what the units are. Also, if > you're just going to round up internally and _not_ use the slab, can you > just make the argument in pages, or even order? Its used for page sized allocations. > Can you think of any times when we'd want that BUG_ON() to be a > WARN_ON(), instead? I can see preferring having my mem_map[] on the > wrong node than hitting a BUG(). We should probably have some error handling there instead of the BUG. > > +#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP > > +/* > > + * Virtual memmap populate functionality for architectures that support > > + * PMDs for huge pages like i386, x86_64 etc. > > + */ > > How about: > > /* > * Virtual memmap support for architectures that use Linux pagetables > * natively in hardware, and support mapping huge pages with PMD > * entries. > */ > > It wouldn't make sense to map the vmemmap area with Linux pagetables on > an arch that didn't use them in hardware, right? So, perhaps this > doesn't quite belong in mm/sparse.c. Perhaps we need > arch/x86/sparse.c. ;) I just extended this in V2 to also work on IA64. Its pretty generic. > > +static void vmemmap_pop_pmd(pud_t *pud, unsigned long addr, > > + unsigned long end, int node) > > +{ > > + pmd_t *pmd; > > + > > + end = pmd_addr_end(addr, end); > > + > > + for (pmd = pmd_offset(pud, addr); addr < end; > > + pmd++, addr += PMD_SIZE) { > > + if (pmd_none(*pmd)) { > > + void *block; > > + pte_t pte; > > + > > + block = vmemmap_alloc_block(PMD_SIZE, node); > > + pte = pfn_pte(__pa(block) >> PAGE_SHIFT, > > + PAGE_KERNEL); > > + pte_mkdirty(pte); > > + pte_mkwrite(pte); > > + pte_mkyoung(pte); > > + mk_pte_huge(pte); > > + set_pmd(pmd, __pmd(pte_val(pte))); > > + } > > + } > > +} > > Nitpick: I think this would look quite a bit neater with a little less > indentation. > > How about making the loop start with > > if (!pmd_none(*pmd)) > continue; > > It should bring the rest of the code in a bit and make that long line > more readable. 
V2 has the pmd setup separated out in a separate function. > > +{ > > + pgd_t *pgd; > > + unsigned long addr = (unsigned long)(start_page); > > + unsigned long end = pgd_addr_end(addr, > > + (unsigned long)((start_page + nr))); > > There appear to be a few extra parentheses on these lines. Right. Fixed. > > + for (pgd = pgd_offset_k(addr); addr < end; > > + pgd++, addr += PGDIR_SIZE) { > > + > > + if (pgd_none(*pgd)) > > + pgd_populate(&init_mm, pgd, > > + vmemmap_alloc_block(PAGE_SIZE, node)); > > + vmemmap_pop_pud(pgd, addr, end, node); > > + } > > +} > > +#endif > > +#endif /* CONFIG_SPARSE_VIRTUAL */ > > We don't really need these #ifdefs embedded inside of each other, > either, right? Kconfig should take care of enforcing the dependency. Ok. > > +#ifdef CONFIG_SPARSE_VIRTUAL > > + map = pfn_to_page(pnum * PAGES_PER_SECTION); > > + vmemmap_populate(map, PAGES_PER_SECTION, nid); > > +#else > > map = alloc_bootmem_node(NODE_DATA(nid), > > sizeof(struct page) * PAGES_PER_SECTION); > > +#endif > > We really worked hard to keep #ifdefs out of the code flow in that file > and keep it as clean as possible. Could we hide this behind a helper? This is a major difference in how sparsemem works. I think this needs to stay. > > map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); > if (map) > return map; > > + map = alloc_vmemmap(map, PAGES_PER_SECTION, nid); > + if (map) > + return map; > + > map = alloc_bootmem_node(NODE_DATA(nid), > sizeof(struct page) * PAGES_PER_SECTION); > if (map) > return map; > > Then, do whatever magic you want in alloc_vmemmap(). That would break if alloc_vmemmap returns NULL because it cannot allocate memory. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM 2007-04-02 21:00 ` Christoph Lameter @ 2007-04-02 21:22 ` Dave Hansen 2007-04-02 21:31 ` Christoph Lameter 0 siblings, 1 reply; 46+ messages in thread From: Dave Hansen @ 2007-04-02 21:22 UTC (permalink / raw) To: Christoph Lameter Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki On Mon, 2007-04-02 at 14:00 -0700, Christoph Lameter wrote: > On Mon, 2 Apr 2007, Dave Hansen wrote: > > > + } else > > > + return __alloc_bootmem_node(NODE_DATA(node), size, size, > > > + __pa(MAX_DMA_ADDRESS)); > > > +} > > > > Hmmmmmmm. Can we combine this with sparse_index_alloc()? Also, why not > > just use the slab for this? > > Use a slab for page sized allocations? No. Why not? We use it above for sparse_index_alloc() and if it is doing something wrong, I'd love to fix it. Can you elaborate? > > Let's get rid of the _block() part, too. I'm not sure it does any good. > > At least make it _bytes() so that we know what the units are. Also, if > > you're just going to round up internally and _not_ use the slab, can you > > just make the argument in pages, or even order? > > Its used for page sized allocations. Ok, then let's make it take pages of some kind or its argument. > > Can you think of any times when we'd want that BUG_ON() to be a > > WARN_ON(), instead? I can see preferring having my mem_map[] on the > > wrong node than hitting a BUG(). > > We should probably have some error handling there instead of the BUG. > > > > +#ifndef ARCH_POPULATES_VIRTUAL_MEMMAP > > > +/* > > > + * Virtual memmap populate functionality for architectures that support > > > + * PMDs for huge pages like i386, x86_64 etc. > > > + */ > > > > How about: > > > > /* > > * Virtual memmap support for architectures that use Linux pagetables > > * natively in hardware, and support mapping huge pages with PMD > > * entries. 
> > */ > > > > It wouldn't make sense to map the vmemmap area with Linux pagetables on > > an arch that didn't use them in hardware, right? So, perhaps this > > doesn't quite belong in mm/sparse.c. Perhaps we need > > arch/x86/sparse.c. ;) > > I just extended this in V2 to also work on IA64. Its pretty generic. Can you extend it to work on ppc? ;) You haven't posted V2, right? > > > > map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); > > if (map) > > return map; > > > > + map = alloc_vmemmap(map, PAGES_PER_SECTION, nid); > > + if (map) > > + return map; > > + > > map = alloc_bootmem_node(NODE_DATA(nid), > > sizeof(struct page) * PAGES_PER_SECTION); > > if (map) > > return map; > > > > Then, do whatever magic you want in alloc_vmemmap(). > > That would break if alloc_vmemmap returns NULL because it cannot allocate > memory. OK, that makes sense. However, it would still be nice to hide that #ifdef somewhere that people are a bit less likely to run into it. It's just one #ifdef, so if you can kill it, great. Otherwise, they pile up over time and _do_ cause real readability problems. -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM 2007-04-02 21:22 ` Dave Hansen @ 2007-04-02 21:31 ` Christoph Lameter 2007-04-02 21:42 ` Dave Hansen 0 siblings, 1 reply; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 21:31 UTC (permalink / raw) To: Dave Hansen Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Dave Hansen wrote: > > > Hmmmmmmm. Can we combine this with sparse_index_alloc()? Also, why not > > > just use the slab for this? > > > > Use a slab for page sized allocations? No. > > Why not? We use it above for sparse_index_alloc() and if it is doing > something wrong, I'd love to fix it. Can you elaborate? The slab allocator purposes is to deliver small sub page sized chunks. The page allocator is there to allocate pages. Both are optimized for its purpose. > > I just extended this in V2 to also work on IA64. Its pretty generic. > > Can you extend it to work on ppc? ;) I do not know enough about how ppc handles large pages. > You haven't posted V2, right? No tests are still running. > > > Then, do whatever magic you want in alloc_vmemmap(). > > > > That would break if alloc_vmemmap returns NULL because it cannot allocate > > memory. > > OK, that makes sense. However, it would still be nice to hide that > #ifdef somewhere that people are a bit less likely to run into it. It's > just one #ifdef, so if you can kill it, great. Otherwise, they pile up > over time and _do_ cause real readability problems. Well think about how to handle the case that the allocation of a page table page or a vmemmap block fails. Once we have that sorted out then we can cleanup the higher layers. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM 2007-04-02 21:31 ` Christoph Lameter @ 2007-04-02 21:42 ` Dave Hansen 2007-04-02 21:53 ` Christoph Lameter 0 siblings, 1 reply; 46+ messages in thread From: Dave Hansen @ 2007-04-02 21:42 UTC (permalink / raw) To: Christoph Lameter Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki On Mon, 2007-04-02 at 14:31 -0700, Christoph Lameter wrote: > On Mon, 2 Apr 2007, Dave Hansen wrote: > > > > > Hmmmmmmm. Can we combine this with sparse_index_alloc()? Also, why not > > > > just use the slab for this? > > > > > > Use a slab for page sized allocations? No. > > > > Why not? We use it above for sparse_index_alloc() and if it is doing > > something wrong, I'd love to fix it. Can you elaborate? > > The slab allocator purposes is to deliver small sub page sized chunks. > The page allocator is there to allocate pages. Both are optimized for its > purpose. I understand that, in general, but how does that optimization help, here, exactly? My argument is that if we use the slab, it means more code sharing with existing code in sparse.c. Other than ideology, what practical reasons are there in this case that keep us from doing that otherwise attractive code sharing. > > > I just extended this in V2 to also work on IA64. Its pretty generic. > > > > Can you extend it to work on ppc? ;) > > I do not know enough about how ppc handles large pages. I don't think the pagetable walks are generic enough to ever get used on ppc, unless they start walking the Linux pagetables for the kernel virtual address area. I was trying to poke you into getting the pagetable walks out of sparse.c. ;) > > > > Then, do whatever magic you want in alloc_vmemmap(). > > > > > > That would break if alloc_vmemmap returns NULL because it cannot allocate > > > memory. > > > > OK, that makes sense. However, it would still be nice to hide that > > #ifdef somewhere that people are a bit less likely to run into it. 
It's > > just one #ifdef, so if you can kill it, great. Otherwise, they pile up > > over time and _do_ cause real readability problems. > > Well think about how to handle the case that the allocatiopn of a page > table page or a vmemmap block fails. Once we have that sorted out then we > can cleanup the higher layers. I think it is best to just completely replace sparse_early_mem_map_alloc() for the vmemmap case. It really is a completely different beast. You'd never, for instance, have alloc_remap() come into play. -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM 2007-04-02 21:42 ` Dave Hansen @ 2007-04-02 21:53 ` Christoph Lameter 2007-04-02 22:05 ` Dave Hansen 0 siblings, 1 reply; 46+ messages in thread From: Christoph Lameter @ 2007-04-02 21:53 UTC (permalink / raw) To: Dave Hansen Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki On Mon, 2 Apr 2007, Dave Hansen wrote: > > The slab allocator purposes is to deliver small sub page sized chunks. > > The page allocator is there to allocate pages. Both are optimized for its > > purpose. > > I understand that, in general, but how does that optimization help, > here, exactly? My argument is that if we use the slab, it means more > code sharing with existing code in sparse.c. Other than ideology, what > practical reasons are there in this case that keep us from doing that > otherwise attractive code sharing. F.e. you are wasting memory that the slab needs to uselessly track these allocations. You would need to create a special slab that has page sized (or huge page sized) alignment to have the proper allocation behavior. Not good. The rest of sparse is not MMU bound. So you may be fine. I'd recommend though to use the page allocator if you are doing large allocations. I do not see the point of using slab there. > I don't think the pagetable walks are generic enough to ever get used on > ppc, unless they start walking the Linux pagetables for the kernel > virtual address area. I was trying to poke you into getting the > pagetable walks out of sparse.c. ;) If I would be doing that then we would end up adding these pagetable walks to multiple architectures. I already need to cover IA64 and x86_64 and this will also do i386. Lets try to keep them generic. PPC may need to disable these walks and do its own thing. > > Well think about how to handle the case that the allocatiopn of a page > > table page or a vmemmap block fails. 
Once we have that sorted out then we > > can cleanup the higher layers. > > I think it is best to just completely replace > sparse_early_mem_map_alloc() for the vmemmap case. It really is a > completely different beast. You'd never, for instance, have > alloc_remap() come into play. What is the purpose of alloc_remap? Could not figure that one out. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM 2007-04-02 21:53 ` Christoph Lameter @ 2007-04-02 22:05 ` Dave Hansen 0 siblings, 0 replies; 46+ messages in thread From: Dave Hansen @ 2007-04-02 22:05 UTC (permalink / raw) To: Christoph Lameter Cc: linux-kernel, linux-arch, Martin Bligh, linux-mm, Andi Kleen, KAMEZAWA Hiroyuki On Mon, 2007-04-02 at 14:53 -0700, Christoph Lameter wrote: > > > Well think about how to handle the case that the allocatiopn of a page > > > table page or a vmemmap block fails. Once we have that sorted out then we > > > can cleanup the higher layers. > > > > I think it is best to just completely replace > > sparse_early_mem_map_alloc() for the vmemmap case. It really is a > > completely different beast. You'd never, for instance, have > > alloc_remap() come into play. > > What is the purpose of alloc_remap? Could not figure that one out. That's what we use on i386 to get some lowmem area for non-zero NUMA nodes. Otherwise, all of ZONE_NORMAL is on node 0. It's a bit hokey, and stuff like virt_to_phys() probably doesn't work on it, but it has worked pretty well for a long time. -- Dave -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 46+ messages in thread
end of thread, other threads:[~2007-04-09 18:20 UTC | newest] Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2007-04-01 7:10 [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Christoph Lameter 2007-04-01 7:10 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Christoph Lameter 2007-04-01 7:11 ` Christoph Lameter 2007-04-01 10:46 ` Andi Kleen 2007-04-02 15:37 ` Christoph Lameter 2007-04-02 15:44 ` Andi Kleen 2007-04-02 15:51 ` Martin J. Bligh 2007-04-02 15:54 ` Christoph Lameter 2007-04-02 17:14 ` Andi Kleen 2007-04-02 17:34 ` Christoph Lameter 2007-04-02 19:54 ` Dave Hansen 2007-04-02 20:11 ` Christoph Lameter 2007-04-02 20:13 ` Dave Hansen 2007-04-02 20:30 ` Christoph Lameter 2007-04-02 20:38 ` Martin Bligh 2007-04-02 20:51 ` Christoph Lameter 2007-04-05 11:50 ` Andy Whitcroft 2007-04-05 18:27 ` Christoph Lameter 2007-04-07 22:06 ` [PATCH 1/4] x86_64: (SPARSE_VIRTUAL doubles sparsemem speed) Christoph Lameter 2007-04-07 22:18 ` Andi Kleen 2007-04-09 16:40 ` William Lee Irwin III 2007-04-09 17:16 ` Christoph Lameter 2007-04-09 18:20 ` William Lee Irwin III 2007-04-02 21:08 ` [PATCH 1/4] x86_64: Switch to SPARSE_VIRTUAL Dave Hansen 2007-04-02 21:28 ` Christoph Lameter 2007-04-02 21:43 ` Martin Bligh 2007-04-02 21:56 ` Dave Hansen 2007-04-02 22:29 ` Christoph Lameter 2007-04-02 22:31 ` Andi Kleen 2007-04-02 22:37 ` Christoph Lameter 2007-04-02 22:41 ` Martin Bligh 2007-04-02 22:49 ` Christoph Lameter 2007-04-02 22:52 ` Martin Bligh 2007-04-05 12:07 ` Andy Whitcroft 2007-04-04 21:27 ` Bob Picco 2007-04-04 22:38 ` Christoph Lameter 2007-04-02 23:03 ` Bjorn Helgaas 2007-04-02 15:44 ` Christoph Lameter 2007-04-02 16:18 ` Christoph Lameter 2007-04-02 20:50 ` [PATCH 1/2] Generic Virtual Memmap suport for SPARSEMEM Dave Hansen 2007-04-02 21:00 ` Christoph Lameter 2007-04-02 21:22 ` Dave Hansen 2007-04-02 21:31 ` Christoph Lameter 2007-04-02 21:42 ` Dave Hansen 2007-04-02 21:53 ` Christoph Lameter 
2007-04-02 22:05 ` Dave Hansen
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox