* [RFC] virtual memmap for sparsemem [1/2] arch independent part
@ 2006-10-19 8:21 KAMEZAWA Hiroyuki
2006-10-19 15:16 ` Andy Whitcroft
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-10-19 8:21 UTC (permalink / raw)
To: Linux-MM; +Cc: linux-ia64
This is a patch for virtual memmap on sparsemem against 2.6.19-rc2.
It booted well on my Tiger4.
At this point this is just an RFC. Comments on the patch and advice on benchmarking
are welcome. (The memory hotplug case is not well handled yet.)
ia64's SPARSEMEM uses SPARSEMEM_EXTREME. This requires a two-level table lookup in
software for page_to_pfn()/pfn_to_page(). A virtual memmap can remove that cost,
but will consume more TLB entries.
To keep the patches simple, pfn_valid() uses sparsemem's logic.
- Kame
==
This patch maps sparsemem's *sparse* memmap into a contiguous virtual address range
starting from virt_memmap_start.
By this, pfn_to_page, page_to_pfn can be implemented as
#define pfn_to_page(pfn) (virt_memmap_start + (pfn))
#define page_to_pfn(pg) (pg - virt_memmap_start)
Differences from ia64's VIRTUAL_MEMMAP are
* pfn_valid() uses sparsemem's logic.
* memmap is allocated per SECTION_SIZE, so there will be some RESERVED pages.
* no holes in a MAX_ORDER range, so HOLE_IN_ZONE=n here.
Todo
- fix vmalloc() case in memory hotadd. (maybe __get_vm_area() can be used.)
Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
include/asm-generic/memory_model.h | 7 +++
include/linux/mmzone.h | 8 +++
mm/Kconfig | 8 +++
mm/sparse.c | 85 +++++++++++++++++++++++++++++++++++--
4 files changed, 104 insertions(+), 4 deletions(-)
Index: linux-2.6.19-rc2/mm/Kconfig
===================================================================
--- linux-2.6.19-rc2.orig/mm/Kconfig 2006-10-18 18:13:39.000000000 +0900
+++ linux-2.6.19-rc2/mm/Kconfig 2006-10-18 18:14:07.000000000 +0900
@@ -77,6 +77,14 @@
def_bool y
depends on !SPARSEMEM
+config VMEMMAP_SPARSEMEM
+ bool "memmap in virtual space"
+ default y
+ depends on SPARSEMEM && ARCH_VMEMMAP_SPARSEMEM_SUPPORT
+ help
+ If this option is selected, some kernel code paths become faster,
+ but a large amount of kernel virtual address space is consumed.
+
#
# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
# to represent different areas of memory. This variable allows
Index: linux-2.6.19-rc2/include/asm-generic/memory_model.h
===================================================================
--- linux-2.6.19-rc2.orig/include/asm-generic/memory_model.h 2006-09-20 12:42:06.000000000 +0900
+++ linux-2.6.19-rc2/include/asm-generic/memory_model.h 2006-10-18 18:14:07.000000000 +0900
@@ -47,6 +47,7 @@
})
#elif defined(CONFIG_SPARSEMEM)
+#ifndef CONFIG_VMEMMAP_SPARSEMEM
/*
* Note: section's mem_map is encorded to reflect its start_pfn.
* section[i].section_mem_map == mem_map's address - start_pfn;
@@ -62,6 +63,12 @@
struct mem_section *__sec = __pfn_to_section(__pfn); \
__section_mem_map_addr(__sec) + __pfn; \
})
+#else /* CONFIG_VMEMMAP_SPARSEMEM */
+
+#define __pfn_to_page(pfn) (virt_memmap_start + (pfn))
+#define __page_to_pfn(pg) ((unsigned long)((pg) - virt_memmap_start))
+
+#endif /* CONFIG_VMEMMAP_SPARSEMEM */
#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
Index: linux-2.6.19-rc2/include/linux/mmzone.h
===================================================================
--- linux-2.6.19-rc2.orig/include/linux/mmzone.h 2006-10-18 18:13:39.000000000 +0900
+++ linux-2.6.19-rc2/include/linux/mmzone.h 2006-10-18 18:14:07.000000000 +0900
@@ -599,6 +599,14 @@
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif
+
+#ifdef CONFIG_VMEMMAP_SPARSEMEM
+extern struct page *virt_memmap_start;
+extern void init_vmemmap_sparsemem(void *addr);
+#else
+#define init_vmemmap_sparsemem(addr) do{}while(0)
+#endif
+
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
if (!mem_section[SECTION_NR_TO_ROOT(nr)])
Index: linux-2.6.19-rc2/mm/sparse.c
===================================================================
--- linux-2.6.19-rc2.orig/mm/sparse.c 2006-09-20 12:42:06.000000000 +0900
+++ linux-2.6.19-rc2/mm/sparse.c 2006-10-19 16:58:06.000000000 +0900
@@ -9,7 +9,81 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <asm/dma.h>
+#include <asm/pgalloc.h>
+#ifdef CONFIG_VMEMMAP_SPARSEMEM
+struct page *virt_memmap_start;
+EXPORT_SYMBOL_GPL(virt_memmap_start);
+
+void init_vmemmap_sparsemem(void *start_addr)
+{
+ virt_memmap_start = start_addr;
+}
+
+void *pte_alloc_vmemmap(int node)
+{
+ void *ret;
+ if (system_state == SYSTEM_BOOTING) {
+ ret = alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE);
+ } else {
+ ret = kmalloc_node(PAGE_SIZE, GFP_KERNEL, node);
+ memset(ret, 0 , PAGE_SIZE);
+ }
+ BUG_ON(!ret);
+ return ret;
+}
+/*
+ * At Hot-add, vmalloc'ed memmap will never call this.
+ * They have been already in suitable address.
+ * Called only when map is allocated by alloc_bootmem()/alloc_pages()
+ */
+static void map_virtual_memmap(unsigned long section, void *map, int node)
+{
+ unsigned long vmap_start, vmap_end, vmap;
+ unsigned long pfn;
+ void *pg;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ BUG_ON (!virt_memmap_start);
+
+ pfn = section_nr_to_pfn(section);
+ vmap_start = (unsigned long)(virt_memmap_start + pfn);
+ vmap_end = (unsigned long)(vmap_start + sizeof(struct page) * PAGES_PER_SECTION);
+
+ for (vmap = vmap_start; vmap < vmap_end; vmap += PAGE_SIZE, map += PAGE_SIZE)
+ {
+ pgd = pgd_offset_k(vmap);
+ if (pgd_none(*pgd)) {
+ pg = pte_alloc_vmemmap(node);
+ pgd_populate(&init_mm, pgd, pg);
+ }
+ pud = pud_offset(pgd, vmap);
+ if (pud_none(*pud)) {
+ pg = pte_alloc_vmemmap(node);
+ pud_populate(&init_mm, pud, pg);
+ }
+ pmd = pmd_offset(pud, vmap);
+ if (pmd_none(*pmd)) {
+ pg = pte_alloc_vmemmap(node);
+ pmd_populate_kernel(&init_mm, pmd, pg);
+ }
+ pte = pte_offset_kernel(pmd, vmap);
+ if (pte_none(*pte))
+ set_pte(pte, pfn_pte(__pa(map) >> PAGE_SHIFT, PAGE_KERNEL));
+ }
+ return;
+}
+#else /* CONFIG_VMEMMAP_SPARSEMEM */
+
+static inline void map_virtual_memmap(unsigned long section, void *map, int nid)
+{
+ return;
+}
+
+#endif /* CONFIG_VMEMMAP_SPARSEMEM */
/*
* Permanent SPARSEMEM data:
*
@@ -175,13 +249,14 @@
}
static int sparse_init_one_section(struct mem_section *ms,
- unsigned long pnum, struct page *mem_map)
+ unsigned long pnum, struct page *mem_map, int nid)
{
if (!valid_section(ms))
return -EINVAL;
ms->section_mem_map &= ~SECTION_MAP_MASK;
ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
+ map_virtual_memmap(pnum, mem_map, nid);
return 1;
}
@@ -214,10 +289,11 @@
page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
if (page)
goto got_map_page;
-
+#ifndef CONFIG_VMEMMAP_SPARSEMEM
ret = vmalloc(memmap_size);
if (ret)
goto got_map_ptr;
+#endif
return NULL;
got_map_page:
@@ -261,7 +337,8 @@
map = sparse_early_mem_map_alloc(pnum);
if (!map)
continue;
- sparse_init_one_section(__nr_to_section(pnum), pnum, map);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map,
+ sparse_early_nid(__nr_to_section(pnum)));
}
}
@@ -296,7 +373,7 @@
}
ms->section_mem_map |= SECTION_MARKED_PRESENT;
- ret = sparse_init_one_section(ms, section_nr, memmap);
+ ret = sparse_init_one_section(ms, section_nr, memmap, zone->zone_pgdat->node_id);
out:
pgdat_resize_unlock(pgdat, &flags);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: dont@kvack.org
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-19 8:21 [RFC] virtual memmap for sparsemem [1/2] arch independent part KAMEZAWA Hiroyuki
@ 2006-10-19 15:16 ` Andy Whitcroft
2006-10-19 16:43 ` Christoph Lameter
2006-10-20 1:00 ` KAMEZAWA Hiroyuki
2006-10-19 16:39 ` Christoph Lameter
2006-11-21 11:37 ` Heiko Carstens
2 siblings, 2 replies; 13+ messages in thread
From: Andy Whitcroft @ 2006-10-19 15:16 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: Linux-MM, linux-ia64, Dave Hansen
KAMEZAWA Hiroyuki wrote:
> This is a patch for virtual memmap on sparsemem against 2.6.19-rc2.
> It booted well on my Tiger4.
>
> At this point this is just an RFC. Comments on the patch and advice on benchmarking
> are welcome. (The memory hotplug case is not well handled yet.)
>
> ia64's SPARSEMEM uses SPARSEMEM_EXTREME. This requires a two-level table lookup in
> software for page_to_pfn()/pfn_to_page(). A virtual memmap can remove that cost,
> but will consume more TLB entries.
>
> To keep the patches simple, pfn_valid() uses sparsemem's logic.
Firstly I am pleased to see that this doesn't convert the whole of
sparsemem to use a virtual map. That had been suggested and would
really not work for 32 bit. Good.
>
> - Kame
> ==
> This patch maps sparsemem's *sparse* memmap into a contiguous virtual address range
> starting from virt_memmap_start.
>
> By this, pfn_to_page, page_to_pfn can be implemented as
> #define pfn_to_page(pfn) (virt_memmap_start + (pfn))
> #define page_to_pfn(pg) (pg - virt_memmap_start)
>
>
> Difference from ia64's VIRTUAL_MEMMAP are
> * pfn_valid() uses sparsemem's logic.
> * memmap is allocated per SECTION_SIZE, so there will be some RESERVED pages.
> * no holes in a MAX_ORDER range, so HOLE_IN_ZONE=n here.
This is a good thing too as one of the main issues we've had with the
VIRTUAL_MEMMAP stuff is this need to pfn_valid each and every
conversion. Of course the same change could be applied there just as well.
>
> Todo
> - fix vmalloc() case in memory hotadd. (maybe __get_vm_area() can be used.)
>
> Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> include/asm-generic/memory_model.h | 7 +++
> include/linux/mmzone.h | 8 +++
> mm/Kconfig | 8 +++
> mm/sparse.c | 85 +++++++++++++++++++++++++++++++++++--
> 4 files changed, 104 insertions(+), 4 deletions(-)
>
> Index: linux-2.6.19-rc2/mm/Kconfig
> ===================================================================
> --- linux-2.6.19-rc2.orig/mm/Kconfig 2006-10-18 18:13:39.000000000 +0900
> +++ linux-2.6.19-rc2/mm/Kconfig 2006-10-18 18:14:07.000000000 +0900
> @@ -77,6 +77,14 @@
> def_bool y
> depends on !SPARSEMEM
>
> +config VMEMMAP_SPARSEMEM
> + bool "memmap in virtual space"
> + default y
> + depends on SPARSEMEM && ARCH_VMEMMAP_SPARSEMEM_SUPPORT
> + help
> + If this option is selected, you can speed up some kernel execution.
> + But this consumes large amount of virtual memory area in kernel.
> +
> #
> # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
> # to represent different areas of memory. This variable allows
> Index: linux-2.6.19-rc2/include/asm-generic/memory_model.h
> ===================================================================
> --- linux-2.6.19-rc2.orig/include/asm-generic/memory_model.h 2006-09-20 12:42:06.000000000 +0900
> +++ linux-2.6.19-rc2/include/asm-generic/memory_model.h 2006-10-18 18:14:07.000000000 +0900
> @@ -47,6 +47,7 @@
> })
>
> #elif defined(CONFIG_SPARSEMEM)
> +#ifndef CONFIG_VMEMMAP_SPARSEMEM
Ok, this is a sub-type of sparsemem; we already have one called extreme,
named CONFIG_SPARSEMEM_EXTREME, so it seems sensible to stay
within this namespace and call this CONFIG_SPARSEMEM_VMEMMAP.
> /*
> * Note: section's mem_map is encorded to reflect its start_pfn.
> * section[i].section_mem_map == mem_map's address - start_pfn;
> @@ -62,6 +63,12 @@
> struct mem_section *__sec = __pfn_to_section(__pfn); \
> __section_mem_map_addr(__sec) + __pfn; \
> })
> +#else /* CONFIG_VMEMMAP_SPARSEMEM */
> +
> +#define __pfn_to_page(pfn) (virt_memmap_start + (pfn))
> +#define __page_to_pfn(pg) ((unsigned long)((pg) - virt_memmap_start))
> +
> +#endif /* CONFIG_VMEMMAP_SPARSEMEM */
> #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
Could we not leverage the standard infrastructure here? It almost feels
like if __section_mem_map_addr just returned virt_memmap_start then
things would just come out the same with the compiler able to optimise
things away. It would stop us having to change this above section which
would perhaps seem nicer? I've not looked at all the other users of it
to see if that would defeat the rest of sparsemem, so I may be talking
out of my hat.
>
> #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
> Index: linux-2.6.19-rc2/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6.19-rc2.orig/include/linux/mmzone.h 2006-10-18 18:13:39.000000000 +0900
> +++ linux-2.6.19-rc2/include/linux/mmzone.h 2006-10-18 18:14:07.000000000 +0900
> @@ -599,6 +599,14 @@
> extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
> #endif
>
> +
> +#ifdef CONFIG_VMEMMAP_SPARSEMEM
> +extern struct page *virt_memmap_start;
> +extern void init_vmemmap_sparsemem(void *addr);
> +#else
> +#define init_vmemmap_sparsemem(addr) do{}while(0)
> +#endif
> +
The existing initialisation function for sparsemem is sparse_init(). It
seems that this one should follow the same scheme if we are part of
sparsemem: sparse_vmemmap_init() perhaps, though as this is defining
the base address, maybe sparse_vmemmap_base() or
sparse_vmemmap_setbase().
> static inline struct mem_section *__nr_to_section(unsigned long nr)
> {
> if (!mem_section[SECTION_NR_TO_ROOT(nr)])
> Index: linux-2.6.19-rc2/mm/sparse.c
> ===================================================================
> --- linux-2.6.19-rc2.orig/mm/sparse.c 2006-09-20 12:42:06.000000000 +0900
> +++ linux-2.6.19-rc2/mm/sparse.c 2006-10-19 16:58:06.000000000 +0900
> @@ -9,7 +9,81 @@
> #include <linux/spinlock.h>
> #include <linux/vmalloc.h>
> #include <asm/dma.h>
> +#include <asm/pgalloc.h>
>
> +#ifdef CONFIG_VMEMMAP_SPARSEMEM
> +struct page *virt_memmap_start;
> +EXPORT_SYMBOL_GPL(virt_memmap_start);
> +
> +void init_vmemmap_sparsemem(void *start_addr)
> +{
> + virt_memmap_start = start_addr;
> +}
> +
> +void *pte_alloc_vmemmap(int node)
> +{
> + void *ret;
> + if (system_state == SYSTEM_BOOTING) {
> + ret = alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE);
> + } else {
> + ret = kmalloc_node(PAGE_SIZE, GFP_KERNEL, node);
> + memset(ret, 0 , PAGE_SIZE);
> + }
> + BUG_ON(!ret);
> + return ret;
> +}
Hmmm, this routine is not __init, but is calling an __init function. I
assume it's safe under the system_state switcheroo, but the tools will
barf about the difference. Is there a way to mark this up as ok
(assuming it is)?
> +/*
> + * At Hot-add, vmalloc'ed memmap will never call this.
> + * They have been already in suitable address.
> + * Called only when map is allocated by alloc_bootmem()/alloc_pages()
They will? By whom? If they alloc one it has to be placed in the real
virtual map in VMEMMAP mode else it won't be found by pfn_to_page and
family. I assume I am missing the point of this comment. Could you
explain more fully... Or perhaps this is a bit which is not right yet,
as you do say in the heading that hotplug is not right?
> + */
> +static void map_virtual_memmap(unsigned long section, void *map, int node)
> +{
> + unsigned long vmap_start, vmap_end, vmap;
> + unsigned long pfn;
> + void *pg;
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> +
> + BUG_ON (!virt_memmap_start);
> +
> + pfn = section_nr_to_pfn(section);
> + vmap_start = (unsigned long)(virt_memmap_start + pfn);
> + vmap_end = (unsigned long)(vmap_start + sizeof(struct page) * PAGES_PER_SECTION);
> +
> + for (vmap = vmap_start; vmap < vmap_end; vmap += PAGE_SIZE, map += PAGE_SIZE)
> + {
> + pgd = pgd_offset_k(vmap);
> + if (pgd_none(*pgd)) {
> + pg = pte_alloc_vmemmap(node);
> + pgd_populate(&init_mm, pgd, pg);
> + }
> + pud = pud_offset(pgd, vmap);
> + if (pud_none(*pud)) {
> + pg = pte_alloc_vmemmap(node);
> + pud_populate(&init_mm, pud, pg);
> + }
> + pmd = pmd_offset(pud, vmap);
> + if (pmd_none(*pmd)) {
> + pg = pte_alloc_vmemmap(node);
> + pmd_populate_kernel(&init_mm, pmd, pg);
> + }
> + pte = pte_offset_kernel(pmd, vmap);
> + if (pte_none(*pte))
> + set_pte(pte, pfn_pte(__pa(map) >> PAGE_SHIFT, PAGE_KERNEL));
> + }
> + return;
> +}
It's nice to see that this is generic, as we can then add large page
support where applicable. Are there really no helpers in
the world to make this less 'wordy'?
We use this in the fault handler, are we using the above because we
want to assure numa locality of the allocations? (Which would be valid.)
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
return VM_FAULT_OOM;
> +#else /* CONFIG_VMEMMAP_SPARSEMEM */
> +
> +static inline void map_virtual_memmap(unsigned long section, void *map, int nid)
> +{
> + return;
> +}
> +
> +#endif /* CONFIG_VMEMMAP_SPARSEMEM */
> /*
> * Permanent SPARSEMEM data:
> *
> @@ -175,13 +249,14 @@
> }
>
> static int sparse_init_one_section(struct mem_section *ms,
> - unsigned long pnum, struct page *mem_map)
> + unsigned long pnum, struct page *mem_map, int nid)
> {
> if (!valid_section(ms))
> return -EINVAL;
>
> ms->section_mem_map &= ~SECTION_MAP_MASK;
> ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
> + map_virtual_memmap(pnum, mem_map, nid);
We seem to be using mem_map in sparse.c for the mem map, so perhaps this
should be map_virtual_mem_map(), or map_vmap_mem_map() or something?
>
> return 1;
> }
> @@ -214,10 +289,11 @@
> page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
> if (page)
> goto got_map_page;
> -
> +#ifndef CONFIG_VMEMMAP_SPARSEMEM
> ret = vmalloc(memmap_size);
> if (ret)
> goto got_map_ptr;
> +#endif
I assume we need this because it's not really a good thing to have pages
allocated which are already mapped, as you are going to map them
elsewhere? Yes? This only seems to be used from hotplug, so I'll defer
to Dave.
>
> return NULL;
> got_map_page:
> @@ -261,7 +337,8 @@
> map = sparse_early_mem_map_alloc(pnum);
> if (!map)
> continue;
> - sparse_init_one_section(__nr_to_section(pnum), pnum, map);
> + sparse_init_one_section(__nr_to_section(pnum), pnum, map,
> + sparse_early_nid(__nr_to_section(pnum)));
> }
> }
>
> @@ -296,7 +373,7 @@
> }
> ms->section_mem_map |= SECTION_MARKED_PRESENT;
>
> - ret = sparse_init_one_section(ms, section_nr, memmap);
> + ret = sparse_init_one_section(ms, section_nr, memmap, zone->zone_pgdat->node_id);
In sparse_add_one_section() we already have the pgdat in a local, so
this would better be pgdat->node_id.
>
> out:
> pgdat_resize_unlock(pgdat, &flags);
>
-apw
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-19 8:21 [RFC] virtual memmap for sparsemem [1/2] arch independent part KAMEZAWA Hiroyuki
2006-10-19 15:16 ` Andy Whitcroft
@ 2006-10-19 16:39 ` Christoph Lameter
2006-10-20 1:18 ` KAMEZAWA Hiroyuki
2006-11-21 11:37 ` Heiko Carstens
2 siblings, 1 reply; 13+ messages in thread
From: Christoph Lameter @ 2006-10-19 16:39 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: Linux-MM, linux-ia64
On Thu, 19 Oct 2006, KAMEZAWA Hiroyuki wrote:
> To keep the patches simple, pfn_valid() uses sparsemem's logic.
Hmm... pfn_valid is much less costly if you use ia64's scheme. You can
simply probe without having to walk tables.
> This patch maps sparsemem's *sparse* memmap into contiguous virtual address range
> starting from virt_memmap_start.
Could you make that a static address instead of a variable? Also we
already have vmem_map (ia64 specific) and mem_map. The logic here is the
same as FLATMEM. Why not use the definitions for FLATMEM?
> * memmap is allocated per SECTION_SIZE, so there will be some of RESERVED pages.
> * no holes in MAX_ORDER range. so HOLE_IN_ZONE=n here.
Good. Had a patch here to do the same but I do not have time to get to
it. Surely wish that this will become the default config and that we can
get rid of at least some of the memory models.
> +#ifdef CONFIG_VMEMMAP_SPARSEMEM
> +extern struct page *virt_memmap_start;
extern struct page[] would be better performance-wise. Use the definitions
for FLATMEM?
> + if (pte_none(*pte))
> + set_pte(pte, pfn_pte(__pa(map) >> PAGE_SHIFT, PAGE_KERNEL));
Would it be possible to add support for larger page sizes? On x86_64 we
probably would like to use 2MB pages and it may be good to have
configurable page size on ia64.
The virtual memmap has the potential of becoming the default for x86_64
and many other platforms that already map memory. There is no performance
difference between FLATMEM and this virtual memmap approach if there are
already mappings in play.
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-19 15:16 ` Andy Whitcroft
@ 2006-10-19 16:43 ` Christoph Lameter
2006-10-20 1:00 ` KAMEZAWA Hiroyuki
1 sibling, 0 replies; 13+ messages in thread
From: Christoph Lameter @ 2006-10-19 16:43 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: KAMEZAWA Hiroyuki, Linux-MM, linux-ia64, Dave Hansen
On Thu, 19 Oct 2006, Andy Whitcroft wrote:
> This is a good thing too as one of the main issues we've had with the
> VIRTUAL_MEMMAP stuff is this need to pfn_valid each and every
> conversion. Of course the same change could be applied there just as well.
Well it should have been done instead of adding this strange hole logic.
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-19 15:16 ` Andy Whitcroft
2006-10-19 16:43 ` Christoph Lameter
@ 2006-10-20 1:00 ` KAMEZAWA Hiroyuki
1 sibling, 0 replies; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-10-20 1:00 UTC (permalink / raw)
To: Andy Whitcroft; +Cc: linux-mm, linux-ia64, haveblue
Hi,
On Thu, 19 Oct 2006 16:16:12 +0100
Andy Whitcroft <apw@shadowen.org> wrote:
> KAMEZAWA Hiroyuki wrote:
> > #elif defined(CONFIG_SPARSEMEM)
> > +#ifndef CONFIG_VMEMMAP_SPARSEMEM
>
> Ok, this is a sub-type of sparsemem; we already have one called extreme,
> named CONFIG_SPARSEMEM_EXTREME, so it seems sensible to stay
> within this namespace and call this CONFIG_SPARSEMEM_VMEMMAP.
>
looks better. I'll rename.
> > +#else /* CONFIG_VMEMMAP_SPARSEMEM */
> > +
> > +#define __pfn_to_page(pfn) (virt_memmap_start + (pfn))
> > +#define __page_to_pfn(pg) ((unsigned long)((pg) - virt_memmap_start))
> > +
> > +#endif /* CONFIG_VMEMMAP_SPARSEMEM */
> > #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
>
> Could we not leverage the standard infrastructure here? It almost feels
> like if __section_mem_map_addr just returned virt_memmap_start then
> things would just come out the same with the compiler able to optimse
> things away. It would stop us having to change this above section which
> would perhaps seem nicer? I've not looked at all the other users of it
> to see if that would defeat the rest of sparsemem, so I may be talking
> out of my hat.
>
Hm, okay. I'll try that next time and check how it looks.
> > +
> > +#ifdef CONFIG_VMEMMAP_SPARSEMEM
> > +extern struct page *virt_memmap_start;
> > +extern void init_vmemmap_sparsemem(void *addr);
> > +#else
> > +#define init_vmemmap_sparsemem(addr) do{}while(0)
> > +#endif
> > +
>
> The existing initialisation function for sparsemem is sparse_init(). It
> seems that this one should follow the same scheme if we are part of
> sparsemem. sparse_vmemmap_init() perhaps, though as this is defining
> the address of it perhaps, sparse_vmemmap_base() or
> sparse_vmemmap_setbase().
>
Okay.
I have another idea, which Christoph mentioned: make the start address
of the vmemmap a constant value. If we do so, this call can be removed.
> > +void init_vmemmap_sparsemem(void *start_addr)
> > +{
> > + virt_memmap_start = start_addr;
> > +}
> > +
> > +void *pte_alloc_vmemmap(int node)
> > +{
> > + void *ret;
> > + if (system_state == SYSTEM_BOOTING) {
> > + ret = alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE);
> > + } else {
> > + ret = kmalloc_node(PAGE_SIZE, GFP_KERNEL, node);
> > + memset(ret, 0 , PAGE_SIZE);
> > + }
> > + BUG_ON(!ret);
> > + return ret;
> > +}
>
> Hmmm, this routine is not __init, but is calling an __init function. I
> assume it's safe under the system_state switcheroo, but the tools will
> barf about the difference. Is there a way to mark this up as ok
> (assuming it is).
Maybe my mistake was handling the boot case and the memory-hot-add case in one patch.
I'll add __init or __meminit in the suitable places next time.
>
> > +/*
> > + * At Hot-add, vmalloc'ed memmap will never call this.
> > + * They have been already in suitable address.
> > + * Called only when map is allocated by alloc_bootmem()/alloc_pages()
>
> They will? By whom?
> If they alloc one it has to be placed in the real
> virtual map in VMEMMAP mode else it won't be found by pfn_to_page and
> family. I assume I am missing the point of this comment. Could you
> explain more fully ... Or perhaps this is a bit which is not right yet
> as you do say in the heading that hotplug is not right?
>
Sorry... what I wanted to say here was that a memmap vmalloc'ed by memory hotplug
cannot be handled by this routine.
I'll split the memory-hotplug case out of this patch to make things clearer.
> > + */
> > +static void map_virtual_memmap(unsigned long section, void *map, int node)
> > +{
> > + unsigned long vmap_start, vmap_end, vmap;
> > + unsigned long pfn;
> > + void *pg;
> > + pgd_t *pgd;
> > + pud_t *pud;
> > + pmd_t *pmd;
> > + pte_t *pte;
> > +
> > + BUG_ON (!virt_memmap_start);
> > +
> > + pfn = section_nr_to_pfn(section);
> > + vmap_start = (unsigned long)(virt_memmap_start + pfn);
> > + vmap_end = (unsigned long)(vmap_start + sizeof(struct page) * PAGES_PER_SECTION);
> > +
> > + for (vmap = vmap_start; vmap < vmap_end; vmap += PAGE_SIZE, map += PAGE_SIZE)
> > + {
> > + pgd = pgd_offset_k(vmap);
> > + if (pgd_none(*pgd)) {
> > + pg = pte_alloc_vmemmap(node);
> > + pgd_populate(&init_mm, pgd, pg);
> > + }
> > + pud = pud_offset(pgd, vmap);
> > + if (pud_none(*pud)) {
> > + pg = pte_alloc_vmemmap(node);
> > + pud_populate(&init_mm, pud, pg);
> > + }
> > + pmd = pmd_offset(pud, vmap);
> > + if (pmd_none(*pmd)) {
> > + pg = pte_alloc_vmemmap(node);
> > + pmd_populate_kernel(&init_mm, pmd, pg);
> > + }
> > + pte = pte_offset_kernel(pmd, vmap);
> > + if (pte_none(*pte))
> > + set_pte(pte, pfn_pte(__pa(map) >> PAGE_SHIFT, PAGE_KERNEL));
> > + }
> > + return;
> > +}
>
> It's nice to see that this is generic as we can then add large page
> support for instance where applicable. Are there really no helpers in
> the world to make this less 'wordy'.
>
> We use this in the fault handler, are we using the above because we
> want to assure numa locality of the allocations? (Which would be valid.)
>
Yes, for NUMA-local allocation, and for using alloc_bootmem().
> > ms->section_mem_map &= ~SECTION_MAP_MASK;
> > ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
> > + map_virtual_memmap(pnum, mem_map, nid);
>
> We seem to be using mem_map in sparse.c for the mem map, so perhaps this
> should be map_virtual_mem_map(), or map_vmap_mem_map() or something?
Okay, will rename.
> >
> > return 1;
> > }
> > @@ -214,10 +289,11 @@
> > page = alloc_pages(GFP_KERNEL, get_order(memmap_size));
> > if (page)
> > goto got_map_page;
> > -
> > +#ifndef CONFIG_VMEMMAP_SPARSEMEM
> > ret = vmalloc(memmap_size);
> > if (ret)
> > goto got_map_ptr;
> > +#endif
>
> > I assume we need this because it's not really a good thing to have pages
> allocated which are already mapped as you are going to map them
> > elsewhere? Yes?
Yes.
> > This only seems to be used from hotplug, so I'll defer to Dave.
I'll add hotplug handling later.
> > - ret = sparse_init_one_section(ms, section_nr, memmap);
> > + ret = sparse_init_one_section(ms, section_nr, memmap, zone->zone_pgdat->node_id);
>
> In sparse_add_one_section() we already have the pgdat in a local, so
> this would better be pgdat->node_id.
>
Okay, thanks.
Thank you for comments.
I'll refresh this patch.
Regards,
-Kame
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-19 16:39 ` Christoph Lameter
@ 2006-10-20 1:18 ` KAMEZAWA Hiroyuki
2006-10-20 1:42 ` Christoph Lameter
0 siblings, 1 reply; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-10-20 1:18 UTC (permalink / raw)
To: Christoph Lameter; +Cc: linux-mm, linux-ia64
On Thu, 19 Oct 2006 09:39:55 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> On Thu, 19 Oct 2006, KAMEZAWA Hiroyuki wrote:
>
> > To keep the patches simple, pfn_valid() uses sparsemem's logic.
>
> Hmm... pfn_valid is much less costly if you use ia64's scheme.
> You can simply probe without having to walk tables.
>
Yes, but it seems to need a per-arch implementation (in the page fault handler),
like this (from ia64):
==
#ifdef CONFIG_VIRTUAL_MEM_MAP
/*
* If fault is in region 5 and we are in the kernel, we may already
* have the mmap_sem (pfn_valid macro is called during mmap). There
* is no vma for region 5 addr's anyway, so skip getting the semaphore
* and go directly to the exception handling code.
*/
if ((REGION_NUMBER(address) == 5) && !user_mode(regs))
goto bad_area_no_up;
#endif
==
Maybe an extra optimization patch can be discussed after this generic code is settled.
> > This patch maps sparsemem's *sparse* memmap into contiguous virtual address range
> > starting from virt_memmap_start.
>
> Could you make that a static address instead of a variable? Also we
> already have vmem_map (ia64 specific) and mem_map. The logic here is the
> same as FLATMEM. Why not use the definitions for FLATMEM?
It depends on how the #ifdefs look. Here, I just wanted to throw this stuff into
the SPARSEMEM subsystem.
>
> > * memmap is allocated per SECTION_SIZE, so there will be some of RESERVED pages.
> > * no holes in MAX_ORDER range. so HOLE_IN_ZONE=n here.
>
> Good. Had a patch here to do the same but I do not have time to get to
> it. Surely wish that this will become the default config and that we can
> get rid of at least some of the memory models.
>
> > +#ifdef CONFIG_VMEMMAP_SPARSEMEM
> > +extern struct page *virt_memmap_start;
>
> extern struct page[] would be better performance wise. Use the definitions
> for FLATMEM?
Okay, I'll make it an array, or some constant value.
>
> > + if (pte_none(*pte))
> > + set_pte(pte, pfn_pte(__pa(map) >> PAGE_SHIFT, PAGE_KERNEL));
>
> Would it be possible to add support for larger page sizes? On x86_64 we
> probably would like to use 2MB pages and it may be good to have
> configurable page size on ia64.
>
> The virtual memmap has the potential of becoming the default for x86_64
> and many other platforms that already map memory. There is no performance
> difference between FLATMEM and this virtual memmap approach if there are
> already mappings in play.
>
Hmm, would adding CONFIG_HAVE_ARCH_LARGE_KERNEL_PAGE_MAPPING be good?
We can add per-arch patches afterwards.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <dont@kvack.org>
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-20 1:18 ` KAMEZAWA Hiroyuki
@ 2006-10-20 1:42 ` Christoph Lameter
2006-10-20 2:06 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 13+ messages in thread
From: Christoph Lameter @ 2006-10-20 1:42 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, linux-ia64
On Fri, 20 Oct 2006, KAMEZAWA Hiroyuki wrote:
> Yes. but it seems to need per-arch implementation (in page fault handler).
> like this (from ia64)
If we have a statically assigned virtual memory area then this is not a
big problem. With sharing the VMALLOC address space this may be a problem.
I think a static address space is no problem for 64 bit platforms where
we have lots of virtual address space. 32 bit platforms may have a dense
address space where vmemmap is not needed.
Maybe switch to a static address range? You saw my ia64 patch
that did this, right?
> Maybe an extra optimization patch can be discussed after this generic code is settled.
Ok.
> > > +#ifdef CONFIG_VMEMMAP_SPARSEMEM
> > > +extern struct page *virt_memmap_start;
> >
> > extern struct page[] would be better performance wise. Use the definitions
> > for FLATMEM?
> Okay, I will make it an array, or some constant value.
See my IA64 patchset for vmemmap static. We could define the mem_map
address statically in the linker.
> > The virtual memmap has the potential of becoming the default for x86_64
> > and many other platforms that already map memory. There is no performance
> > difference between FLATMEM and this virtual memmap approach if there are
> > already mappings in play.
> >
> Hmm, would adding CONFIG_HAVE_ARCH_LARGE_KERNEL_PAGE_MAPPING be good?
> We can add per-arch patches afterwards.
Great!
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-20 1:42 ` Christoph Lameter
@ 2006-10-20 2:06 ` KAMEZAWA Hiroyuki
2006-10-20 2:26 ` Christoph Lameter
0 siblings, 1 reply; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-10-20 2:06 UTC (permalink / raw)
To: Christoph Lameter; +Cc: linux-mm, linux-ia64
On Thu, 19 Oct 2006 18:42:24 -0700 (PDT)
Christoph Lameter <clameter@sgi.com> wrote:
> On Fri, 20 Oct 2006, KAMEZAWA Hiroyuki wrote:
>
> > Yes. but it seems to need per-arch implementation (in page fault handler).
> > like this (from ia64)
>
> If we have a statically assigned virtual memory area then this is not a
> big problem. With sharing the VMALLOC address space this may be a problem.
> I think a static address space is no problem for 64 bit platforms where
> we have lots of virtual address space. 32 bit platforms may have a dense
> address space where vmemmap is not needed.
> Maybe switch to a static address range? You saw my ia64 patch
> that did this, right?
>
I'll study it.
> > > > +#ifdef CONFIG_VMEMMAP_SPARSEMEM
> > > > +extern struct page *virt_memmap_start;
> > >
> > > extern struct page[] would be better performance wise. Use the definitions
> > > for FLATMEM?
> > Okay, I will make it an array, or some constant value.
>
> See my IA64 patchset for vmemmap static. We could define the mem_map
> address statically in the linker.
>
Ok.
> > > The virtual memmap has the potential of becoming the default for x86_64
> > > and many other platforms that already map memory. There is no performance
> > > difference between FLATMEM and this virtual memmap approach if there are
> > > already mappings in play.
> > >
> > Hmm, would adding CONFIG_HAVE_ARCH_LARGE_KERNEL_PAGE_MAPPING be good?
> > We can add per-arch patches afterwards.
>
> Great!
By the way, we have to make sizeof(struct page) a (1 << x)-aligned size to use
large pages. (IIRC, my gcc 3.4.3 says it is 56 bytes.)
-Kame
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-20 2:06 ` KAMEZAWA Hiroyuki
@ 2006-10-20 2:26 ` Christoph Lameter
2006-10-20 16:20 ` Luck, Tony
0 siblings, 1 reply; 13+ messages in thread
From: Christoph Lameter @ 2006-10-20 2:26 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, linux-ia64
On Fri, 20 Oct 2006, KAMEZAWA Hiroyuki wrote:
> By the way, we have to make sizeof(struct page) a (1 << x)-aligned size to use
> large pages. (IIRC, my gcc 3.4.3 says it is 56 bytes.)
Having it 1 << x would be useful to simplify the pfn_valid check but
you can also check the start and the end of the page struct to allow the
page struct cross a page boundary. See the ia64 virtual memmap
implementation of pfn_valid.
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-20 2:26 ` Christoph Lameter
@ 2006-10-20 16:20 ` Luck, Tony
0 siblings, 0 replies; 13+ messages in thread
From: Luck, Tony @ 2006-10-20 16:20 UTC (permalink / raw)
To: Christoph Lameter; +Cc: KAMEZAWA Hiroyuki, linux-mm, linux-ia64
On Thu, Oct 19, 2006 at 07:26:33PM -0700, Christoph Lameter wrote:
> On Fri, 20 Oct 2006, KAMEZAWA Hiroyuki wrote:
>
> > By the way, we have to make sizeof(struct page) a (1 << x)-aligned size to use
> > large pages. (IIRC, my gcc 3.4.3 says it is 56 bytes.)
>
> Having it 1 << x would be useful to simplify the pfn_valid check but
> you can also check the start and the end of the page struct to allow the
> page struct cross a page boundary. See the ia64 virtual memmap
> implementation of pfn_valid.
Rounding up sizeof(struct page) to a power of two would have to provide
a huge benefit somewhere to outweigh the cost of doing so. With a 16K
page size there are 64K pages/gigabyte ... so adding an 8-byte pad now would
waste an extra 0.5M per gigabyte of memory (which adds up to 2G on SGI's
monster 4TB machines). That's pretty bad ... but if we ever added anything
new to struct page and pushed it just over 64 bytes, it would be a complete
disaster to round up to 128!
Listen to Christoph. Check the start and end address of the page struct in
pfn_valid.
-Tony
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-10-19 8:21 [RFC] virtual memmap for sparsemem [1/2] arch independent part KAMEZAWA Hiroyuki
2006-10-19 15:16 ` Andy Whitcroft
2006-10-19 16:39 ` Christoph Lameter
@ 2006-11-21 11:37 ` Heiko Carstens
2006-11-21 12:19 ` KAMEZAWA Hiroyuki
2 siblings, 1 reply; 13+ messages in thread
From: Heiko Carstens @ 2006-11-21 11:37 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: Linux-MM, linux-ia64, Martin Schwidefsky
On Thu, Oct 19, 2006 at 05:21:40PM +0900, KAMEZAWA Hiroyuki wrote:
> This is a patch for virtual memmap on sparsemem against 2.6.19-rc2.
> booted well on my Tiger4.
>
> In this time, this is just a RFC. comments on patch and advises for benchmarking
> is welcome. (memory hotplug case is not well handled yet.)
>
> ia64's SPARSEMEM uses SPARSEMEM_EXTREME. This requires 2-level table lookup by
> software for page_to_pfn()/pfn_to_page(). virtual memmap can remove that costs.
> But will consume more TLBs.
>
> For make patches simple, pfn_valid() uses sparsemem's logic.
>
> - Kame
> ==
> This patch maps sparsemem's *sparse* memmap into contiguous virtual address range
> starting from virt_memmap_start.
>
> By this, pfn_to_page, page_to_pfn can be implemented as
> #define pfn_to_page(pfn) (virt_memmap_start + (pfn))
> #define page_to_pfn(pg) (pg - virt_memmap_start)
>
>
> Difference from ia64's VIRTUAL_MEMMAP are
> * pfn_valid() uses sparsemem's logic.
> * memmap is allocated per SECTION_SIZE, so there will be some of RESERVED pages.
> * no holes in MAX_ORDER range. so HOLE_IN_ZONE=n here.
>
> Todo
> - fix vmalloc() case in memory hotadd. (maybe __get_vm_area() can be used.)
Better late than never, but here is a reply as well :)
Is this supposed to replace ia64's vmem_map?
I'm asking because on s390 we need a vmem_map too, but don't want to be
limited by the sparsemem restrictions (especially SECTION_SIZE that is).
In addition we have a shared memory device driver (dcss) with which it
is possible to attach some shared memory. Because of that it is
necessary to be able to add some additional struct pages on-the-fly.
This is not very different to memory hotplug; I think it's even easier,
since all we need are some initialized struct pages.
Currently I have a working prototype that does all that but still needs
a lot of cleanup and some error handling. It is (of course) heavily
inspired by ia64's vmem_map implementation.
I'd love to go for a generic implementation, but if that is based on
sparsemem it doesn't make too much sense on s390.
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-11-21 12:19 ` KAMEZAWA Hiroyuki
@ 2006-11-21 12:19 ` Heiko Carstens
0 siblings, 0 replies; 13+ messages in thread
From: Heiko Carstens @ 2006-11-21 12:19 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: linux-mm, linux-ia64, schwidefsky
> > I'd love to go for a generic implementation, but if that is based on
> > sparsemem it doesn't make too much sense on s390.
>
> 'What type of vmem_map is supported?' is perhaps a per-arch decision, not generic.
> If people dislike the Flat/Discontig/Sparsemem complication, a cleanup
> patch will be posted and discussion will start. If not, nothing will happen.
Ok, I will work on the s390 arch specific patch and post it here. Maybe it's
worth adding a generic vmem_map interface, maybe not. We'll see.
* Re: [RFC] virtual memmap for sparsemem [1/2] arch independent part
2006-11-21 11:37 ` Heiko Carstens
@ 2006-11-21 12:19 ` KAMEZAWA Hiroyuki
2006-11-21 12:19 ` Heiko Carstens
0 siblings, 1 reply; 13+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-11-21 12:19 UTC (permalink / raw)
To: Heiko Carstens; +Cc: linux-mm, linux-ia64, schwidefsky
On Tue, 21 Nov 2006 12:37:08 +0100
Heiko Carstens <heiko.carstens@de.ibm.com> wrote:
> > Todo
> > - fix vmalloc() case in memory hotadd. (maybe __get_vm_area() can be used.)
>
> Better late than never, but here is a reply as well :)
>
Thank you for comment.
I've put this on hold for now because of piles of user troubles and of Excel and Word files ;)
> Is this supposed to replace ia64's vmem_map?
No. My aim is just to speed up sparsemem: ia64 sparsemem's page_to_pfn()
and pfn_to_page() are costly.
> I'm asking because on s390 we need a vmem_map too, but don't want to be
> limited by the sparsemem restrictions (especially SECTION_SIZE that is).
> In addition we have a shared memory device driver (dcss) with which it
> is possible to attach some shared memory. Because of that it is
> necessary to be able to add some additional struct pages on-the-fly.
> This is not very different to memory hotplug; I think it's even easier,
> since all we need are some initialized struct pages.
>
> Currently I have a working prototype that does all that but still needs
> a lot of cleanup and some error handling. It is (of course) heavily
> inspired by ia64's vmem_map implementation.
>
> I'd love to go for a generic implementation, but if that is based on
> sparsemem it doesn't make too much sense on s390.
'What type of vmem_map is supported?' is perhaps a per-arch decision, not generic.
If people dislike the Flat/Discontig/Sparsemem complication, a cleanup
patch will be posted and discussion will start. If not, nothing will happen.
-Kame