From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: linux-mm@kvack.org, mel@skynet.ie, clameter@engr.sgi.com,
	akpm@linux-foundation.org
Subject: [RFC] memory unplug patchset prep [9/16] create movable zone at boot
Date: Tue, 6 Mar 2007 13:52:32 +0900
Message-ID: <20070306135232.42a55807.kamezawa.hiroyu@jp.fujitsu.com>
In-Reply-To: <20070306133223.5d610daf.kamezawa.hiroyu@jp.fujitsu.com>

This patch adds code for creating movable zones.

It adds two kernel parameters:
- kernel_core_pages=XXX[KMG]
- kernel_core_ratio=xx

When kernel_core_pages is specified, zone(s) for not-movable pages are created
from the lowest addresses, and their total size is set to the specified amount.
This is probably useful for non-NUMA environments and for node hot-remove.
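
For example (illustrative value, not part of this patch), booting with

  kernel_core_pages=2G

would keep roughly the first 2GB of memory in the not-movable zones, with the
boundary rounded up to the zone alignment (a section boundary with SPARSEMEM,
a MAX_ORDER block otherwise), and place everything above that boundary into
ZONE_MOVABLE.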

When kernel_core_ratio is specified, zone(s) for not-movable pages are created
on each node. The size of the not-movable zone on a node is calculated as

 memory_on_node * kernel_core_ratio / 100.

This is probably useful for NUMA environments where you just want to use the
MOVABLE zone.
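
As a rough worked example (numbers are illustrative): a node with 4GB of memory
and kernel_core_ratio=25 keeps about 4GB * 25 / 100 = 1GB of not-movable memory
at the bottom of the node, and the remaining ~3GB of that node goes to
ZONE_MOVABLE, with the boundary rounded up to the same zone alignment.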

Note:
The changes to zone_spanned_pages_in_node()/zone_absent_pages_in_node() look ugly...
Also, this boot option is just a sample; I'll change it when I find a better
approach.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

---
 Documentation/kernel-parameters.txt |   11 ++
 include/linux/mmzone.h              |    3 
 mm/page_alloc.c                     |  198 +++++++++++++++++++++++++++++++++---
 3 files changed, 199 insertions(+), 13 deletions(-)

Index: devel-tree-2.6.20-mm2/mm/page_alloc.c
===================================================================
--- devel-tree-2.6.20-mm2.orig/mm/page_alloc.c
+++ devel-tree-2.6.20-mm2/mm/page_alloc.c
@@ -137,12 +137,16 @@ static unsigned long __initdata dma_rese
   int __initdata nr_nodemap_entries;
   unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
   unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata lowest_movable_pfn[MAX_NUMNODES];
+  unsigned long kernel_core_ratio;
+  unsigned long kernel_core_pages;
 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
   unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
   unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
+
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -2604,6 +2608,8 @@ void __init get_pfn_range_for_nid(unsign
  */
 unsigned long __init zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long *start_pfn,
+					unsigned long *end_pfn,
 					unsigned long *ignored)
 {
 	unsigned long node_start_pfn, node_end_pfn;
@@ -2611,8 +2617,30 @@ unsigned long __init zone_spanned_pages_
 
 	/* Get the start and end of the node and zone */
 	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
-	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+	if (start_pfn)
+		*start_pfn = 0;
+	if (end_pfn)
+		*end_pfn = 0;
+	if (!is_configured_zone(ZONE_MOVABLE) ||
+		   lowest_movable_pfn[nid] == 0) {
+		/* we don't use ZONE_MOVABLE */
+		zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+		zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+	} else if (zone_type == ZONE_MOVABLE) {
+		zone_start_pfn = lowest_movable_pfn[nid];
+		zone_end_pfn = node_end_pfn;
+	} else {
+		/* adjust range to lowest_movable_pfn[] */
+		zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+		zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+		if (zone_start_pfn >= lowest_movable_pfn[nid])
+			return 0;
+		zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+		zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+		if (zone_end_pfn > lowest_movable_pfn[nid])
+			zone_end_pfn = lowest_movable_pfn[nid];
+	}
 
 	/* Check that this node has pages within the zone's required range */
 	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
@@ -2621,8 +2649,11 @@ unsigned long __init zone_spanned_pages_
 	/* Move the zone boundaries inside the node if necessary */
 	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
 	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
-
 	/* Return the spanned pages */
+	if (start_pfn)
+		*start_pfn = zone_start_pfn;
+	if (end_pfn)
+		*end_pfn = zone_end_pfn;
 	return zone_end_pfn - zone_start_pfn;
 }
 
@@ -2692,16 +2723,24 @@ unsigned long __init absent_pages_in_ran
 /* Return the number of page frames in holes in a zone on a node */
 unsigned long __init zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long start,
+					unsigned long end,
 					unsigned long *ignored)
 {
 	unsigned long node_start_pfn, node_end_pfn;
 	unsigned long zone_start_pfn, zone_end_pfn;
 
 	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
-	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
-							node_start_pfn);
-	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
-							node_end_pfn);
+	if (start == 0 && end == 0) {
+		zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
+								node_start_pfn);
+		zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
+								node_end_pfn);
+	} else {
+		/* ZONE_MOVABLE always use passed params */
+		zone_start_pfn = start;
+		zone_end_pfn = end;
+	}
 
 	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
@@ -2709,13 +2748,22 @@ unsigned long __init zone_absent_pages_i
 #else
 static inline unsigned long zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
+					unsigned long *start_pfn,
+					unsigned long *end_pfn,
 					unsigned long *zones_size)
 {
+	/* this will not be used by the caller */
+	if (start_pfn)
+		*start_pfn = 0;
+	if (end_pfn)
+		*end_pfn = 0;
 	return zones_size[zone_type];
 }
 
 static inline unsigned long zone_absent_pages_in_node(int nid,
 						unsigned long zone_type,
+						unsigned long start,
+						unsigned long end,
 						unsigned long *zholes_size)
 {
 	if (!zholes_size)
@@ -2733,20 +2781,115 @@ static void __init calculate_node_totalp
 	enum zone_type i;
 
 	for (i = 0; i < MAX_NR_ZONES; i++)
-		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
+		totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, NULL, NULL,
 								zones_size);
 	pgdat->node_spanned_pages = totalpages;
 
 	realtotalpages = totalpages;
 	for (i = 0; i < MAX_NR_ZONES; i++)
 		realtotalpages -=
-			zone_absent_pages_in_node(pgdat->node_id, i,
+			zone_absent_pages_in_node(pgdat->node_id, i, 0, 0,
 								zholes_size);
 	pgdat->node_present_pages = realtotalpages;
 	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
 							realtotalpages);
 }
 
+#ifdef CONFIG_ZONE_MOVABLE
+
+unsigned long calc_zone_alignment(unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	return (pfn + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK;
+#else
+	return (pfn + MAX_ORDER_NR_PAGES - 1) & ~(MAX_ORDER_NR_PAGES - 1);
+#endif
+}
+
+
+static void alloc_core_pages_from_low(void)
+{
+	unsigned long nr_pages, start_pfn, end_pfn, pfn;
+	int i, nid;
+	long kcore_pages = kernel_core_pages;
+	for_each_online_node(nid) {
+		for_each_active_range_index_in_nid(i, nid) {
+			start_pfn = early_node_map[i].start_pfn;
+			end_pfn = early_node_map[i].end_pfn;
+			nr_pages = end_pfn - start_pfn;
+			if (nr_pages > kcore_pages) {
+				pfn = start_pfn + kcore_pages;
+				pfn = calc_zone_alignment(pfn);
+				if (pfn < end_pfn) {
+					lowest_movable_pfn[nid] = pfn;
+					kcore_pages = 0;
+					break;
+				} else {
+					kcore_pages = 0;
+				}
+			} else {
+				kcore_pages -= nr_pages;
+			}
+		}
+	}
+	return;
+}
+
+static void split_movable_pages(void)
+{
+	int i, nid;
+	unsigned long total_pages, nr_pages, start_pfn, end_pfn, pfn;
+	long core;
+	for_each_online_node(nid) {
+		lowest_movable_pfn[nid] = 0;
+		pfn = 0;
+		total_pages = 0;
+		for_each_active_range_index_in_nid(i, nid) {
+			start_pfn = early_node_map[i].start_pfn;
+			end_pfn = early_node_map[i].end_pfn;
+			total_pages += end_pfn - start_pfn;
+		}
+		core = total_pages * kernel_core_ratio/100;
+		for_each_active_range_index_in_nid(i, nid) {
+			start_pfn = early_node_map[i].start_pfn;
+			end_pfn = early_node_map[i].end_pfn;
+			nr_pages = end_pfn - start_pfn;
+			if (nr_pages > core) {
+				pfn = start_pfn + core;
+				pfn = calc_zone_alignment(pfn);
+				if (pfn < end_pfn) {
+					lowest_movable_pfn[nid] = pfn;
+					break;
+				} else {
+					core -= nr_pages;
+					if (core < 0)
+						core = 0;
+				}
+			} else {
+				core -= nr_pages;
+			}
+		}
+	}
+	return;
+}
+
+
+static void reserve_movable_pages(void)
+{
+	memset(lowest_movable_pfn, 0, sizeof(lowest_movable_pfn));
+	if (kernel_core_pages) {
+		alloc_core_pages_from_low();
+	} else if (kernel_core_ratio) {
+		split_movable_pages();
+	}
+	return;
+}
+#else
+static void reserve_movable_pages(void)
+{
+	return;
+}
+#endif
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -2768,10 +2911,10 @@ static void __meminit free_area_init_cor
 	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize, memmap_pages;
+		unsigned long size, realsize, memmap_pages, start, end;
 
-		size = zone_spanned_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
+		size = zone_spanned_pages_in_node(nid, j, &start, &end, zones_size);
+		realsize = size - zone_absent_pages_in_node(nid, j, start, end,
 								zholes_size);
 
 		/*
@@ -3065,6 +3208,7 @@ unsigned long __init find_max_pfn_with_a
 	return max_pfn;
 }
 
+
 /**
  * free_area_init_nodes - Initialise all pg_data_t and zone data
  * @max_zone_pfn: an array of max PFNs for each zone
@@ -3127,6 +3271,8 @@ void __init free_area_init_nodes(unsigne
 
 	/* Initialise every node */
 	setup_nr_node_ids();
+	/* setup movable pages */
+	reserve_movable_pages();
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 		free_area_init_node(nid, pgdat, NULL,
@@ -3542,6 +3688,33 @@ void *__init alloc_large_system_hash(con
 	return table;
 }
 
+#ifdef CONFIG_ZONE_MOVABLE
+
+char * __init parse_kernel_core_pages(char *p)
+{
+	unsigned long long coremem;
+	if (!p)
+		return NULL;
+	coremem = memparse(p, &p);
+	kernel_core_pages = coremem >> PAGE_SHIFT;
+	return p;
+}
+
+char * __init parse_kernel_core_ratio(char *p)
+{
+	int ratio[2];
+	ratio[0] = 0;
+	if (!p)
+		return NULL;
+	p = get_options(p, ARRAY_SIZE(ratio), ratio);
+	if (ratio[0])
+		kernel_core_ratio = ratio[1];
+	if (kernel_core_ratio > 100)
+		kernel_core_ratio = 0; /* all memory is not movable */
+	return p;
+}
+#endif /* CONFIG_ZONE_MOVABLE */
+
 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
 struct page *pfn_to_page(unsigned long pfn)
 {
@@ -3555,4 +3728,3 @@ EXPORT_SYMBOL(pfn_to_page);
 EXPORT_SYMBOL(page_to_pfn);
 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
 
-
Index: devel-tree-2.6.20-mm2/Documentation/kernel-parameters.txt
===================================================================
--- devel-tree-2.6.20-mm2.orig/Documentation/kernel-parameters.txt
+++ devel-tree-2.6.20-mm2/Documentation/kernel-parameters.txt
@@ -764,6 +764,17 @@ and is between 256 and 4096 characters. 
 
 	keepinitrd	[HW,ARM]
 
+	kernel_core_pages=nn[KMG] [KNL, BOOT] Divide the whole memory into
+			not-movable and movable parts. Movable memory can be
+			used only for page cache and user data. This option
+			specifies the amount of not-movable pages, called core
+			pages. Core pages are allocated from the lowest addresses.
+
+	kernel_core_ratio=nn [KNL, BOOT] Specifies the amount of core
+			pages (see kernel_core_pages) as a ratio of the
+			total memory. On NUMA, core pages are allocated on
+			each node by this ratio. "0" is not allowed.
+
 	kstack=N	[IA-32,X86-64] Print N words from the kernel stack
 			in oops dumps.
 
Index: devel-tree-2.6.20-mm2/include/linux/mmzone.h
===================================================================
--- devel-tree-2.6.20-mm2.orig/include/linux/mmzone.h
+++ devel-tree-2.6.20-mm2/include/linux/mmzone.h
@@ -608,6 +608,9 @@ int sysctl_min_unmapped_ratio_sysctl_han
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
 
+extern char *parse_kernel_core_pages(char *cp);
+extern char *parse_kernel_core_ratio(char *cp);
+
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
 #ifndef numa_node_id

