* [patch 1/4] pcp: zonequeues
@ 2005-04-09 13:23 Nick Piggin
2005-04-09 13:24 ` [patch 2/4] pcp: dynamic lists Nick Piggin
` (4 more replies)
0 siblings, 5 replies; 10+ messages in thread
From: Nick Piggin @ 2005-04-09 13:23 UTC (permalink / raw)
To: Jack Steiner, Linux Memory Management
[-- Attachment #1: Type: text/plain, Size: 1162 bytes --]
Hi Jack,
Was thinking about some problems in this area, and I hacked up
a possible implementation to improve things.
1/4 switches the per cpu pagesets in struct zone to a single list
of zone pagesets for each CPU.
2/4 changes the per cpu list of pagesets to a list of pointers to
pagesets, and allocates them dynamically.
3/4 changes the code to allow NULL pagesets. In that case, a single
per-zone pageset is used, which is protected by the zone's spinlock.
4/4 changes setup so non local zones don't have associated pagesets.
It still needs some work - in particular, many NUMA systems probably
don't want this. I guess benchmarks should be done, and maybe we
could look at disabling the overhead of 3/4 and functional change of
4/4 depending on a CONFIG_ option.
Also, you say you might want "close" remote nodes to have pagesets,
but 4/4 only does local nodes. I added a comment with patch 4/4
marked with XXX which should allow you to do this quite easily.
Not tested (only compiled) on a NUMA system, but the NULL pagesets
logic appears to work OK. Boots on a small UMA SMP system. So just
be careful with it.
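For reference, the order-0 fast path after the full series ends up
looking roughly like this (condensed from the buffered_rmqueue()
changes in 3/4; the refill details and error paths are trimmed):

    preempt_disable();
    zp = this_zone_pagesets();
    pset = zone_pageset(zp, zone);  /* NULL: no private pageset here */
    if (unlikely(!pset)) {
            locked = 1;
            pset = &zone->pageset;  /* shared per-zone pageset */
            spin_lock_irqsave(&zone->lock, flags);
    } else
            local_irq_save(flags);

    pcp = &pset->pcp[cold];
    if (pcp->count <= pcp->low)
            pcp->count += rmqueue_bulk(zone, 0, pcp->batch,
                                       &pcp->list, locked);
    /* ... take the head of pcp->list as before ... */
    if (unlikely(locked))
            spin_unlock(&zone->lock);
    local_irq_restore(flags);
    preempt_enable();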
Comments?
--
SUSE Labs, Novell Inc.
[-- Attachment #2: pcp-zonequeues.patch --]
[-- Type: text/plain, Size: 12441 bytes --]
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2005-04-09 22:35:25.000000000 +1000
+++ linux-2.6/include/linux/mmzone.h 2005-04-09 22:44:48.000000000 +1000
@@ -53,14 +53,15 @@ struct per_cpu_pages {
struct per_cpu_pageset {
struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
-#ifdef CONFIG_NUMA
+};
+
+struct per_cpu_zone_stats {
unsigned long numa_hit; /* allocated in intended node */
unsigned long numa_miss; /* allocated in non intended node */
unsigned long numa_foreign; /* was intended here, hit elsewhere */
unsigned long interleave_hit; /* interleaver prefered this zone */
unsigned long local_node; /* allocation from local node */
unsigned long other_node; /* allocation from other node */
-#endif
} ____cacheline_aligned_in_smp;
#define ZONE_DMA 0
@@ -113,16 +114,19 @@ struct zone {
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
/*
- * We don't know if the memory that we're going to allocate will be freeable
- * or/and it will be released eventually, so to avoid totally wasting several
- * GB of ram we must reserve some of the lower zone memory (otherwise we risk
- * to run OOM on the lower zones despite there's tons of freeable ram
- * on the higher zones). This array is recalculated at runtime if the
- * sysctl_lowmem_reserve_ratio sysctl changes.
+ * We don't know if the memory that we're going to allocate will be
+ * freeable or/and it will be released eventually, so to avoid totally
+ * wasting several GB of ram we must reserve some of the lower zone
+ * memory (otherwise we risk to run OOM on the lower zones despite
+ * there's tons of freeable ram on the higher zones). This array is
+ * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
+ * changes.
*/
unsigned long lowmem_reserve[MAX_NR_ZONES];
- struct per_cpu_pageset pageset[NR_CPUS];
+#ifdef CONFIG_NUMA
+ struct per_cpu_zone_stats stats[NR_CPUS];
+#endif
/*
* free areas of different sizes
@@ -220,6 +224,8 @@ struct zone {
*/
#define DEF_PRIORITY 12
+#define TOTAL_ZONES (MAX_NUMNODES * MAX_NR_ZONES)
+
/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones, the first one is the 'goal' of the
@@ -232,10 +238,9 @@ struct zone {
* footprint of this construct is very small.
*/
struct zonelist {
- struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
+ struct zone *zones[TOTAL_ZONES + 1]; // NULL delimited
};
-
/*
* The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
* (mostly NUMA machines?) to denote a higher-level memory zone than the
@@ -275,6 +280,7 @@ void __get_zone_counts(unsigned long *ac
void get_zone_counts(unsigned long *active, unsigned long *inactive,
unsigned long *free);
void build_all_zonelists(void);
+void build_percpu_pagelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int alloc_type, int can_try_harder, int gfp_high);
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2005-04-09 22:35:25.000000000 +1000
+++ linux-2.6/mm/page_alloc.c 2005-04-09 22:44:55.000000000 +1000
@@ -69,6 +69,28 @@ EXPORT_SYMBOL(nr_swap_pages);
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);
+struct zone_pagesets {
+ struct per_cpu_pageset p[TOTAL_ZONES];
+};
+
+#define this_zone_pagesets() (&__get_cpu_var(zone_pagesets))
+#define cpu_zone_pagesets(cpu) (&per_cpu(zone_pagesets, (cpu)))
+
+#define zone_pagesets_idx(zone) \
+ (NODEZONE((zone)->zone_pgdat->node_id, zone_idx(zone)))
+
+#define zone_pageset(zp, zone) \
+ (&zp->p[zone_pagesets_idx(zone)])
+
+/*
+ * List of pointers to per_cpu_pagesets for each zone.
+ * XXX: put this comment in a future patch that actually enables NULLs here
+ * It is used as a per-CPU set. A value of NULL in any pointer indicates
+ * this CPU doesn't have a pageset for this zone, and should use the public
+ * pageset.
+ */
+static DEFINE_PER_CPU(struct zone_pagesets, zone_pagesets);
+
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -512,13 +534,14 @@ static int rmqueue_bulk(struct zone *zon
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
+ struct zone_pagesets *zp = cpu_zone_pagesets(cpu);
struct zone *zone;
int i;
+ /* XXX: this can be a for i = 0 .. TOTAL_ZONES loop */
for_each_zone(zone) {
- struct per_cpu_pageset *pset;
+ struct per_cpu_pageset *pset = zone_pageset(zp, zone);
- pset = &zone->pageset[cpu];
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -577,21 +600,22 @@ static void zone_statistics(struct zonel
int cpu;
pg_data_t *pg = z->zone_pgdat;
pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
- struct per_cpu_pageset *p;
+ struct per_cpu_zone_stats *stats;
local_irq_save(flags);
cpu = smp_processor_id();
- p = &z->pageset[cpu];
+ stats = &z->stats[cpu];
+
if (pg == orig) {
- z->pageset[cpu].numa_hit++;
+ stats->numa_hit++;
} else {
- p->numa_miss++;
- zonelist->zones[0]->pageset[cpu].numa_foreign++;
+ stats->numa_miss++;
+ zonelist->zones[0]->stats[cpu].numa_foreign++;
}
if (pg == NODE_DATA(numa_node_id()))
- p->local_node++;
+ stats->local_node++;
else
- p->other_node++;
+ stats->other_node++;
local_irq_restore(flags);
#endif
}
@@ -602,6 +626,7 @@ static void zone_statistics(struct zonel
static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
+ struct zone_pagesets *zp;
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
@@ -613,14 +638,17 @@ static void fastcall free_hot_cold_page(
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+
+ preempt_disable();
+ zp = this_zone_pagesets();
+ pcp = &zone_pageset(zp, zone)->pcp[cold];
local_irq_save(flags);
if (pcp->count >= pcp->high)
pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
list_add(&page->lru, &pcp->list);
pcp->count++;
local_irq_restore(flags);
- put_cpu();
+ preempt_enable();
}
void fastcall free_hot_page(struct page *page)
@@ -655,9 +683,13 @@ buffered_rmqueue(struct zone *zone, int
int cold = !!(gfp_flags & __GFP_COLD);
if (order == 0) {
+ struct zone_pagesets *zp;
struct per_cpu_pages *pcp;
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ preempt_disable();
+ zp = this_zone_pagesets();
+ pcp = &zone_pageset(zp, zone)->pcp[cold];
+
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
@@ -668,7 +700,7 @@ buffered_rmqueue(struct zone *zone, int
pcp->count--;
}
local_irq_restore(flags);
- put_cpu();
+ preempt_enable();
}
if (page == NULL) {
@@ -1225,13 +1257,15 @@ void show_free_areas(void)
} else
printk("\n");
- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ for_each_cpu(cpu) {
+ struct zone_pagesets *zp;
struct per_cpu_pageset *pageset;
if (!cpu_possible(cpu))
continue;
- pageset = zone->pageset + cpu;
+ zp = cpu_zone_pagesets(cpu);
+ pageset = zone_pageset(zp, zone);
for (temperature = 0; temperature < 2; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1511,6 +1545,62 @@ void __init build_all_zonelists(void)
cpuset_init_current_mems_allowed();
}
+void __init build_percpu_pagelists(void)
+{
+ pg_data_t *pgdat;
+
+ for_each_pgdat(pgdat) {
+ int j;
+ int nid = pgdat->node_id;
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = pgdat->node_zones + j;
+ int cpu;
+ unsigned long batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of
+ * the size of the zone. But no more than 1/4 of a meg
+ * - there's no point in going beyond the size of L2
+ * cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct zone_pagesets *zp;
+ struct per_cpu_pageset *pageset;
+ struct per_cpu_pages *pcp;
+
+ zp = cpu_zone_pagesets(cpu);
+ pageset = &zp->p[NODEZONE(nid, j)];
+
+ pcp = &pageset->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &pageset->pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
+ zone_names[j], zone->present_pages, batch);
+ }
+ }
+}
+
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
@@ -1626,7 +1716,7 @@ static void __init free_area_init_core(s
{
unsigned long i, j;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
- int cpu, nid = pgdat->node_id;
+ int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
pgdat->nr_zones = 0;
@@ -1636,7 +1726,6 @@ static void __init free_area_init_core(s
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize;
- unsigned long batch;
zone_table[NODEZONE(nid, j)] = zone;
realsize = size = zones_size[j];
@@ -1657,39 +1746,6 @@ static void __init free_area_init_core(s
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- /*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
- *
- * OK, so we don't know how big the cache is. So guess.
- */
- batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- struct per_cpu_pages *pcp;
-
- pcp = &zone->pageset[cpu].pcp[0]; /* hot */
- pcp->count = 0;
- pcp->low = 2 * batch;
- pcp->high = 6 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &zone->pageset[cpu].pcp[1]; /* cold */
- pcp->count = 0;
- pcp->low = 0;
- pcp->high = 2 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
- }
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
zone->nr_scan_active = 0;
@@ -1720,7 +1776,6 @@ static void __init free_area_init_core(s
if ((zone_start_pfn) & (zone_required_alignment-1))
printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
-
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c 2005-04-09 22:35:25.000000000 +1000
+++ linux-2.6/init/main.c 2005-04-09 22:35:44.000000000 +1000
@@ -454,6 +454,7 @@ asmlinkage void __init start_kernel(void
*/
preempt_disable();
build_all_zonelists();
+ build_percpu_pagelists();
page_alloc_init();
printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line);
parse_early_param();
Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c 2005-04-09 22:35:25.000000000 +1000
+++ linux-2.6/mm/mempolicy.c 2005-04-09 22:35:44.000000000 +1000
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleav
zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
- zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+ zl->zones[0]->stats[get_cpu()].interleave_hit++;
put_cpu();
}
return page;
* [patch 2/4] pcp: dynamic lists
2005-04-09 13:23 [patch 1/4] pcp: zonequeues Nick Piggin
@ 2005-04-09 13:24 ` Nick Piggin
2005-04-09 13:24 ` [patch 3/4] pcp: NULL pagesets Nick Piggin
2005-04-09 13:28 ` [patch 1/4] pcp: zonequeues Nick Piggin
` (3 subsequent siblings)
4 siblings, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2005-04-09 13:24 UTC (permalink / raw)
To: Jack Steiner; +Cc: Linux Memory Management
[-- Attachment #1: Type: text/plain, Size: 32 bytes --]
2/4
--
SUSE Labs, Novell Inc.
[-- Attachment #2: pcp-dynamic-lists.patch --]
[-- Type: text/plain, Size: 1108 bytes --]
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2005-04-09 22:35:44.000000000 +1000
+++ linux-2.6/mm/page_alloc.c 2005-04-09 22:43:58.000000000 +1000
@@ -70,7 +70,7 @@ struct zone *zone_table[1 << (ZONES_SHIF
EXPORT_SYMBOL(zone_table);
struct zone_pagesets {
- struct per_cpu_pageset p[TOTAL_ZONES];
+ struct per_cpu_pageset *p[TOTAL_ZONES];
};
#define this_zone_pagesets() (&__get_cpu_var(zone_pagesets))
@@ -80,7 +80,7 @@ struct zone_pagesets {
(NODEZONE((zone)->zone_pgdat->node_id, zone_idx(zone)))
#define zone_pageset(zp, zone) \
- (&zp->p[zone_pagesets_idx(zone)])
+ (zp->p[zone_pagesets_idx(zone)])
/*
* List of pointers to per_cpu_pagesets for each zone.
@@ -1579,7 +1579,8 @@ void __init build_percpu_pagelists(void)
struct per_cpu_pages *pcp;
zp = cpu_zone_pagesets(cpu);
- pageset = &zp->p[NODEZONE(nid, j)];
+ pageset = alloc_bootmem_node(pgdat, sizeof(*pageset));
+ zp->p[NODEZONE(nid, j)] = pageset;
pcp = &pageset->pcp[0]; /* hot */
pcp->count = 0;
* [patch 3/4] pcp: NULL pagesets
2005-04-09 13:24 ` [patch 2/4] pcp: dynamic lists Nick Piggin
@ 2005-04-09 13:24 ` Nick Piggin
2005-04-09 13:25 ` [patch 4/4] pcp: only local pagesets Nick Piggin
0 siblings, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2005-04-09 13:24 UTC (permalink / raw)
To: Jack Steiner; +Cc: Linux Memory Management
[-- Attachment #1: Type: text/plain, Size: 4 bytes --]
3/4
[-- Attachment #2: pcp-null-pagesets.patch --]
[-- Type: text/plain, Size: 7400 bytes --]
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2005-04-09 22:44:48.000000000 +1000
+++ linux-2.6/include/linux/mmzone.h 2005-04-09 22:45:07.000000000 +1000
@@ -128,6 +128,9 @@ struct zone {
struct per_cpu_zone_stats stats[NR_CPUS];
#endif
+ /* global pageset for CPUs without private pagesets for this zone */
+ struct per_cpu_pageset pageset;
+
/*
* free areas of different sizes
*/
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2005-04-09 22:45:06.000000000 +1000
+++ linux-2.6/mm/page_alloc.c 2005-04-09 22:45:07.000000000 +1000
@@ -84,7 +84,6 @@ struct zone_pagesets {
/*
* List of pointers to per_cpu_pagesets for each zone.
- * XXX: put this comment in a future patch that actually enables NULLs here
* It is used as a per-CPU set. A value of NULL in any pointer indicates
* this CPU doesn't have a pageset for this zone, and should use the public
* pageset.
@@ -363,13 +362,14 @@ static inline void free_pages_check(cons
*/
static int
free_pages_bulk(struct zone *zone, int count,
- struct list_head *list, unsigned int order)
+ struct list_head *list, unsigned int order, const int locked)
{
- unsigned long flags;
+ unsigned long flags = 0; /* shut up gcc */
struct page *page = NULL;
int ret = 0;
- spin_lock_irqsave(&zone->lock, flags);
+ if (likely(!locked))
+ spin_lock_irqsave(&zone->lock, flags);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
while (!list_empty(list) && count--) {
@@ -379,7 +379,8 @@ free_pages_bulk(struct zone *zone, int c
__free_pages_bulk(page, zone, order);
ret++;
}
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (likely(!locked))
+ spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
@@ -402,7 +403,7 @@ void __free_pages_ok(struct page *page,
free_pages_check(__FUNCTION__, page + i);
list_add(&page->lru, &list);
kernel_map_pages(page, 1<<order, 0);
- free_pages_bulk(page_zone(page), 1, &list, order);
+ free_pages_bulk(page_zone(page), 1, &list, order, 0);
}
@@ -512,14 +513,15 @@ static struct page *__rmqueue(struct zon
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list, const int locked)
{
- unsigned long flags;
+ unsigned long flags = 0; /* shut up gcc */
int i;
int allocated = 0;
struct page *page;
- spin_lock_irqsave(&zone->lock, flags);
+ if (likely(!locked))
+ spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
page = __rmqueue(zone, order);
if (page == NULL)
@@ -527,7 +529,8 @@ static int rmqueue_bulk(struct zone *zon
allocated++;
list_add_tail(&page->lru, list);
}
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (likely(!locked))
+ spin_unlock_irqrestore(&zone->lock, flags);
return allocated;
}
@@ -541,13 +544,15 @@ static void __drain_pages(unsigned int c
/* XXX: this can be a for i = 0 .. TOTAL_ZONES loop */
for_each_zone(zone) {
struct per_cpu_pageset *pset = zone_pageset(zp, zone);
+ if (unlikely(!pset))
+ continue;
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
+ &pcp->list, 0, 0);
}
}
}
@@ -627,9 +632,11 @@ static void FASTCALL(free_hot_cold_page(
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
struct zone_pagesets *zp;
- struct zone *zone = page_zone(page);
+ struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
+ struct zone *zone = page_zone(page);
unsigned long flags;
+ int locked = 0;
arch_free_page(page, 0);
@@ -641,12 +648,23 @@ static void fastcall free_hot_cold_page(
preempt_disable();
zp = this_zone_pagesets();
- pcp = &zone_pageset(zp, zone)->pcp[cold];
- local_irq_save(flags);
+ pset = zone_pageset(zp, zone);
+ if (unlikely(!pset)) {
+ locked = 1;
+ pset = &zone->pageset;
+ spin_lock_irqsave(&zone->lock, flags);
+ } else
+ local_irq_save(flags);
+
+ pcp = &pset->pcp[cold];
if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list,
+ 0, locked);
list_add(&page->lru, &pcp->list);
pcp->count++;
+
+ if (unlikely(locked))
+ spin_unlock(&zone->lock);
local_irq_restore(flags);
preempt_enable();
}
@@ -683,22 +701,33 @@ buffered_rmqueue(struct zone *zone, int
int cold = !!(gfp_flags & __GFP_COLD);
if (order == 0) {
+ int locked = 0;
struct zone_pagesets *zp;
+ struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
preempt_disable();
zp = this_zone_pagesets();
- pcp = &zone_pageset(zp, zone)->pcp[cold];
+ pset = zone_pageset(zp, zone);
+ if (unlikely(!pset)) {
+ locked = 1;
+ pset = &zone->pageset;
+ spin_lock_irqsave(&zone->lock, flags);
+ } else
+ local_irq_save(flags);
+
+ pcp = &pset->pcp[cold];
- local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list, locked);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
pcp->count--;
}
+ if (unlikely(locked))
+ spin_unlock(&zone->lock);
local_irq_restore(flags);
preempt_enable();
}
@@ -1266,6 +1295,8 @@ void show_free_areas(void)
zp = cpu_zone_pagesets(cpu);
pageset = zone_pageset(zp, zone);
+ if (!pageset)
+ continue;
for (temperature = 0; temperature < 2; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1545,6 +1576,25 @@ void __init build_all_zonelists(void)
cpuset_init_current_mems_allowed();
}
+static void __init init_percpu_pageset(struct per_cpu_pageset *pset, int batch)
+{
+ struct per_cpu_pages *pcp;
+
+ pcp = &pset->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &pset->pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+}
+
void __init build_percpu_pagelists(void)
{
pg_data_t *pgdat;
@@ -1573,28 +1623,16 @@ void __init build_percpu_pagelists(void)
if (batch < 1)
batch = 1;
+ init_percpu_pageset(&zone->pageset, batch);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
struct zone_pagesets *zp;
struct per_cpu_pageset *pageset;
- struct per_cpu_pages *pcp;
zp = cpu_zone_pagesets(cpu);
pageset = alloc_bootmem_node(pgdat, sizeof(*pageset));
+ init_percpu_pageset(pageset, batch);
zp->p[NODEZONE(nid, j)] = pageset;
- pcp = &pageset->pcp[0]; /* hot */
- pcp->count = 0;
- pcp->low = 2 * batch;
- pcp->high = 6 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &pageset->pcp[1]; /* cold */
- pcp->count = 0;
- pcp->low = 0;
- pcp->high = 2 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], zone->present_pages, batch);
* [patch 4/4] pcp: only local pagesets
2005-04-09 13:24 ` [patch 3/4] pcp: NULL pagesets Nick Piggin
@ 2005-04-09 13:25 ` Nick Piggin
0 siblings, 0 replies; 10+ messages in thread
From: Nick Piggin @ 2005-04-09 13:25 UTC (permalink / raw)
To: Jack Steiner; +Cc: Linux Memory Management
[-- Attachment #1: Type: text/plain, Size: 32 bytes --]
4/4
--
SUSE Labs, Novell Inc.
[-- Attachment #2: pcp-only-local-pagesets.patch --]
[-- Type: text/plain, Size: 1171 bytes --]
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2005-04-09 22:45:07.000000000 +1000
+++ linux-2.6/mm/page_alloc.c 2005-04-09 23:13:53.000000000 +1000
@@ -1626,14 +1626,24 @@ void __init build_percpu_pagelists(void)
init_percpu_pageset(&zone->pageset, batch);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
struct zone_pagesets *zp;
- struct per_cpu_pageset *pageset;
+ struct per_cpu_pageset *pageset = NULL;
zp = cpu_zone_pagesets(cpu);
- pageset = alloc_bootmem_node(pgdat, sizeof(*pageset));
- init_percpu_pageset(pageset, batch);
+
+ /*
+ * XXX: this test could be something like
+ * if (node_distance <= blah)
+ * which would allow pagesets on close
+ * remote nodes as well as the local node.
+ */
+ if (cpu_to_node(cpu) == nid) {
+ pageset = alloc_bootmem_node(pgdat,
+ sizeof(*pageset));
+ init_percpu_pageset(pageset, batch);
+ }
zp->p[NODEZONE(nid, j)] = pageset;
-
}
+
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], zone->present_pages, batch);
}
* Re: [patch 1/4] pcp: zonequeues
2005-04-09 13:23 [patch 1/4] pcp: zonequeues Nick Piggin
2005-04-09 13:24 ` [patch 2/4] pcp: dynamic lists Nick Piggin
@ 2005-04-09 13:28 ` Nick Piggin
2005-04-09 15:25 ` Nick Piggin
` (2 subsequent siblings)
4 siblings, 0 replies; 10+ messages in thread
From: Nick Piggin @ 2005-04-09 13:28 UTC (permalink / raw)
To: Jack Steiner; +Cc: Linux Memory Management
Nick Piggin wrote:
> Not tested (only compiled) on a NUMA system, but the NULL pagesets
> logic appears to work OK. Boots on a small UMA SMP system. So just
> be careful with it.
>
> Comments?
>
Oh, and you may want to look at increasing the pageset and batch
sizes if you run this on a bigger system. Now that there are far
fewer pagesets, larger queues might help lock contention and
allocation efficiency.
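For instance (a sketch only - the new cap and divisor below are
made-up numbers, not values from the series), the sizing loop in
build_percpu_pagelists() could be scaled up along these lines:

    /* Hypothetical resizing; the 1MB cap and /512 are placeholders. */
    batch = zone->present_pages / 512;          /* was /1024 */
    if (batch * PAGE_SIZE > 1024 * 1024)        /* was 256KB */
            batch = (1024 * 1024) / PAGE_SIZE;
    batch /= 4;                                 /* we effectively *= 4 below */
    if (batch < 1)
            batch = 1;
    init_percpu_pageset(pageset, batch);        /* hot: low=2*batch, high=6*batch */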
--
SUSE Labs, Novell Inc.
* Re: [patch 1/4] pcp: zonequeues
2005-04-09 13:23 [patch 1/4] pcp: zonequeues Nick Piggin
2005-04-09 13:24 ` [patch 2/4] pcp: dynamic lists Nick Piggin
2005-04-09 13:28 ` [patch 1/4] pcp: zonequeues Nick Piggin
@ 2005-04-09 15:25 ` Nick Piggin
2005-04-12 16:15 ` Jack Steiner
2005-04-12 19:02 ` Christoph Lameter
4 siblings, 0 replies; 10+ messages in thread
From: Nick Piggin @ 2005-04-09 15:25 UTC (permalink / raw)
To: Jack Steiner; +Cc: Linux Memory Management
Nick Piggin wrote:
> Hi Jack,
> Was thinking about some problems in this area, and I hacked up
> a possible implementation to improve things.
>
> 1/4 switches the per cpu pagesets in struct zone to a single list
> of zone pagesets for each CPU.
>
Just thinking out loud here... this patch (or something like it)
would probably be a good idea regardless of the remote pageset
removal patches following it.
There shouldn't be any change in behaviour, but it gives you remote
pagesets in local memory, and hopefully better cache behaviour thanks
to less packing and to the use of percpu.
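For reference, the layout change in 1/4 boils down to replacing the
NR_CPUS array embedded in each zone with a per-CPU array indexed by
zone, so each CPU's queues sit in its own node's percpu area:

    /* before 1/4: every CPU's queues live in the zone's own node */
    struct zone {
            /* ... other fields ... */
            struct per_cpu_pageset pageset[NR_CPUS];
    };

    /* after 1/4: one pageset per zone, in each CPU's local percpu area */
    struct zone_pagesets {
            struct per_cpu_pageset p[TOTAL_ZONES]; /* MAX_NUMNODES * MAX_NR_ZONES */
    };
    static DEFINE_PER_CPU(struct zone_pagesets, zone_pagesets);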
But...
> +
> +struct per_cpu_zone_stats {
> unsigned long numa_hit; /* allocated in intended node */
> unsigned long numa_miss; /* allocated in non intended node */
> unsigned long numa_foreign; /* was intended here, hit elsewhere */
> unsigned long interleave_hit; /* interleaver prefered this zone */
> unsigned long local_node; /* allocation from local node */
> unsigned long other_node; /* allocation from other node */
> -#endif
> } ____cacheline_aligned_in_smp;
>
> #define ZONE_DMA 0
> @@ -113,16 +114,19 @@ struct zone {
> unsigned long free_pages;
> unsigned long pages_min, pages_low, pages_high;
> /*
> - * We don't know if the memory that we're going to allocate will be freeable
> - * or/and it will be released eventually, so to avoid totally wasting several
> - * GB of ram we must reserve some of the lower zone memory (otherwise we risk
> - * to run OOM on the lower zones despite there's tons of freeable ram
> - * on the higher zones). This array is recalculated at runtime if the
> - * sysctl_lowmem_reserve_ratio sysctl changes.
> + * We don't know if the memory that we're going to allocate will be
> + * freeable or/and it will be released eventually, so to avoid totally
> + * wasting several GB of ram we must reserve some of the lower zone
> + * memory (otherwise we risk to run OOM on the lower zones despite
> + * there's tons of freeable ram on the higher zones). This array is
> + * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
> + * changes.
> */
> unsigned long lowmem_reserve[MAX_NR_ZONES];
>
> - struct per_cpu_pageset pageset[NR_CPUS];
> +#ifdef CONFIG_NUMA
> + struct per_cpu_zone_stats stats[NR_CPUS];
> +#endif
>
I wonder if this stats information should move into the per-CPU
pageset, in local memory, as well? I initially moved it into its own
structure so that the zone queues could be completely confined to
page_alloc.c.
--
SUSE Labs, Novell Inc.
* Re: [patch 1/4] pcp: zonequeues
2005-04-09 13:23 [patch 1/4] pcp: zonequeues Nick Piggin
` (2 preceding siblings ...)
2005-04-09 15:25 ` Nick Piggin
@ 2005-04-12 16:15 ` Jack Steiner
2005-04-13 1:34 ` Nick Piggin
2005-04-12 19:02 ` Christoph Lameter
4 siblings, 1 reply; 10+ messages in thread
From: Jack Steiner @ 2005-04-12 16:15 UTC (permalink / raw)
To: Nick Piggin; +Cc: Linux Memory Management
On Sat, Apr 09, 2005 at 11:23:24PM +1000, Nick Piggin wrote:
> Hi Jack,
> Was thinking about some problems in this area, and I hacked up
> a possible implementation to improve things.
>
> 1/4 switches the per cpu pagesets in struct zone to a single list
> of zone pagesets for each CPU.
>
> 2/4 changes the per cpu list of pagesets to a list of pointers to
> pagesets, and allocates them dynamically.
>
> 3/4 changes the code to allow NULL pagesets. In that case, a single
> per-zone pageset is used, which is protected by the zone's spinlock.
>
> 4/4 changes setup so non local zones don't have associated pagesets.
>
> It still needs some work - in particular, many NUMA systems probably
> don't want this. I guess benchmarks should be done, and maybe we
> could look at disabling the overhead of 3/4 and functional change of
> 4/4 depending on a CONFIG_ option.
>
> Also, you say you might want "close" remote nodes to have pagesets,
> but 4/4 only does local nodes. I added a comment with patch 4/4
> marked with XXX which should allow you to do this quite easily.
>
> Not tested (only compiled) on a NUMA system, but the NULL pagesets
> logic appears to work OK. Boots on a small UMA SMP system. So just
> be careful with it.
>
> Comments?
>
Nick
I tested the patches. I found one spot with the NUMA statistics that
was missed, but everything else looks fine. The patches fix both
problems that I found - bad coloring & excessive pages in pagesets.
Signed-off-by: Jack Steiner <steiner@sgi.com>
Index: linux/drivers/base/node.c
===================================================================
--- linux.orig/drivers/base/node.c 2005-04-07 15:12:14.750749661 -0500
+++ linux/drivers/base/node.c 2005-04-12 10:54:45.324306797 -0500
@@ -87,7 +87,7 @@ static ssize_t node_read_numastat(struct
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *z = &pg->node_zones[i];
for (cpu = 0; cpu < NR_CPUS; cpu++) {
- struct per_cpu_pageset *ps = &z->pageset[cpu];
+ struct per_cpu_zone_stats *ps = &z->stats[cpu];
numa_hit += ps->numa_hit;
numa_miss += ps->numa_miss;
numa_foreign += ps->numa_foreign;
--
Thanks
Jack Steiner (steiner@sgi.com) 651-683-5302
Principal Engineer SGI - Silicon Graphics, Inc.
* Re: [patch 1/4] pcp: zonequeues
2005-04-09 13:23 [patch 1/4] pcp: zonequeues Nick Piggin
` (3 preceding siblings ...)
2005-04-12 16:15 ` Jack Steiner
@ 2005-04-12 19:02 ` Christoph Lameter
2005-04-13 1:40 ` Nick Piggin
4 siblings, 1 reply; 10+ messages in thread
From: Christoph Lameter @ 2005-04-12 19:02 UTC (permalink / raw)
To: Nick Piggin; +Cc: Jack Steiner, Linux Memory Management
It seems that this also effectively addresses the issues raised with
the pageset localization patches. Great work, Nick!
On Sat, 9 Apr 2005, Nick Piggin wrote:
> Hi Jack,
> Was thinking about some problems in this area, and I hacked up
> a possible implementation to improve things.
>
> 1/4 switches the per cpu pagesets in struct zone to a single list
> of zone pagesets for each CPU.
>
> 2/4 changes the per cpu list of pagesets to a list of pointers to
> pagesets, and allocates them dynamically.
>
> 3/4 changes the code to allow NULL pagesets. In that case, a single
> per-zone pageset is used, which is protected by the zone's spinlock.
>
> 4/4 changes setup so non local zones don't have associated pagesets.
>
> It still needs some work - in particular, many NUMA systems probably
> don't want this. I guess benchmarks should be done, and maybe we
> could look at disabling the overhead of 3/4 and functional change of
> 4/4 depending on a CONFIG_ option.
>
> Also, you say you might want "close" remote nodes to have pagesets,
> but 4/4 only does local nodes. I added a comment with patch 4/4
> marked with XXX which should allow you to do this quite easily.
>
> Not tested (only compiled) on a NUMA system, but the NULL pagesets
> logic appears to work OK. Boots on a small UMA SMP system. So just
> be careful with it.
>
> Comments?
>
> --
> SUSE Labs, Novell Inc.
>
* Re: [patch 1/4] pcp: zonequeues
2005-04-12 16:15 ` Jack Steiner
@ 2005-04-13 1:34 ` Nick Piggin
0 siblings, 0 replies; 10+ messages in thread
From: Nick Piggin @ 2005-04-13 1:34 UTC (permalink / raw)
To: Jack Steiner; +Cc: Linux Memory Management
Jack Steiner wrote:
> On Sat, Apr 09, 2005 at 11:23:24PM +1000, Nick Piggin wrote:
>>Comments?
>>
>
>
> Nick
>
> I tested the patch. I found one spot that was missed with the NUMA
> statistics but everything else looks fine. The patches fix both problems
> that I found - bad coloring & excessive pages in pagesets.
>
Thanks. I'll think about how to make them more acceptable
for merging.
--
SUSE Labs, Novell Inc.
* Re: [patch 1/4] pcp: zonequeues
2005-04-12 19:02 ` Christoph Lameter
@ 2005-04-13 1:40 ` Nick Piggin
0 siblings, 0 replies; 10+ messages in thread
From: Nick Piggin @ 2005-04-13 1:40 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Jack Steiner, Linux Memory Management
Christoph Lameter wrote:
> Seems that this also effectively addresses the issues raised with the
> pageset localization patches. Great work Nick!
>
I'd be interested to know what performance and lock contention
look like on your larger systems, because we're using the zone
lock for the shared remote pageset... and there's only one of
those per zone.

Your interleaved pagecache allocation policy should be a good
brute-force benchmark - just have one or two processes on each
node allocating pagecache pages (e.g. from reading huge sparse
files).
The other thing is, you may want to look at adjusting the
criteria for falling back to the shared pageset. It might be
helpful to keep per-cpu pagesets for nearby remote nodes...
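Concretely, that would mean replacing the cpu_to_node(cpu) == nid
test in 4/4 with something distance based, along these lines (the
threshold macro is a placeholder, not defined anywhere):

    struct per_cpu_pageset *pageset = NULL;

    /* hypothetical variant of the test in build_percpu_pagelists() */
    if (node_distance(cpu_to_node(cpu), nid) <= PCP_PAGESET_DISTANCE) {
            pageset = alloc_bootmem_node(pgdat, sizeof(*pageset));
            init_percpu_pageset(pageset, batch);
    }
    zp->p[NODEZONE(nid, j)] = pageset;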
Nick
--
SUSE Labs, Novell Inc.