From: Nick Piggin <nickpiggin@yahoo.com.au>
To: Jack Steiner <steiner@sgi.com>,
Linux Memory Management <linux-mm@kvack.org>
Subject: [patch 1/4] pcp: zonequeues
Date: Sat, 09 Apr 2005 23:23:24 +1000
Message-ID: <4257D74C.3010703@yahoo.com.au>
[-- Attachment #1: Type: text/plain, Size: 1162 bytes --]
Hi Jack,

I was thinking about some problems in this area, and hacked up a
possible implementation to improve things.

1/4 switches the per-cpu pagesets in struct zone to a single list of
zone pagesets for each CPU.

2/4 changes the per-cpu list of pagesets to a list of pointers to
pagesets, and allocates them dynamically.

3/4 changes the code to allow NULL pagesets. In that case, a single
per-zone pageset is used, which is protected by the zone's spinlock.

4/4 changes setup so non-local zones don't have associated pagesets.
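
To make the shape of 1/4 a bit clearer, here is the core layout change
in rough outline (just a condensed view of the attached patch, nothing
beyond what it already does):

/* Before: each zone carries one pageset per possible CPU. */
struct zone {
	...
	struct per_cpu_pageset pageset[NR_CPUS];
	...
};

/* After: each CPU carries one pageset per possible zone. */
struct zone_pagesets {
	struct per_cpu_pageset p[TOTAL_ZONES];	/* MAX_NUMNODES * MAX_NR_ZONES */
};
static DEFINE_PER_CPU(struct zone_pagesets, zone_pagesets);

/* so lookups go from  &zone->pageset[cpu]  to: */
pcp = &zone_pageset(this_zone_pagesets(), zone)->pcp[cold];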

It still needs some work - in particular, many NUMA systems probably
don't want this. I guess benchmarks should be done, and maybe we could
look at hiding the overhead of 3/4 and the functional change of 4/4
behind a CONFIG_ option.

Also, you say you might want "close" remote nodes to have pagesets,
but 4/4 only does local nodes. I added a comment in patch 4/4, marked
with XXX, which should let you do this quite easily - a rough sketch
of what I mean follows.
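
Something like this at that XXX point is what I had in mind. Purely
illustrative - 4/4 isn't in this mail, and node_is_close() and
alloc_zone_pageset() are made-up placeholders for whatever distance
test and allocation helper you end up using:

for (cpu = 0; cpu < NR_CPUS; cpu++) {
	int cpu_nid = cpu_to_node(cpu);

	/* XXX: widen this test to also cover "close" remote nodes */
	if (cpu_nid == nid || node_is_close(cpu_nid, nid))
		alloc_zone_pageset(cpu, zone);	/* hypothetical helper */
}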

Not tested (only compiled) on a NUMA system, but the NULL pagesets
logic appears to work OK. Boots on a small UMA SMP system. So just
be careful with it.

Comments?
--
SUSE Labs, Novell Inc.
[-- Attachment #2: pcp-zonequeues.patch --]
[-- Type: text/plain, Size: 12441 bytes --]
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2005-04-09 22:35:25.000000000 +1000
+++ linux-2.6/include/linux/mmzone.h 2005-04-09 22:44:48.000000000 +1000
@@ -53,14 +53,15 @@ struct per_cpu_pages {
struct per_cpu_pageset {
struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
-#ifdef CONFIG_NUMA
+};
+
+struct per_cpu_zone_stats {
unsigned long numa_hit; /* allocated in intended node */
unsigned long numa_miss; /* allocated in non intended node */
unsigned long numa_foreign; /* was intended here, hit elsewhere */
unsigned long interleave_hit; /* interleaver prefered this zone */
unsigned long local_node; /* allocation from local node */
unsigned long other_node; /* allocation from other node */
-#endif
} ____cacheline_aligned_in_smp;
#define ZONE_DMA 0
@@ -113,16 +114,19 @@ struct zone {
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
/*
- * We don't know if the memory that we're going to allocate will be freeable
- * or/and it will be released eventually, so to avoid totally wasting several
- * GB of ram we must reserve some of the lower zone memory (otherwise we risk
- * to run OOM on the lower zones despite there's tons of freeable ram
- * on the higher zones). This array is recalculated at runtime if the
- * sysctl_lowmem_reserve_ratio sysctl changes.
+ * We don't know if the memory that we're going to allocate will be
+ * freeable or/and it will be released eventually, so to avoid totally
+ * wasting several GB of ram we must reserve some of the lower zone
+ * memory (otherwise we risk to run OOM on the lower zones despite
+ * there's tons of freeable ram on the higher zones). This array is
+ * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
+ * changes.
*/
unsigned long lowmem_reserve[MAX_NR_ZONES];
- struct per_cpu_pageset pageset[NR_CPUS];
+#ifdef CONFIG_NUMA
+ struct per_cpu_zone_stats stats[NR_CPUS];
+#endif
/*
* free areas of different sizes
@@ -220,6 +224,8 @@ struct zone {
*/
#define DEF_PRIORITY 12
+#define TOTAL_ZONES (MAX_NUMNODES * MAX_NR_ZONES)
+
/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones, the first one is the 'goal' of the
@@ -232,10 +238,9 @@ struct zone {
* footprint of this construct is very small.
*/
struct zonelist {
- struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
+ struct zone *zones[TOTAL_ZONES + 1]; // NULL delimited
};
-
/*
* The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
* (mostly NUMA machines?) to denote a higher-level memory zone than the
@@ -275,6 +280,7 @@ void __get_zone_counts(unsigned long *ac
void get_zone_counts(unsigned long *active, unsigned long *inactive,
unsigned long *free);
void build_all_zonelists(void);
+void build_percpu_pagelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int alloc_type, int can_try_harder, int gfp_high);
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2005-04-09 22:35:25.000000000 +1000
+++ linux-2.6/mm/page_alloc.c 2005-04-09 22:44:55.000000000 +1000
@@ -69,6 +69,28 @@ EXPORT_SYMBOL(nr_swap_pages);
struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
EXPORT_SYMBOL(zone_table);
+struct zone_pagesets {
+ struct per_cpu_pageset p[TOTAL_ZONES];
+};
+
+#define this_zone_pagesets() (&__get_cpu_var(zone_pagesets))
+#define cpu_zone_pagesets(cpu) (&per_cpu(zone_pagesets, (cpu)))
+
+#define zone_pagesets_idx(zone) \
+ (NODEZONE((zone)->zone_pgdat->node_id, zone_idx(zone)))
+
+#define zone_pageset(zp, zone) \
+ (&zp->p[zone_pagesets_idx(zone)])
+
+/*
+ * List of pointers to per_cpu_pagesets for each zone.
+ * XXX: put this comment in a future patch that actually enables NULLs here
+ * It is used as a per-CPU set. A value of NULL in any pointer indicates
+ * this CPU doesn't have a pageset for this zone, and should use the public
+ * pageset.
+ */
+static DEFINE_PER_CPU(struct zone_pagesets, zone_pagesets);
+
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
@@ -512,13 +534,14 @@ static int rmqueue_bulk(struct zone *zon
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
+ struct zone_pagesets *zp = cpu_zone_pagesets(cpu);
struct zone *zone;
int i;
+ /* XXX: this can be a for i = 0 .. TOTAL_ZONES loop */
for_each_zone(zone) {
- struct per_cpu_pageset *pset;
+ struct per_cpu_pageset *pset = zone_pageset(zp, zone);
- pset = &zone->pageset[cpu];
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -577,21 +600,22 @@ static void zone_statistics(struct zonel
int cpu;
pg_data_t *pg = z->zone_pgdat;
pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
- struct per_cpu_pageset *p;
+ struct per_cpu_zone_stats *stats;
local_irq_save(flags);
cpu = smp_processor_id();
- p = &z->pageset[cpu];
+ stats = &z->stats[cpu];
+
if (pg == orig) {
- z->pageset[cpu].numa_hit++;
+ stats->numa_hit++;
} else {
- p->numa_miss++;
- zonelist->zones[0]->pageset[cpu].numa_foreign++;
+ stats->numa_miss++;
+ zonelist->zones[0]->stats[cpu].numa_foreign++;
}
if (pg == NODE_DATA(numa_node_id()))
- p->local_node++;
+ stats->local_node++;
else
- p->other_node++;
+ stats->other_node++;
local_irq_restore(flags);
#endif
}
@@ -602,6 +626,7 @@ static void zone_statistics(struct zonel
static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
+ struct zone_pagesets *zp;
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
@@ -613,14 +638,17 @@ static void fastcall free_hot_cold_page(
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+
+ preempt_disable();
+ zp = this_zone_pagesets();
+ pcp = &zone_pageset(zp, zone)->pcp[cold];
local_irq_save(flags);
if (pcp->count >= pcp->high)
pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
list_add(&page->lru, &pcp->list);
pcp->count++;
local_irq_restore(flags);
- put_cpu();
+ preempt_enable();
}
void fastcall free_hot_page(struct page *page)
@@ -655,9 +683,13 @@ buffered_rmqueue(struct zone *zone, int
int cold = !!(gfp_flags & __GFP_COLD);
if (order == 0) {
+ struct zone_pagesets *zp;
struct per_cpu_pages *pcp;
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ preempt_disable();
+ zp = this_zone_pagesets();
+ pcp = &zone_pageset(zp, zone)->pcp[cold];
+
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
@@ -668,7 +700,7 @@ buffered_rmqueue(struct zone *zone, int
pcp->count--;
}
local_irq_restore(flags);
- put_cpu();
+ preempt_enable();
}
if (page == NULL) {
@@ -1225,13 +1257,15 @@ void show_free_areas(void)
} else
printk("\n");
- for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ for_each_cpu(cpu) {
+ struct zone_pagesets *zp;
struct per_cpu_pageset *pageset;
if (!cpu_possible(cpu))
continue;
- pageset = zone->pageset + cpu;
+ zp = cpu_zone_pagesets(cpu);
+ pageset = zone_pageset(zp, zone);
for (temperature = 0; temperature < 2; temperature++)
printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1511,6 +1545,62 @@ void __init build_all_zonelists(void)
cpuset_init_current_mems_allowed();
}
+void __init build_percpu_pagelists(void)
+{
+ pg_data_t *pgdat;
+
+ for_each_pgdat(pgdat) {
+ int j;
+ int nid = pgdat->node_id;
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = pgdat->node_zones + j;
+ int cpu;
+ unsigned long batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of
+ * the size of the zone. But no more than 1/4 of a meg
+ * - there's no point in going beyond the size of L2
+ * cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ struct zone_pagesets *zp;
+ struct per_cpu_pageset *pageset;
+ struct per_cpu_pages *pcp;
+
+ zp = cpu_zone_pagesets(cpu);
+ pageset = &zp->p[NODEZONE(nid, j)];
+
+ pcp = &pageset->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &pageset->pcp[1]; /* cold */
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = 1 * batch;
+ INIT_LIST_HEAD(&pcp->list);
+ }
+ printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
+ zone_names[j], zone->present_pages, batch);
+ }
+ }
+}
+
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
@@ -1626,7 +1716,7 @@ static void __init free_area_init_core(s
{
unsigned long i, j;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
- int cpu, nid = pgdat->node_id;
+ int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
pgdat->nr_zones = 0;
@@ -1636,7 +1726,6 @@ static void __init free_area_init_core(s
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize;
- unsigned long batch;
zone_table[NODEZONE(nid, j)] = zone;
realsize = size = zones_size[j];
@@ -1657,39 +1746,6 @@ static void __init free_area_init_core(s
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- /*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
- *
- * OK, so we don't know how big the cache is. So guess.
- */
- batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- struct per_cpu_pages *pcp;
-
- pcp = &zone->pageset[cpu].pcp[0]; /* hot */
- pcp->count = 0;
- pcp->low = 2 * batch;
- pcp->high = 6 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &zone->pageset[cpu].pcp[1]; /* cold */
- pcp->count = 0;
- pcp->low = 0;
- pcp->high = 2 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
- }
- printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
- zone_names[j], realsize, batch);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
zone->nr_scan_active = 0;
@@ -1720,7 +1776,6 @@ static void __init free_area_init_core(s
if ((zone_start_pfn) & (zone_required_alignment-1))
printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
-
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c 2005-04-09 22:35:25.000000000 +1000
+++ linux-2.6/init/main.c 2005-04-09 22:35:44.000000000 +1000
@@ -454,6 +454,7 @@ asmlinkage void __init start_kernel(void
*/
preempt_disable();
build_all_zonelists();
+ build_percpu_pagelists();
page_alloc_init();
printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line);
parse_early_param();
Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c 2005-04-09 22:35:25.000000000 +1000
+++ linux-2.6/mm/mempolicy.c 2005-04-09 22:35:44.000000000 +1000
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleav
zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
- zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+ zl->zones[0]->stats[get_cpu()].interleave_hit++;
put_cpu();
}
return page;