* [PATCH] Page allocator: Get rid of the list of cold pages
@ 2007-11-14 19:52 Christoph Lameter
2007-11-14 21:13 ` Christoph Lameter
2007-11-16 0:27 ` Andrew Morton
0 siblings, 2 replies; 16+ messages in thread
From: Christoph Lameter @ 2007-11-14 19:52 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, apw, Mel Gorman
The discussion of the RFC for this and Mel's measurements indicate that
there may not be too much of a point left to having separate lists for
hot and cold pages (see http://marc.info/?t=119492914200001&r=1&w=2). I
think it is worth taking into mm for further testing. This version is
against 2.6.24-rc2-mm1.
Page allocator: Get rid of the list of cold pages
We have repeatedly discussed whether the cold pages still have a point. There is
one way to join the two lists: Use a single list and put the cold pages at the
end and the hot pages at the beginning. That way a single list can serve for
both types of allocations.
[This version against 2.6.24-rc2-mm1]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
include/linux/mmzone.h | 2 -
mm/page_alloc.c | 54 +++++++++++++++++++++++--------------------------
mm/vmstat.c | 30 ++++++++++-----------------
3 files changed, 39 insertions(+), 47 deletions(-)
Index: linux-2.6.24-rc2-mm1/include/linux/mmzone.h
===================================================================
--- linux-2.6.24-rc2-mm1.orig/include/linux/mmzone.h 2007-11-06 13:57:46.000000000 -0800
+++ linux-2.6.24-rc2-mm1/include/linux/mmzone.h 2007-11-14 11:23:37.597012369 -0800
@@ -113,7 +113,7 @@ struct per_cpu_pages {
};
struct per_cpu_pageset {
- struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
+ struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
s8 expire;
#endif
Index: linux-2.6.24-rc2-mm1/mm/vmstat.c
===================================================================
--- linux-2.6.24-rc2-mm1.orig/mm/vmstat.c 2007-11-14 11:10:22.264011944 -0800
+++ linux-2.6.24-rc2-mm1/mm/vmstat.c 2007-11-14 11:31:03.702407648 -0800
@@ -330,7 +330,7 @@ void refresh_cpu_vm_stats(int cpu)
* Check if there are pages remaining in this pageset
* if not then there is nothing to expire.
*/
- if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+ if (!p->expire || !p->pcp.count)
continue;
/*
@@ -345,11 +345,8 @@ void refresh_cpu_vm_stats(int cpu)
if (p->expire)
continue;
- if (p->pcp[0].count)
- drain_zone_pages(zone, p->pcp + 0);
-
- if (p->pcp[1].count)
- drain_zone_pages(zone, p->pcp + 1);
+ if (p->pcp.count)
+ drain_zone_pages(zone, &p->pcp);
#endif
}
}
@@ -774,20 +771,17 @@ static void zoneinfo_show_print(struct s
"\n pagesets");
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
- int j;
pageset = zone_pcp(zone, i);
- for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
- seq_printf(m,
- "\n cpu: %i pcp: %i"
- "\n count: %i"
- "\n high: %i"
- "\n batch: %i",
- i, j,
- pageset->pcp[j].count,
- pageset->pcp[j].high,
- pageset->pcp[j].batch);
- }
+ seq_printf(m,
+ "\n cpu: %i"
+ "\n count: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i,
+ pageset->pcp.count,
+ pageset->pcp.high,
+ pageset->pcp.batch);
#ifdef CONFIG_SMP
seq_printf(m, "\n vm stats threshold: %d",
pageset->stat_threshold);
Index: linux-2.6.24-rc2-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.24-rc2-mm1.orig/mm/page_alloc.c 2007-11-14 11:10:22.220011821 -0800
+++ linux-2.6.24-rc2-mm1/mm/page_alloc.c 2007-11-14 11:28:27.857511982 -0800
@@ -914,20 +914,18 @@ static void drain_pages(unsigned int cpu
for_each_zone(zone) {
struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
if (!populated_zone(zone))
continue;
pset = zone_pcp(zone, cpu);
- for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
- struct per_cpu_pages *pcp;
- pcp = &pset->pcp[i];
- local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
- pcp->count = 0;
- local_irq_restore(flags);
- }
+ pcp = &pset->pcp;
+ local_irq_save(flags);
+ free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ pcp->count = 0;
+ local_irq_restore(flags);
}
}
@@ -1003,10 +1001,13 @@ static void fastcall free_hot_cold_page(
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
- pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp;
local_irq_save(flags);
__count_vm_event(PGFREE);
- list_add(&page->lru, &pcp->list);
+ if (cold)
+ list_add_tail(&page->lru, &pcp->list);
+ else
+ list_add(&page->lru, &pcp->list);
set_page_private(page, get_pageblock_migratetype(page));
pcp->count++;
if (pcp->count >= pcp->high) {
@@ -1064,7 +1065,7 @@ again:
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
- pcp = &zone_pcp(zone, cpu)->pcp[cold];
+ pcp = &zone_pcp(zone, cpu)->pcp;
local_irq_save(flags);
if (!pcp->count) {
pcp->count = rmqueue_bulk(zone, 0,
@@ -1074,9 +1075,15 @@ again:
}
/* Find a page of the appropriate migrate type */
- list_for_each_entry(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
+ if (cold) {
+ list_for_each_entry_reverse(page, &pcp->list, lru)
+ if (page_private(page) == migratetype)
+ break;
+ } else {
+ list_for_each_entry(page, &pcp->list, lru)
+ if (page_private(page) == migratetype)
+ break;
+ }
/* Allocate more to the pcp list if necessary */
if (unlikely(&page->lru == &pcp->list)) {
@@ -1863,12 +1870,9 @@ void show_free_areas(void)
pageset = zone_pcp(zone, cpu);
- printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
- "Cold: hi:%5d, btch:%4d usd:%4d\n",
- cpu, pageset->pcp[0].high,
- pageset->pcp[0].batch, pageset->pcp[0].count,
- pageset->pcp[1].high, pageset->pcp[1].batch,
- pageset->pcp[1].count);
+ printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
+ cpu, pageset->pcp.high,
+ pageset->pcp.batch, pageset->pcp.count);
}
}
@@ -2670,17 +2674,11 @@ inline void setup_pageset(struct per_cpu
memset(p, 0, sizeof(*p));
- pcp = &p->pcp[0]; /* hot */
+ pcp = &p->pcp;
pcp->count = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&pcp->list);
-
- pcp = &p->pcp[1]; /* cold*/
- pcp->count = 0;
- pcp->high = 2 * batch;
- pcp->batch = max(1UL, batch/2);
- INIT_LIST_HEAD(&pcp->list);
}
/*
@@ -2693,7 +2691,7 @@ static void setup_pagelist_highmark(stru
{
struct per_cpu_pages *pcp;
- pcp = &p->pcp[0]; /* hot list */
+ pcp = &p->pcp;
pcp->high = high;
pcp->batch = max(1UL, high/4);
if ((high/4) > (PAGE_SHIFT * 8))
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-14 19:52 [PATCH] Page allocator: Get rid of the list of cold pages Christoph Lameter
@ 2007-11-14 21:13 ` Christoph Lameter
2007-11-16 0:27 ` Andrew Morton
1 sibling, 0 replies; 16+ messages in thread
From: Christoph Lameter @ 2007-11-14 21:13 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, apw, Mel Gorman
Remove unused leftover variable.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
mm/page_alloc.c | 1 -
1 file changed, 1 deletion(-)
Index: linux-2.6.24-rc2-mm1/mm/page_alloc.c
===================================================================
--- linux-2.6.24-rc2-mm1.orig/mm/page_alloc.c 2007-11-14 13:12:02.256478626 -0800
+++ linux-2.6.24-rc2-mm1/mm/page_alloc.c 2007-11-14 13:12:16.557288771 -0800
@@ -910,7 +910,6 @@ static void drain_pages(unsigned int cpu
{
unsigned long flags;
struct zone *zone;
- int i;
for_each_zone(zone) {
struct per_cpu_pageset *pset;
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-14 19:52 [PATCH] Page allocator: Get rid of the list of cold pages Christoph Lameter
2007-11-14 21:13 ` Christoph Lameter
@ 2007-11-16 0:27 ` Andrew Morton
2007-11-21 22:20 ` Mel Gorman
1 sibling, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2007-11-16 0:27 UTC (permalink / raw)
To: Christoph Lameter; +Cc: linux-mm, apw, mel, Martin Bligh
On Wed, 14 Nov 2007 11:52:47 -0800 (PST)
Christoph Lameter <clameter@sgi.com> wrote:
> The discussion of the RFC for this and Mel's measurements indicate that
> there may not be too much of a point left to having separate lists for
> hot and cold pages (see http://marc.info/?t=119492914200001&r=1&w=2). I
> think it is worth taking into mm for further testing. This version is
> against 2.6.24-rc2-mm1.
>
>
> Page allocator: Get rid of the list of cold pages
>
> We have repeatedly discussed whether the cold pages still have a point. There is
> one way to join the two lists: Use a single list and put the cold pages at the
> end and the hot pages at the beginning. That way a single list can serve for
> both types of allocations.
Well. The whole per-cpu-pages thing was a very marginal benefit - I
wibbled for months before merging it. So the effects of simplifying the
lists will be hard to measure.
The test which per-cpu-pages helped most was one which sits in a loop
extending and truncating a file by 32k - per-cpu-pages sped that up by a
lot (3x, iirc) because with per-cpu-pages it's always getting the same
pages on each CPU and they're cache-hot.
<goes archeological for a bit>
OK, it's create-delete.c from ext3-tools, duplicated below. It would be
nice if someone(tm) could check that this patch doesn't hurt this test.
I'd suggest running one instance per cpu with various values of "size".
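Something like this (an untested sketch; the CPU list, iteration count and
size are just examples) would run one pinned instance per CPU with taskset:

#!/bin/bash
# One create-delete instance bound to each CPU, 32k files.
for cpu in 0 1 2 3; do
	taskset -c $cpu ./create-delete -n 100000 -s 32768 /tmp/cd-test.$cpu &
done
wait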
/*
*/
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <sys/mman.h>
#include <sys/signal.h>
#include <sys/stat.h>
int verbose;
char *progname;
void usage(void)
{
fprintf(stderr, "Usage: %s [-v] [-nN] [-s size] filename\n", progname);
fprintf(stderr, " -v: Verbose\n");
fprintf(stderr, " -nN: Run N iterations\n");
fprintf(stderr, " -s size: Size of file\n");
exit(1);
}
int main(int argc, char *argv[])
{
int c;
int fd;
int niters = -1;
int size = 16 * 4096;
char *filename;
char *buf;
progname = argv[0];
while ((c = getopt(argc, argv, "vn:s:")) != -1) {
switch (c) {
case 'n':
niters = strtol(optarg, NULL, 10);
break;
case 's':
size = strtol(optarg, NULL, 10);
break;
case 'v':
verbose++;
break;
}
}
if (optind == argc)
usage();
filename = argv[optind++];
if (optind != argc)
usage();
buf = malloc(size);
if (buf == 0) {
perror("nomem");
exit(1);
}
fd = creat(filename, 0666);
if (fd < 0) {
perror("creat");
exit(1);
}
while (niters--) {
if (lseek(fd, 0, SEEK_SET)) {
perror("lseek");
exit(1);
}
if (write(fd, buf, size) != size) {
perror("write");
exit(1);
}
if (ftruncate(fd, 0)) {
perror("ftruncate");
exit(1);
}
}
exit(0);
}
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-16 0:27 ` Andrew Morton
@ 2007-11-21 22:20 ` Mel Gorman
2007-11-21 22:28 ` Christoph Lameter
` (2 more replies)
0 siblings, 3 replies; 16+ messages in thread
From: Mel Gorman @ 2007-11-21 22:20 UTC (permalink / raw)
To: Andrew Morton; +Cc: Christoph Lameter, linux-mm, apw, Martin Bligh
On Fri, 16 Nov 2007, Andrew Morton wrote:
> Well. The whole per-cpu-pages thing was a very marginal benefit - I
> wibbled for months before merging it. So the effects of simplifying the
> lists will be hard to measure.
>
You were not joking.
> The test which per-cpu-pages helped most was one which sits in a loop
> extending and truncating a file by 32k - per-cpu-pages sped that up by a
> lot (3x, iirc) because with per-cpu-pages it's always getting the same
> pages on each CPU and they're cache-hot.
> <goes archeological for a bit>
>
> OK, it's create-delete.c from ext3-tools, duplicated below. It would be
> nice if someone(tm) could check that this patch doesn't hurt this test.
>
It took me a while but I finally got around to taking a closer look at
this after I got the zonelist stuff out again.
> I'd suggest running one instance per cpu with various values of "size".
Good idea, so I followed your suggestion. The end results could do with as
many sets of eyes as possible double-checking what I did. Andy Whitcroft took a read
for me in advance of this posting and did not spot any obvious insanity.
While I am reasonably confident the results are accurate and that I measured the
right thing, the more the merrier. The very short summary of what I found was:
1. In general, the split lists are faster than the combined list
2. Disabling Per-CPU has comparable performance to having the lists
Point 2 was certainly not expected!
This is a more detailed account of what I did so people can tear holes in the
methodology if they wish or rerun the tests. The aim was to show if per-CPU
allocator really helped scalability or not in the file extend/truncate
case. We know there would be contention on other semaphores so it was not
going to be a linear improvement. Ideally though, if one CPU took 800ms to
work on data, four CPUs would take 200ms.
My initial objectives for the test were
1. Run multiple instances
2. Instances would be bound to CPUs to avoid scheduler jitter
3. The same amount of file data would be generated regardless of CPU count
The ideal results would show
1. Linear improvements for number of CPUs
2. No difference between single PCPU lists and hot/cold lists
3. Clear improvement over no PCPU list
The test machine was elm3b6 from test.kernel.org. This is a 4-CPU x86_64 NUMA
machine with 8GiB of memory. The base kernel version was 2.6.24-rc2-mm1 with
some hotfixes applied. FWIW, I spent the time to write test harnesses around
this necessary to quickly automate new tests for future patches or machine
types. This should make it easier for me to re-run tests quickly although
the script below is all people really need to replicate the results. The
test on this particular machine did the following.
1. Create a range of filesizes from 16K to 8MB
2. Run multiple times with instances ranging from 1 to 12 CPUs
3. Fork off the number of requested instances
4. Each child runs one worker function to completion and exits
5. Parent waits for all the children to complete and exits
6. One file is created per instance
7. The sum total of the files created is the same regardless of instances
- For example, if using a 16K file, using 1 instance will be 1 16K file.
Running 4 instances would create 4 4K files and operate on them
independently
Results
=======
The raw data, gnuplot scripts etc are available from
http://www.csn.ul.ie/~mel/postings/percpu-20071121/alldata.tar.gz
The titles of patches are
hotcold-pcplist: This is the vanilla PCPU allocator
single-pcplist-batch8: This is Christoph's patch with pcp->high == 8*batch
as suggested by Martin Bligh (I agreed with him that keeping lists
the same size made sense; a sketch of the assumed setup follows this list)
no-pcplist: buffered_rmqueue() always calls the core allocator
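For reference, a minimal sketch of what the batch8 variant's setup_pageset()
is assumed to look like - the combined list simply keeps the total capacity
of the old hot (6 * batch) plus cold (2 * batch) lists:

	pcp = &p->pcp;
	pcp->count = 0;
	pcp->high = 8 * batch;		/* was 6 * batch hot + 2 * batch cold */
	pcp->batch = max(1UL, 1 * batch);
	INIT_LIST_HEAD(&pcp->list);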
1-4CPU Scaling: http://www.csn.ul.ie/~mel/postings/percpu-20071121/graph-elm3b6-1_4Scaling-fullrange.ps
First let's look at whether the standard allocator scales at all in this graph.
You should see that by the end of the test, the scalability is not
bad although not exactly linear. One CPU looks to be taking about
2.4 seconds there and 4 CPUs do the job in 0.9 - probably losing
out on the additional file creations. At the smaller sizes you can see in
http://www.csn.ul.ie/~mel/postings/percpu-20071121/graph-elm3b6-1_4Scaling-upto0.5MB.ps,
it is not scaling as well but it is happening.
1-Instance Graph: http://www.csn.ul.ie/~mel/postings/percpu-20071121/graph-elm3b6-1instance-fullrange.ps
This shows a run on just one CPU over the full range of sizes. It looks at a
glance to me like the single pcplist is slowest, with the hotcold lists being
faster.
less work to do and no scalability concerns.
Up to the 0.5MB mark, which is about the size of the PCPU lists
in general, you can see that the three kernels are comparable; this is obvious in
http://www.csn.ul.ie/~mel/postings/percpu-20071121/graph-elm3b6-1instance-upto0.5MB.ps.
In the last 2MB shown
http://www.csn.ul.ie/~mel/postings/percpu-20071121/graph-elm3b6-1instance-last2MB.ps,
no-PCPU is consistently faster, then hotcold, with the combined list being
the slowest, though only marginally most of the time.
4-Instances Graph: http://www.csn.ul.ie/~mel/postings/percpu-20071121/graph-elm3b6-4instance-fullrange.ps
At this point, there should be one instance running on each CPU. The results
are a lot more variable at a glance, but it is still pretty clear what the
trends are. The combined-list is noticeably slower. The real shock here is
that there is no real difference between the combined lists and using no
PCPU list at all. For this reason alone, the benchmark script needs to be
looked at by another person. I am still running the tests on another machine
but the results there so far match.
12-Instances Graph: http://www.csn.ul.ie/~mel/postings/percpu-20071121/graph-elm3b6-12instance-fullrange.ps
Now there are 3 instances running per CPU on the system. The combined list
is again slowest but the other two are interesting. This time, the hot/cold
lists have a noticeable performance improvement over the no-PCPU kernel in
general. However, for the first 0.5MB, the combined list is winning with
the hot/cold lists faring worst. Towards the larger filesizes, the opposite
applies.
Conclusions
===========
Overall, the single list is slower than the split lists although seeing it in a
larger benchmark may be difficult. The biggest surprise by far is that disabling
the PCPU list altogether seemed to have comparable performance. Intuitively,
this makes no sense and means the benchmark code should be read over by a
second person to check for mistakes.
I cannot see the evidence of this 3x improvement around the 32K filesize
mark. It may be because my test is very different to what happened before,
I got something wrong or the per-CPU allocator is not as good as it used to
be and does not give out the same hot-pages all the time. I tried running
tests on 2.6.23 but the results of PCPU vs no-PCPU were comparable to
2.6.24-rc2-mm1 so it is not something that has changed very recently.
As it is, the single PCP list may need another revision or at least
more investigation to see why it slows so much in comparison to the split
lists. The more controversial question is why disabling PCP appeared to make
no difference in this test.
Any comments on the test or what could be done differently?
Notes
=====
o The fact the machine was NUMA might have skewed the results. I bound the CPU,
but did not set nodemasks. Node-local policies should have been used. I have
kicked off tests on bl6-13 which has 4 cores but non-NUMA. It'll be a long
time before they complete though
o The timings are of the whole process including child creation. This means that
we are measuring more than just the file creation. More fine-grained timings
could be collected if it was felt to be relevant
o The no-pcplist patch was crude. The PCPU structures were not actually removed.
Just the function itself was butchered. The patch is at
http://www.csn.ul.ie/~mel/postings/percpu-20071121/disable_pcp.patch
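Conceptually, the butchering is assumed to make the order-0 path in
buffered_rmqueue() take the same locked buddy path as higher orders,
something like:

	/* hypothetical no-pcplist hack: order 0 goes straight to the buddy lists */
	spin_lock_irqsave(&zone->lock, flags);
	page = __rmqueue(zone, order, migratetype);
	spin_unlock(&zone->lock);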
Benchmark script
================
#!/bin/bash
#
# This benchmark is based on create/delete test from ext3-tools. The objective
# of this is to check the benefit of the per-cpu allocator. At the time of
# writing, the hot/cold lists are being collapsed into one. This is required
# to see if there is any performance loss from doing that.
#
# Paths for results directory and the like
export SCRIPT=`basename $0 | sed -e 's/\./\\\./'`
export SCRIPTDIR=`echo $0 | sed -e "s/$SCRIPT//"`
CPUCOUNT=`grep -c processor /proc/cpuinfo`
FILENAME=
RESULT_DIR=$HOME/vmregressbench-`uname -r`/createdelete
EXTRA=
# The filesizes are set so that the number of allocations
# coming from each CPU steadily rises. The size of the
# actual stride is based on the number of running instances
LOW_FILESIZE=$((4096*$CPUCOUNT))
HIGH_FILESIZE=$((524288*4*$CPUCOUNT))
STRIDE_FILESIZE_PERCPU=4096
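# e.g. with CPUCOUNT=4: LOW_FILESIZE = 4096*4 = 16KiB and
# HIGH_FILESIZE = 524288*4*4 = 8MiB, the 16K-8MB range described above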
# Print usage of command
usage() {
echo "bench-createdelete.sh"
echo This script measures how well the allocator scales for small file
echo creations and deletions
echo
echo "Usage: bench-createdelete.sh [options]"
echo " -f, --filename Filename prefix to use for test files"
echo " -r, --result Result directory (default: $RESULT_DIR)"
echo " -e, --extra String to append to result dir"
echo " -h, --help Print this help message"
echo
exit 1
}
# Parse command line arguments
ARGS=`getopt -o hf:r:e:v: --long help,filename:,result:,extra:,vmr: -n bench-createdelete.sh -- "$@"`
eval set -- "$ARGS"
while true ; do
case "$1" in
-f|--filename) export FILENAME="$2"; shift 2;;
-r|--result) export RESULT_DIR="$2"; shift 2;;
-e|--extra) export EXTRA="$2"; shift 2;;
-h|--help) usage;;
*) shift 1; break;;
esac
done
# Build the test program that does all the work
SELF=$0
TESTPROGRAM=`mktemp`
LINECOUNT=`wc -l $SELF | awk '{print $1}'`
CSTART=`grep -n "BEGIN C FILE" $SELF | tail -1 | awk -F : '{print $1}'`
tail -$(($LINECOUNT-$CSTART)) $SELF > $TESTPROGRAM.c
gcc $TESTPROGRAM.c -o $TESTPROGRAM || exit 1
# Setup results directory
if [ "$EXTRA" != "" ]; then
export EXTRA=-$EXTRA
fi
export RESULT_DIR=$RESULT_DIR$EXTRA
if [ -d "$RESULT_DIR" ]; then
echo ERROR: Results dir $RESULT_DIR already exists
exit 1
fi
mkdir -p $RESULT_DIR || exit
echo bench-createdelete
echo o Result directory $RESULT_DIR
# Setup the filename prefix to be used by the test program
if [ "$FILENAME" = "" ]; then
FILENAME=`mktemp`
fi
# Run the actual test
MAXINSTANCES=$(($CPUCOUNT*3))
for NUMCPUS in `seq 1 $MAXINSTANCES`; do
STRIDE_FILESIZE=$(($STRIDE_FILESIZE_PERCPU*$NUMCPUS))
echo o Running with $NUMCPUS instances, striding $STRIDE_FILESIZE
for SIZE in `seq -f "%10.0f" $LOW_FILESIZE $STRIDE_FILESIZE $HIGH_FILESIZE`; do
/usr/bin/time -f "$SIZE %e" $TESTPROGRAM \
-n 50 \
-s $SIZE \
-i$NUMCPUS \
$FILENAME 2>> $RESULT_DIR/results.$NUMCPUS || exit 1
tail -1 $RESULT_DIR/results.$NUMCPUS
done
done
# Generate a simple gnuplot script for giggles
echo "set xrange [$LOW_FILESIZE:$HIGH_FILESIZE]" > $RESULT_DIR/gnuplot.script
echo -n "plot " >> $RESULT_DIR/gnuplot.script
for NUMCPUS in `seq 1 $MAXINSTANCES`; do
echo -n "'results.$NUMCPUS' with lines" >> $RESULT_DIR/gnuplot.script
if [ $NUMCPUS -ne $MAXINSTANCES ]; then
echo -n ", " >> $RESULT_DIR/gnuplot.script
fi
done
echo >> $RESULT_DIR/gnuplot.script
exit 0
==== BEGIN C FILE ====
/*
* This is lifted straight from ext3 tools to test the truncation of a file.
* On the suggestion of Andrew Morton, this can be used as a micro-benchmark
* of the Linux per-cpu allocator. Hence, it has been modified to run the
* requested number of instances. If scaling properly, the completion times
* should be the same if the number of instances is less than the number of
* CPUs.
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sched.h>
#include <time.h>
#include <sys/mman.h>
#include <sys/signal.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
int verbose;
char *progname;
void usage(void)
{
fprintf(stderr, "Usage: %s [-v] [-nN] [-s size] filename-prefix\n", progname);
fprintf(stderr, " -v: Verbose\n");
fprintf(stderr, " -nN: Run N iterations\n");
fprintf(stderr, " -iN: Run N instances simultaneously\n");
fprintf(stderr, " -s size: Size of file\n");
exit(1);
}
int numcpus(void)
{
static int count = -1;
cpu_set_t mask;
if (count != -1)
return count;
/* Work it out for the first time */
CPU_ZERO(&mask);
count = 0;
if (sched_getaffinity(getpid(), sizeof(mask), &mask) == -1) {
perror("sched_getaffinity\n");
exit(1);
}
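/* Count CPUs; assumes the online CPUs are numbered contiguously from 0 */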
while (CPU_ISSET(count, &mask))
count++;
return count;
}
/* This is the worker function doing all the work */
int createdelete(char *fileprefix, int size, int niters, int instance)
{
char *buf, *filename;
int length = strlen(fileprefix) + 6;
int fd;
cpu_set_t mask;
/* Bind to one CPU */
CPU_ZERO(&mask);
CPU_SET(instance % numcpus(), &mask);
if (sched_setaffinity(getpid(), sizeof(cpu_set_t), &mask) == -1) {
perror("sched_setaffinity");
exit(1);
}
/* Allocate the necessary buffers */
filename = malloc(length);
if (filename == 0) {
perror("nomem");
exit(1);
}
buf = malloc(size);
if (buf == 0) {
perror("nomem");
exit(1);
}
/* Create the file for this instance */
snprintf(filename, length, "%s-%d", fileprefix, instance);
fd = creat(filename, 0666);
if (fd < 0) {
perror("creat");
exit(1);
}
/* Lets get this show on the road */
while (niters--) {
if (lseek(fd, 0, SEEK_SET)) {
perror("lseek");
exit(1);
}
if (write(fd, buf, size) != size) {
perror("write");
exit(1);
}
if (ftruncate(fd, 0)) {
perror("ftruncate");
exit(1);
}
}
exit(0);
}
int main(int argc, char *argv[])
{
int c;
int i;
int ninstances = 1;
int niters = -1;
int size = 16 * 4096;
char *filename;
progname = argv[0];
while ((c = getopt(argc, argv, "vn:s:i:")) != -1) {
switch (c) {
case 'n':
niters = strtol(optarg, NULL, 10);
break;
case 's':
size = strtol(optarg, NULL, 10);
break;
case 'i':
ninstances = strtol(optarg, NULL, 10);
break;
case 'v':
verbose++;
break;
}
}
if (optind == argc)
usage();
filename = argv[optind++];
if (optind != argc)
usage();
/* fork off the number of required instances doing work */
for (i = 0; i < ninstances; i++) {
pid_t pid = fork();
if (pid == -1) {
perror("fork");
exit(1);
}
if (pid == 0)
createdelete(filename, size / ninstances, niters, i);
}
/* Wait for the children */
for (i = 0; i < ninstances; i++) {
pid_t pid = wait(NULL);
if (pid == -1) {
perror("wait");
exit(1);
}
}
exit(0);
}
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 22:20 ` Mel Gorman
@ 2007-11-21 22:28 ` Christoph Lameter
2007-11-21 22:54 ` Mel Gorman
2007-11-21 22:39 ` Christoph Lameter
2007-11-21 23:23 ` Andrew Morton
2 siblings, 1 reply; 16+ messages in thread
From: Christoph Lameter @ 2007-11-21 22:28 UTC (permalink / raw)
To: Mel Gorman; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On Wed, 21 Nov 2007, Mel Gorman wrote:
> 1. In general, the split lists are faster than the combined list
> 2. Disabling Per-CPU has comparable performance to having the lists
That is only true for the single threaded case (actually I am measuring a
slight performance benefit if I switch them off). If you have multiple
processes allocating from the same zone then you can get the zone locks
hot. That was the reason for the recent regression in SLUB. The networking
layer went from an order 0 alloc to order 1. Zonelock contention then
dropped performance by 50% on an 8p! The potential for lock contention is
higher the more processor per nodeare involved. So you are not going to
see this as high on a standard NUMA config with 2p per node.
The main point at this juncture of the pcp lists seems to be avoiding
zone lock contention! The overhead of extracting a page from the buddy
lists is not such a problem.
> single-pcplist-batch8: This is Christoph's patch with pcp->high == 8*batch
> as suggested by Martin Bligh (I agreed with him that keeping lists
> the same size made sense)
Ack.
I have not had a look at the details of your performance measurements yet.
More later.
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 22:20 ` Mel Gorman
2007-11-21 22:28 ` Christoph Lameter
@ 2007-11-21 22:39 ` Christoph Lameter
2007-11-21 23:00 ` Mel Gorman
2007-11-21 23:23 ` Andrew Morton
2 siblings, 1 reply; 16+ messages in thread
From: Christoph Lameter @ 2007-11-21 22:39 UTC (permalink / raw)
To: Mel Gorman; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On Wed, 21 Nov 2007, Mel Gorman wrote:
> Overall, the single list is slower than the split lists although seeing it in a
> larger benchmark may be difficult. The biggest surprise by far is that disabling
> the PCPU list altogether seemed to have comparable performance. Intuitively,
> this makes no sense and means the benchmark code should be read over by a
> second person to check for mistakes.
>
> I cannot see the evidence of this 3x improvement around the 32K filesize
> mark. It may be because my test is very different to what happened before,
> I got something wrong or the per-CPU allocator is not as good as it used to
> be and does not give out the same hot-pages all the time. I tried running
> tests on 2.6.23 but the results of PCPU vs no-PCPU were comparable to
> 2.6.24-rc2-mm1 so it is not something that has changed very recently.
>
> As it is, the single PCP list may need another revision or at least
> more investigation to see why it slows so much in comparison to the split
> lists. The more controversial question is why disabling PCP appeared to make
> no difference in this test.
The disabling of PCPs is for us (SGI) a performance benefit for certain
loads and we saw this in tests about 2 years ago.
I sure wish to know why the single PCP list is not that efficient. Could
you simply remove the cold handling and put all pages always at the front
and always allocate from the front? Maybe it is the additional list
handling overhead that makes the difference.
> Any comments on the test or what could be done differently?
1) Could you label the axes? It's a bit difficult to see what exactly you
are measuring there.
2) it may be useful to do these tests with anonymous pages because the
file handling paths are rather slow and you may not hit zone lock
contention because there are other things in the way (radix tree?)
> o The fact the machine was NUMA might have skewed the results. I bound the CPU,
> but did not set nodemasks. Node-local policies should have been used. I have
> kicked off tests on bl6-13 which has 4 cores but non-NUMA. It'll be a long
> time before they complete though
The ratio of processors per node is important to see the lock contention
here. Or run an SMP system with lots of processors.
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 22:28 ` Christoph Lameter
@ 2007-11-21 22:54 ` Mel Gorman
0 siblings, 0 replies; 16+ messages in thread
From: Mel Gorman @ 2007-11-21 22:54 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On (21/11/07 14:28), Christoph Lameter didst pronounce:
> On Wed, 21 Nov 2007, Mel Gorman wrote:
>
> > 1. In general, the split lists are faster than the combined list
> > 2. Disabling Per-CPU has comparable performance to having the lists
>
> That is only true for the single threaded case (actually I am measuring a
> slight performance benefit if I switch them off). If you have multiple
> processes allocating from the same zone then you can get the zone locks
> hot.
Um, I thought I went through this, but I didn't just test single-threaded
and you will see that the test C program forks children to do the
work. 1 instance is a single process doing the work. The 4-instance graphs are
4 processes simultaneously doing the work (1 per CPU) and they showed
comparable performance of split lists vs no-PCP lists. They are also bound
to one CPU each in an effort to maximise the use of the PCPU lists. There was
some evidence this was beginning to change when 12 instances (3 per CPU)
were running but I hadn't set up the test to run with more.
> That was the reason for the recent regression in SLUB. The networking
> layer went from an order 0 alloc to order 1. Zonelock contention then
> dropped performance by 50% on an 8p! The potential for lock contention is
> higher the more processors per node are involved. So you are not going to
> see this as high on a standard NUMA config with 2p per node.
>
Ok. I've queued the test to re-run on a 16-way non-NUMA x86_64 machine
and an 8-way 2-node NUMA PPC64 machine. I haven't worked on these
machines before but hopefully they'll run to completion.
> The main point at this juncture of the pcp lists seems to be avoiding
> zone lock contention!
I get that. I was surprised by the results too and it leads me to wonder if
the lock is being avoided elsewhere (quicklists or slab per-cpu lists maybe)
or if there is a filesystem lock so big that it doesn't matter what the PCPU
allocator is doing. I don't have other profile data available.
> The overhead of extracting a page from the buddy
> lists is not such a problem.
>
Ok, the higher-CPU machines may show the zone-lock contention. It could also
be the case that file extend/truncate is not the right thing to be doing either
for these measurements. Read the code and see what you think.
> > single-pcplist-batch8: This is Christoph's patch with pcp->high == 8*batch
> > as suggested by Martin Bligh (I agreed with him that keeping lists
> > the same size made sense)
>
> Ack.
>
> I have not had a look at the details of your performance measurements yet.
> More later.
>
Thanks.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 22:39 ` Christoph Lameter
@ 2007-11-21 23:00 ` Mel Gorman
2007-11-21 23:29 ` Christoph Lameter
2007-11-21 23:34 ` Christoph Lameter
0 siblings, 2 replies; 16+ messages in thread
From: Mel Gorman @ 2007-11-21 23:00 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On (21/11/07 14:39), Christoph Lameter didst pronounce:
> On Wed, 21 Nov 2007, Mel Gorman wrote:
>
> > Overall, the single list is slower than the split lists although seeing it in a
> larger benchmark may be difficult. The biggest surprise by far is that disabling
> > the PCPU list altogether seemed to have comparable performance. Intuitively,
> > this makes no sense and means the benchmark code should be read over by a
> > second person to check for mistakes.
> >
> > I cannot see the evidence of this 3x improvement around the 32K filesize
> > mark. It may be because my test is very different to what happened before,
> > I got something wrong or the per-CPU allocator is not as good as it used to
> > be and does not give out the same hot-pages all the time. I tried running
> > tests on 2.6.23 but the results of PCPU vs no-PCPU were comparable to
> > 2.6.24-rc2-mm1 so it is not something that has changed very recently.
> >
> > As it is, the single PCP list may need another revision or at least
> > more investigation to see why it slows so much in comparison to the split
> > lists. The more controversial question is why disabling PCP appeared to make
> > no difference in this test.
>
> The disabling of PCPs is for us (SGI) a performance benefit for certain
> loads and we have seen this in tests about 2 years ago.
>
Right, that would be consistent with what I've seen so far.
> I sure wish to know why the single PCP list is not that efficient. Could
> you simply remove the cold handling and put all pages always at the front
> and always allocate from the front?
I thought this would be a good idea too but in testing mode, I didn't
want to fiddle with patches much in case I unconsciously screwed it up.
> Maybe it is the additional list
> handling overhead that makes the difference.
>
> > Any comments on the test or what could be done differently?
>
> 1) Could you label the axes? It's a bit difficult to see what exactly you
> are measuring there.
>
I can, but I've included all the data there too, and the gnuplot scripts, so
you can do more detailed analysis of the results. This will double up
as a check of my methodology to make sure I have not made some other mistake.
The Y axis in all these graphs is time for the files (sizes on X axis)
to be created/deleted 50 times by all the children.
> 2) it may be useful to do these tests with anonymous pages because the
> file handling paths are rather slow and you may not hit zone lock
> contention because there are other things in the way (radix tree?)
>
I suspected this too, but thought that if I went with anonymous pages we would
just get hit with mmap_sem instead and the results would not be significantly
different. I had also considered creating the files on tmpfs. In the end
I decided the original investigation was filesystem-based and was as good a
starting point as any.
> > o The fact the machine was NUMA might have skewed the results. I bound the CPU,
> > but did not set nodemasks. Node-local policies should have been used. I have
> > kicked off tests on bl6-13 which has 4 cores but non-NUMA. It'll be a long
> > time before they complete though
>
> The ratio of processors per node is important to see the lock contention
> here. Or run an SMP system with lots of processors.
>
Already queued up. Machines are busy and the tests take hours to run so
I won't be coming back with quick answers.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 22:20 ` Mel Gorman
2007-11-21 22:28 ` Christoph Lameter
2007-11-21 22:39 ` Christoph Lameter
@ 2007-11-21 23:23 ` Andrew Morton
2007-11-21 23:51 ` Mel Gorman
2 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2007-11-21 23:23 UTC (permalink / raw)
To: Mel Gorman; +Cc: clameter, linux-mm, apw, mbligh
On Wed, 21 Nov 2007 22:20:59 +0000
Mel Gorman <mel@csn.ul.ie> wrote:
> I cannot see the evidence of this 3x improvement around the 32K filesize
> mark. It may be because my test is very different to what happened before,
> I got something wrong or the per-CPU allocator is not as good as it used to
> be and does not give out the same hot-pages all the time.
Could be that when you return a handful of pages to the page allocator
and then allocate a handful of pages, you get the same pages back. But
the page allocator wasn't doing that 4-5 years ago when that code
went in.
Of course, even if the page allocator is indeed doing this for us, you'd
still expect to see benefits from the per-cpu magazines when each CPU is
allocating and freeing a number of pages which is close to the size of
that CPU's L1 cache. Because when the pages are going into and coming from
a shared-by-all-cpus pool, each CPU will often get pages which are hot in
a different cpu's L1.
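(For scale: with 4KiB pages and, say, a 64KiB L1 D-cache, that per-CPU
working set is only 64/4 = 16 pages.)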
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 23:00 ` Mel Gorman
@ 2007-11-21 23:29 ` Christoph Lameter
2007-11-21 23:34 ` Christoph Lameter
1 sibling, 0 replies; 16+ messages in thread
From: Christoph Lameter @ 2007-11-21 23:29 UTC (permalink / raw)
To: Mel Gorman; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On Wed, 21 Nov 2007, Mel Gorman wrote:
> > 2) it may be useful to do these tests with anonymous pages because the
> > file handling paths are rather slow and you may not hit zone lock
> > contention because there are other things in the way (radix tree?)
>
> I suspected this too, but thought if I went with anonymous pages we would
> just get hit with mmap_sem instead and the results would not be significantly
> different. I had also considered creating the files on tmpfs. In the end
> I decided the original investigation was a filesystem and was as good a
> starting point as any.
Well, you would get a hot cacheline with the semaphore. It's taken as a read
lock so it's not a holdoff, in contrast to the zone lock where we actually
spin until it's available. In my experience it takes longer for the mmap
sem cacheline to become a problem.
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 23:00 ` Mel Gorman
2007-11-21 23:29 ` Christoph Lameter
@ 2007-11-21 23:34 ` Christoph Lameter
2007-11-21 23:58 ` Mel Gorman
1 sibling, 1 reply; 16+ messages in thread
From: Christoph Lameter @ 2007-11-21 23:34 UTC (permalink / raw)
To: Mel Gorman; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On Wed, 21 Nov 2007, Mel Gorman wrote:
> I thought this would be a good idea too but in testing mode, I didn't
> want to fiddle with patches much in case I unconsciously screwed it up.
Okay here is a patch against the combining patch that just forgets about
coldness:
---
mm/page_alloc.c | 18 ++++--------------
1 file changed, 4 insertions(+), 14 deletions(-)
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2007-11-21 15:33:14.993673533 -0800
+++ linux-2.6/mm/page_alloc.c 2007-11-21 15:33:20.697205473 -0800
@@ -991,10 +991,7 @@ static void fastcall free_hot_cold_page(
pcp = &zone_pcp(zone, get_cpu())->pcp;
local_irq_save(flags);
__count_vm_event(PGFREE);
- if (cold)
- list_add_tail(&page->lru, &pcp->list);
- else
- list_add(&page->lru, &pcp->list);
+ list_add(&page->lru, &pcp->list);
set_page_private(page, get_pageblock_migratetype(page));
pcp->count++;
if (pcp->count >= pcp->high) {
@@ -1043,7 +1040,6 @@ static struct page *buffered_rmqueue(str
{
unsigned long flags;
struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);
int cpu;
int migratetype = allocflags_to_migratetype(gfp_flags);
@@ -1062,15 +1058,9 @@ again:
}
/* Find a page of the appropriate migrate type */
- if (cold) {
- list_for_each_entry_reverse(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
- } else {
- list_for_each_entry(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
- }
+ list_for_each_entry(page, &pcp->list, lru)
+ if (page_private(page) == migratetype)
+ break;
/* Allocate more to the pcp list if necessary */
if (unlikely(&page->lru == &pcp->list)) {
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 23:23 ` Andrew Morton
@ 2007-11-21 23:51 ` Mel Gorman
0 siblings, 0 replies; 16+ messages in thread
From: Mel Gorman @ 2007-11-21 23:51 UTC (permalink / raw)
To: Andrew Morton; +Cc: clameter, linux-mm, apw, mbligh
On (21/11/07 15:23), Andrew Morton didst pronounce:
> On Wed, 21 Nov 2007 22:20:59 +0000
> Mel Gorman <mel@csn.ul.ie> wrote:
>
> > I cannot see the evidence of this 3x improvement around the 32K filesize
> > mark. It may be because my test is very different to what happened before,
> > I got something wrong or the per-CPU allocator is not as good as it used to
> > be and does not give out the same hot-pages all the time.
>
> Could be that when you return a handful of pages to the page allocator
> and then allocate a handful of pages, you get the same pages back. But
> the page allocator wasn't doing that 4-5 years ago when that code
> went in.
>
Maybe.
> Of course, even if the page allocator is indeed doing this for us, you'd
> still expect to see benefits from the per-cpu magazines when each CPU is
> allocating and freeing a number of pages which is close to the size of
> that CPU's L1 cache. Because when the pages are going into and coming from
> a shared-by-all-cpus pool, each CPU will often get pages which are hot in
> a different cpu's L1.
>
I checked and I am not seeing any clear benefit around the size of the L1
cache (64K D-cache). It could be because the granularity of the time is
too low and the cost of zeroing the page is drowning everything else
out in this test.
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 23:34 ` Christoph Lameter
@ 2007-11-21 23:58 ` Mel Gorman
2007-11-22 0:06 ` Christoph Lameter
0 siblings, 1 reply; 16+ messages in thread
From: Mel Gorman @ 2007-11-21 23:58 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On (21/11/07 15:34), Christoph Lameter didst pronounce:
> On Wed, 21 Nov 2007, Mel Gorman wrote:
>
> > I thought this would be a good idea too but in testing mode, I didn't
> > want to fiddle with patches much in case I unconsciously screwed it up.
>
> Okay here is a patch against the combining patch that just forgets about
> coldness:
>
I didn't think you were going to roll a patch and had queued this
slightly more aggressive version. I think it is a superset of what your
patch does.
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.24-rc2-mm1-single_pcplist/include/linux/gfp.h linux-2.6.24-rc2-mm1-single_pcplist_noheat/include/linux/gfp.h
--- linux-2.6.24-rc2-mm1-single_pcplist/include/linux/gfp.h 2007-11-15 11:27:49.000000000 +0000
+++ linux-2.6.24-rc2-mm1-single_pcplist_noheat/include/linux/gfp.h 2007-11-21 23:38:29.000000000 +0000
@@ -221,7 +221,7 @@ extern unsigned long FASTCALL(get_zeroed
extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
extern void FASTCALL(free_hot_page(struct page *page));
-extern void FASTCALL(free_cold_page(struct page *page));
+#define free_cold_page(page) free_hot_page(page)
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.24-rc2-mm1-single_pcplist/mm/page_alloc.c linux-2.6.24-rc2-mm1-single_pcplist_noheat/mm/page_alloc.c
--- linux-2.6.24-rc2-mm1-single_pcplist/mm/page_alloc.c 2007-11-21 23:55:45.000000000 +0000
+++ linux-2.6.24-rc2-mm1-single_pcplist_noheat/mm/page_alloc.c 2007-11-21 23:07:00.000000000 +0000
@@ -910,7 +910,6 @@ static void drain_pages(unsigned int cpu
{
unsigned long flags;
struct zone *zone;
- int i;
for_each_zone(zone) {
struct per_cpu_pageset *pset;
@@ -984,7 +983,7 @@ void mark_free_pages(struct zone *zone)
/*
* Free a 0-order page
*/
-static void fastcall free_hot_cold_page(struct page *page, int cold)
+void fastcall free_hot_page(struct page *page)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
@@ -1004,10 +1003,7 @@ static void fastcall free_hot_cold_page(
pcp = &zone_pcp(zone, get_cpu())->pcp;
local_irq_save(flags);
__count_vm_event(PGFREE);
- if (cold)
- list_add_tail(&page->lru, &pcp->list);
- else
- list_add(&page->lru, &pcp->list);
+ list_add(&page->lru, &pcp->list);
set_page_private(page, get_pageblock_migratetype(page));
pcp->count++;
if (pcp->count >= pcp->high) {
@@ -1018,16 +1014,6 @@ static void fastcall free_hot_cold_page(
put_cpu();
}
-void fastcall free_hot_page(struct page *page)
-{
- free_hot_cold_page(page, 0);
-}
-
-void fastcall free_cold_page(struct page *page)
-{
- free_hot_cold_page(page, 1);
-}
-
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
@@ -1056,7 +1042,6 @@ static struct page *buffered_rmqueue(str
{
unsigned long flags;
struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);
int cpu;
int migratetype = allocflags_to_migratetype(gfp_flags);
@@ -1075,15 +1060,9 @@ again:
}
/* Find a page of the appropriate migrate type */
- if (cold) {
- list_for_each_entry_reverse(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
- } else {
- list_for_each_entry(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
- }
+ list_for_each_entry(page, &pcp->list, lru)
+ if (page_private(page) == migratetype)
+ break;
/* Allocate more to the pcp list if necessary */
if (unlikely(&page->lru == &pcp->list)) {
@@ -1746,7 +1725,7 @@ void __pagevec_free(struct pagevec *pvec
int i = pagevec_count(pvec);
while (--i >= 0)
- free_hot_cold_page(pvec->pages[i], pvec->cold);
+ free_hot_page(pvec->pages[i]);
}
fastcall void __free_pages(struct page *page, unsigned int order)
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-21 23:58 ` Mel Gorman
@ 2007-11-22 0:06 ` Christoph Lameter
2007-11-22 1:44 ` Mel Gorman
0 siblings, 1 reply; 16+ messages in thread
From: Christoph Lameter @ 2007-11-22 0:06 UTC (permalink / raw)
To: Mel Gorman; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On Wed, 21 Nov 2007, Mel Gorman wrote:
> I didn't think you were going to roll a patch and had queued this
> slightly more aggressive version. I think it is a superset of what your
> patch does.
Looks okay.
Also note that you can avoid mmap_sem cacheline bouncing by having
separate address spaces. Forking a series of processes that then fault
pages each into their own address space will usually do the trick.
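A minimal sketch of that kind of test (instance count, sizes and iteration
counts are placeholder values):

	#include <stdlib.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		int i, ninstances = 4;
		size_t size = 16 * 4096;

		for (i = 0; i < ninstances; i++) {
			if (fork() == 0) {
				/* child has its own mm, so no shared mmap_sem */
				int iter;

				for (iter = 0; iter < 1000; iter++) {
					char *buf = mmap(NULL, size,
							PROT_READ | PROT_WRITE,
							MAP_PRIVATE | MAP_ANONYMOUS,
							-1, 0);
					if (buf == MAP_FAILED)
						_exit(1);
					memset(buf, 1, size);	/* fault every page in */
					munmap(buf, size);
				}
				_exit(0);
			}
		}
		for (i = 0; i < ninstances; i++)
			wait(NULL);
		return 0;
	}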
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-22 0:06 ` Christoph Lameter
@ 2007-11-22 1:44 ` Mel Gorman
2007-11-22 2:20 ` Christoph Lameter
0 siblings, 1 reply; 16+ messages in thread
From: Mel Gorman @ 2007-11-22 1:44 UTC (permalink / raw)
To: Christoph Lameter; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On (21/11/07 16:06), Christoph Lameter didst pronounce:
> On Wed, 21 Nov 2007, Mel Gorman wrote:
>
> > I didn't think you were going to roll a patch and had queued this
> > slightly more agressive version. I think it is a superset of what your
> > patch does.
>
> Looks okay.
>
And the results were better as well. Running one instance per-CPU, the
joined lists ignoring temperature were marginally faster than no-PCPU or
the hotcold-PCPU up to 0.5MB, which roughly corresponds to the sum of the L1
caches of the CPUs. At higher sizes, it starts to look slower but even
at 8MB files, it is by a much smaller amount. With list manipulations,
it is about 0.3 seconds slower. With just the lists joined, it's 0.1
seconds, and I think the patch could simplify the paths more than what we
have currently. The full graph is at
http://www.csn.ul.ie/~mel/postings/percpu-20071121/graph-elm3b6-4instance-fullrange-notemp.ps
> Also note that you can avoid mmap_sem cacheline bouncing by having
> separate address spaces. Forking a series of processes that then fault
> pages each into their own address space will usually do the trick.
The test is already forking for just that reason. I'll see what the results
look like for more CPUs before putting the time into modifying the test for
anonymous mmap()
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* Re: [PATCH] Page allocator: Get rid of the list of cold pages
2007-11-22 1:44 ` Mel Gorman
@ 2007-11-22 2:20 ` Christoph Lameter
0 siblings, 0 replies; 16+ messages in thread
From: Christoph Lameter @ 2007-11-22 2:20 UTC (permalink / raw)
To: Mel Gorman; +Cc: Andrew Morton, linux-mm, apw, Martin Bligh
On Thu, 22 Nov 2007, Mel Gorman wrote:
> And the results were better as well. Running one instance per-CPU, the
> joined lists ignoring temperature were marginally faster than no-PCPU or
> the hotcold-PCPU up to 0.5MB, which roughly corresponds to the sum of the L1
> caches of the CPUs. At higher sizes, it starts to look slower but even
> at 8MB files, it is by a much smaller amount. With list manipulations,
> it is about 0.3 seconds slower. With just the lists joined, it's 0.1
> seconds, and I think the patch could simplify the paths more than what we
> have currently. The full graph is at
Hmmm... This sounds like we could improve the situation by just having
singly-linked lists? The update effort is then much less.
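Something like this, perhaps - purely a sketch, assuming pcp->list becomes
a single head pointer (called pcp->single here, a made-up name) and that
page->lru.next is reused as the link:

	static inline void pcp_push(struct per_cpu_pages *pcp, struct page *page)
	{
		/* two pointer stores instead of list_add()'s four */
		page->lru.next = (struct list_head *)pcp->single;
		pcp->single = page;
		pcp->count++;
	}

	static inline struct page *pcp_pop(struct per_cpu_pages *pcp)
	{
		struct page *page = pcp->single;

		if (page) {
			pcp->single = (struct page *)page->lru.next;
			pcp->count--;
		}
		return page;
	}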
Here is a matrix of page allocator performance (2.6.24-rc2 with
sparsemem) done with my page allocator test from
http://git.kernel.org/?p=linux/kernel/git/christoph/slab.git;a=log;h=tests
All tests are in cycles
Single thread testing
=====================
1. Repeatedly allocate then free test
1000 times alloc_page(,0) -> 616 cycles __free_pages(,0)-> 295 cycles
1000 times alloc_page(,1) -> 576 cycles __free_pages(,1)-> 341 cycles
1000 times alloc_page(,2) -> 712 cycles __free_pages(,2)-> 380 cycles
1000 times alloc_page(,3) -> 966 cycles __free_pages(,3)-> 467 cycles
1000 times alloc_page(,4) -> 1435 cycles __free_pages(,4)-> 662 cycles
1000 times alloc_page(,5) -> 2201 cycles __free_pages(,5)-> 1044 cycles
1000 times alloc_page(,6) -> 3770 cycles __free_pages(,6)-> 2550 cycles
1000 times alloc_page(,7) -> 6781 cycles __free_pages(,7)-> 7652 cycles
1000 times alloc_page(,8) -> 13592 cycles __free_pages(,8)-> 17999 cycles
1000 times alloc_page(,9) -> 27970 cycles __free_pages(,9)-> 36335 cycles
1000 times alloc_page(,10) -> 58586 cycles __free_pages(,10)-> 72323 cycles
2. alloc/free test
1000 times alloc( ,0)/free -> 349 cycles
1000 times alloc( ,1)/free -> 531 cycles
1000 times alloc( ,2)/free -> 571 cycles
1000 times alloc( ,3)/free -> 663 cycles
1000 times alloc( ,4)/free -> 853 cycles
1000 times alloc( ,5)/free -> 1220 cycles
1000 times alloc( ,6)/free -> 2092 cycles
1000 times alloc( ,7)/free -> 3640 cycles
1000 times alloc( ,8)/free -> 6524 cycles
1000 times alloc( ,9)/free -> 12421 cycles
1000 times alloc( ,10)/free -> 30197 cycles
This shows that, actually, order-1 allocations that bypass the pcp lists are
faster! We save the overhead of extracting pages from the buddy lists and
putting them into the pcp.
The alloc/free test shows that the pcp lists are effective when
cache hot.
Concurrent allocs
=================
Page alloc N*alloc N*free(0): 0=8266/8635 1=9667/8129 2=8501/8585 3=9485/8129 4=7870/8635 5=9761/7957 6=7687/8456 7=9749/7681 Average=8873/8276
Page alloc N*alloc N*free(1): 0=28917/22006 1=30057/26753 2=28930/23925 3=30099/26779 4=28845/23717 5=30166/26733 6=28250/23744 7=30149/26677 Average=29427/25042
Page alloc N*alloc N*free(2): 0=25316/23430 1=28749/26527 2=24858/22929 3=28804/26636 4=24871/23368 5=28496/26621 6=25188/22057 7=28730/26228 Average=26877/24725
Page alloc N*alloc N*free(3): 0=22414/23618 1=26397/27478 2=22359/24237 3=26413/27060 4=22328/24021 5=26098/27879 6=22391/23731 7=26322/27802 Average=24340/25728
Page alloc N*alloc N*free(4): 0=24922/26358 1=28126/30480 2=24733/26177 3=28267/30540 4=25016/25688 5=28150/30563 6=24938/24902 7=28247/30650 Average=26550/28170
Page alloc N*alloc N*free(5): 0=25211/27315 1=29504/32577 2=25796/27681 3=29565/32272 4=26056/26588 5=29471/32728 6=25967/26619 7=29447/32744 Average=27627/29816
The difference between order 0 and 1 shows that pcp lists are effective at
reducing zone lock overhead. The difference is a factor of 3 at 8p.
----Fastpath---
Page N*(alloc free)(0): 0=363 1=360 2=379 3=363 4=362 5=363 6=363 7=360 Average=364
Page N*(alloc free)(1): 0=41014 1=44448 2=40416 3=44367 4=40980 5=44411 6=40760 7=44265 Average=42583
Page N*(alloc free)(2): 0=42686 1=45588 2=42202 3=45509 4=42733 5=45561 6=42716 7=45485 Average=44060
Page N*(alloc free)(3): 0=40567 1=43556 2=39699 3=43404 4=40435 5=43274 6=39614 7=43545 Average=41762
Page N*(alloc free)(4): 0=43310 1=45097 2=43326 3=45405 4=43219 5=45372 6=42492 7=45378 Average=44200
Page N*(alloc free)(5): 0=42765 1=45370 2=42029 3=44979 4=42567 5=45336 6=42929 7=45016 Average=43874
This is just allocating and freeing the same page all the time. Here the
pcps are orders of magnitude faster.
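For reference, the fastpath numbers are assumed to come from a loop of this
shape (a simplified sketch, not the actual test code from the git tree above):

	#include <linux/gfp.h>
	#include <linux/kernel.h>
	#include <asm/timex.h>

	static void time_alloc_free_pairs(void)
	{
		cycles_t start;
		int i;

		start = get_cycles();
		for (i = 0; i < 1000; i++) {
			struct page *page = alloc_page(GFP_KERNEL);

			/* the page goes straight back to the pcp... */
			__free_pages(page, 0);
			/* ...and the next alloc_page() picks it right up */
		}
		printk(KERN_INFO "%lu cycles per alloc/free pair\n",
			(unsigned long)(get_cycles() - start) / 1000);
	}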