Subject: [PATCH] 1/2 Reducing fragmentation through better allocation
From: Mel Gorman @ 2005-01-15 19:45 UTC
To: Linux Memory Management List; +Cc: Linux Kernel Mailing List
Changelog since V2
o Do not interfere with the "min" decay
o Update the __GFP_BITS_SHIFT properly. Old value broke fsync and probably
a lot more besides
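
As an aside on the second changelog item: the two new GFP bits introduced below
are 0x10000 and 0x20000, which do not fit under the old 16-bit mask, so
__GFP_BITS_SHIFT has to grow to 18. A minimal userspace sketch (not part of the
patch) that just checks the masking:

#include <stdio.h>

#define __GFP_KERNRCLM 0x10000	/* values from the gfp.h hunk below */
#define __GFP_USERRCLM 0x20000

int main(void)
{
	unsigned int old_mask = (1u << 16) - 1;	/* old __GFP_BITS_MASK */
	unsigned int new_mask = (1u << 18) - 1;	/* new __GFP_BITS_MASK */

	/* With the old mask both new flags are silently stripped */
	printf("old mask keeps KERNRCLM/USERRCLM? %s/%s\n",
	       (__GFP_KERNRCLM & old_mask) ? "yes" : "no",
	       (__GFP_USERRCLM & old_mask) ? "yes" : "no");
	printf("new mask keeps KERNRCLM/USERRCLM? %s/%s\n",
	       (__GFP_KERNRCLM & new_mask) ? "yes" : "no",
	       (__GFP_USERRCLM & new_mask) ? "yes" : "no");
	return 0;
}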
This patch divides allocations into three different types:
UserReclaimable - These are userspace pages that are easily reclaimable. Right
now, I'm putting all allocations of GFP_USER and GFP_HIGHUSER as
well as disk-buffer pages into this category. These pages are trivially
reclaimed by writing them out to swap or syncing them with backing
storage.
KernelReclaimable - These are pages allocated by the kernel that are easily
reclaimed. This is stuff like inode caches, dcache, buffer_heads etc.
These pages could potentially be reclaimed by dumping the
caches and reaping the slabs (drastic, but you get the idea).
KernelNonReclaimable - These are pages allocated by the kernel that are not
trivially reclaimed. For example, the memory allocated for a
loaded module would be in this category. By default, allocations are
considered to be of this type (the sketch below shows how GFP flags
map to these types).
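
GFP_USER and GFP_HIGHUSER gain __GFP_USERRCLM, a handful of kernel caches
(buffer_heads, dcache, the inode caches) pass __GFP_KERNRCLM explicitly, and
everything else stays non-reclaimable. A condensed, userspace-compilable sketch
of the classification __rmqueue() applies (flag values taken from the gfp.h
hunk below):

#include <stdio.h>

#define __GFP_KERNRCLM 0x10000
#define __GFP_USERRCLM 0x20000

enum { ALLOC_KERNNORCLM, ALLOC_KERNRCLM, ALLOC_USERRCLM };

/* Same tests, in the same order, as the new code in __rmqueue() */
static int alloc_type(unsigned int gfp_mask)
{
	if (gfp_mask & __GFP_USERRCLM)
		return ALLOC_USERRCLM;
	if (gfp_mask & __GFP_KERNRCLM)
		return ALLOC_KERNRCLM;
	return ALLOC_KERNNORCLM;	/* default for untagged allocations */
}

int main(void)
{
	static const char *names[] = { "KernNoRclm", "KernRclm", "UserRclm" };

	/* GFP_USER/GFP_HIGHUSER now carry __GFP_USERRCLM */
	printf("user pages      -> %s\n", names[alloc_type(__GFP_USERRCLM)]);
	/* dcache, inode caches and buffer_heads pass __GFP_KERNRCLM */
	printf("dcache/inodes   -> %s\n", names[alloc_type(__GFP_KERNRCLM)]);
	/* anything untagged stays kernel non-reclaimable */
	printf("untagged alloc  -> %s\n", names[alloc_type(0)]);
	return 0;
}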
Instead of having one global MAX_ORDER-sized array of free lists, there are
three, one for each type of allocation. Finally, there is a list of pages of
size 2^MAX_ORDER which is a global pool of the largest pages the kernel deals
with.
Once a 2^MAX_ORDER block of pages is split for a type of allocation, it is
added to the free-lists for that type, in effect reserving it. Hence, over
time, pages of the different types can be clustered together. This means that
if we wanted a 2^MAX_ORDER block of pages, we could linearly scan a block of
pages allocated for UserReclaimable and page each of them out.
Fallback is used when there are no 2^MAX_ORDER pages available and there
are no free pages of the desired type. The fallback lists were chosen in a
way that keeps the most easily reclaimable pages together.
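
For reference, the resulting fallback order is the fallback_allocs[] table in
the page_alloc.c hunk; a small standalone sketch that just prints it:

#include <stdio.h>

enum { ALLOC_KERNNORCLM, ALLOC_KERNRCLM, ALLOC_USERRCLM, ALLOC_TYPES };

/* Same table as fallback_allocs[] below: each row is the order of
 * free-list types tried for the row's allocation type */
static const int fallback_allocs[ALLOC_TYPES][ALLOC_TYPES] = {
	{ ALLOC_KERNNORCLM, ALLOC_KERNRCLM,   ALLOC_USERRCLM },
	{ ALLOC_KERNRCLM,   ALLOC_KERNNORCLM, ALLOC_USERRCLM },
	{ ALLOC_USERRCLM,   ALLOC_KERNNORCLM, ALLOC_KERNRCLM },
};

int main(void)
{
	static const char *names[] = { "KernNoRclm", "KernRclm", "UserRclm" };
	int type, i;

	for (type = 0; type < ALLOC_TYPES; type++) {
		printf("%-10s tries:", names[type]);
		for (i = 0; i < ALLOC_TYPES; i++)
			printf(" %s", names[fallback_allocs[type][i]]);
		printf("\n");
	}
	return 0;
}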
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/buffer.c linux-2.6.11-rc1-mbuddy/fs/buffer.c
--- linux-2.6.11-rc1-clean/fs/buffer.c 2005-01-12 04:01:23.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/buffer.c 2005-01-13 10:56:30.000000000 +0000
@@ -1134,7 +1134,8 @@ grow_dev_page(struct block_device *bdev,
struct page *page;
struct buffer_head *bh;
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS | __GFP_USERRCLM);
if (!page)
return NULL;
@@ -2997,7 +2998,8 @@ static void recalc_bh_state(void)
struct buffer_head *alloc_buffer_head(int gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+ gfp_flags|__GFP_KERNRCLM);
if (ret) {
preempt_disable();
__get_cpu_var(bh_accounting).nr++;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/dcache.c linux-2.6.11-rc1-mbuddy/fs/dcache.c
--- linux-2.6.11-rc1-clean/fs/dcache.c 2005-01-12 04:00:09.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/dcache.c 2005-01-13 10:56:30.000000000 +0000
@@ -715,7 +715,8 @@ struct dentry *d_alloc(struct dentry * p
struct dentry *dentry;
char *dname;
- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc(dentry_cache,
+ GFP_KERNEL|__GFP_KERNRCLM);
if (!dentry)
return NULL;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ext2/super.c linux-2.6.11-rc1-mbuddy/fs/ext2/super.c
--- linux-2.6.11-rc1-clean/fs/ext2/super.c 2005-01-12 04:01:24.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ext2/super.c 2005-01-13 10:56:30.000000000 +0000
@@ -137,7 +137,7 @@ static kmem_cache_t * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+ ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ext3/super.c linux-2.6.11-rc1-mbuddy/fs/ext3/super.c
--- linux-2.6.11-rc1-clean/fs/ext3/super.c 2005-01-12 04:02:11.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ext3/super.c 2005-01-13 10:56:30.000000000 +0000
@@ -434,7 +434,7 @@ static struct inode *ext3_alloc_inode(st
{
struct ext3_inode_info *ei;
- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ntfs/inode.c linux-2.6.11-rc1-mbuddy/fs/ntfs/inode.c
--- linux-2.6.11-rc1-clean/fs/ntfs/inode.c 2005-01-12 04:01:45.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ntfs/inode.c 2005-01-13 10:56:30.000000000 +0000
@@ -318,7 +318,7 @@ struct inode *ntfs_alloc_big_inode(struc
ntfs_debug("Entering.");
ni = (ntfs_inode *)kmem_cache_alloc(ntfs_big_inode_cache,
- SLAB_NOFS);
+ SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -343,7 +343,8 @@ static inline ntfs_inode *ntfs_alloc_ext
ntfs_inode *ni;
ntfs_debug("Entering.");
- ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+ ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache,
+ SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return ni;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/include/linux/gfp.h linux-2.6.11-rc1-mbuddy/include/linux/gfp.h
--- linux-2.6.11-rc1-clean/include/linux/gfp.h 2005-01-12 04:00:35.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/include/linux/gfp.h 2005-01-15 18:16:47.000000000 +0000
@@ -38,21 +38,24 @@ struct vm_area_struct;
#define __GFP_NO_GROW 0x2000 /* Slab internal usage */
#define __GFP_COMP 0x4000 /* Add compound page metadata */
#define __GFP_ZERO 0x8000 /* Return zeroed page on success */
+#define __GFP_KERNRCLM 0x10000 /* Kernel page that is easily reclaimable */
+#define __GFP_USERRCLM 0x20000 /* Userspace page that is easily reclaimable */
-#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 18 /* Room for 18 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
/* if you forget to add the bitmask here kernel will crash, period */
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
- __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
+ __GFP_USERRCLM|__GFP_KERNRCLM)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_USERRCLM)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_USERRCLM)
/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/include/linux/mmzone.h linux-2.6.11-rc1-mbuddy/include/linux/mmzone.h
--- linux-2.6.11-rc1-clean/include/linux/mmzone.h 2005-01-12 04:01:17.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/include/linux/mmzone.h 2005-01-13 14:24:27.000000000 +0000
@@ -19,6 +19,10 @@
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
+#define ALLOC_TYPES 3
+#define ALLOC_KERNNORCLM 0
+#define ALLOC_KERNRCLM 1
+#define ALLOC_USERRCLM 2
struct free_area {
struct list_head free_list;
@@ -131,8 +135,37 @@ struct zone {
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];
+ /*
+ * There are ALLOC_TYPES sets of MAX_ORDER free lists. Once a
+ * MAX_ORDER block of pages has been split for an allocation type,
+ * the whole block is reserved for that type of allocation. The
+ * types are User Reclaimable, Kernel Reclaimable and Kernel
+ * Non-reclaimable. The objective is to reduce fragmentation
+ * overall
+ */
+ struct free_area free_area_lists[ALLOC_TYPES][MAX_ORDER];
+
+ /*
+ * This is a list of page blocks of 2^MAX_ORDER. Once one of
+ * these is split, the buddy is added to the appropriate
+ * free_area_lists. When the buddies are later merged, they
+ * are placed back here
+ */
+ struct free_area free_area_global;
+
+ /*
+ * This map tracks what each 2^MAX_ORDER sized block has been used for.
+ * Each 2^MAX_ORDER block of pages has 2 bits in this map to remember
+ * what the block is for. When a page is freed, its index within this
+ * bitmap is calculated using (address >> MAX_ORDER) * 2. This means
+ * that pages will always be freed into the correct list in
+ * free_area_lists
+ *
+ * The bits are set when a 2^MAX_ORDER block of pages is split
+ */
+
+ unsigned long *free_area_usemap;
ZONE_PADDING(_pad1_)
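
(Aside, not part of the patch: a worked example of the usemap indexing
described in the comment above. MAX_ORDER of 11 is only an assumption for
the numbers; it matches common configurations.)

#include <stdio.h>

#define MAX_ORDER 11	/* assumed for the example */

int main(void)
{
	/* offset of a page within its zone's mem_map */
	unsigned long page_offset = 5000;

	unsigned long pageidx = page_offset >> MAX_ORDER;	/* 5000 / 2048 -> block 2 */
	unsigned long bitidx  = pageidx * 2;			/* this block's bits: 4 and 5 */

	printf("page %lu is in MAX_ORDER block %lu; usemap bit %lu = KernRclm, "
	       "bit %lu = UserRclm, both clear = KernNoRclm\n",
	       page_offset, pageidx, bitidx, bitidx + 1);
	return 0;
}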
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/mm/page_alloc.c linux-2.6.11-rc1-mbuddy/mm/page_alloc.c
--- linux-2.6.11-rc1-clean/mm/page_alloc.c 2005-01-12 04:00:02.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/mm/page_alloc.c 2005-01-15 18:10:54.000000000 +0000
@@ -46,9 +46,30 @@ unsigned long totalhigh_pages;
long nr_swap_pages;
int sysctl_lower_zone_protection = 0;
+/* Bean counters for the per-type buddy allocator */
+int fallback_count[ALLOC_TYPES] = { 0, 0, 0};
+int global_steal=0;
+int global_refill=0;
+int kernnorclm_count=0;
+int kernrclm_count=0;
+int userrclm_count=0;
+
EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
+/**
+ * The allocator tries to put allocations of the same type in the
+ * same 2^MAX_ORDER blocks of pages. When memory is low, this may
+ * not be possible so this describes what order they should fall
+ * back on
+ */
+int fallback_allocs[ALLOC_TYPES][ALLOC_TYPES] = {
+ { ALLOC_KERNNORCLM, ALLOC_KERNRCLM, ALLOC_USERRCLM },
+ { ALLOC_KERNRCLM, ALLOC_KERNNORCLM, ALLOC_USERRCLM },
+ { ALLOC_USERRCLM, ALLOC_KERNNORCLM, ALLOC_KERNRCLM }
+};
+
+
/*
* Used by page_zone() to look up the address of the struct zone whose
* id is encoded in the upper bits of page->flags
@@ -57,6 +78,7 @@ struct zone *zone_table[1 << (ZONES_SHIF
EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *type_names[ALLOC_TYPES] = { "KernNoRclm", "KernRclm", "UserRclm"};
int min_free_kbytes = 1024;
unsigned long __initdata nr_kernel_pages;
@@ -103,6 +125,48 @@ static void bad_page(const char *functio
tainted |= TAINT_BAD_PAGE;
}
+/*
+ * Return the allocation type that the 2^MAX_ORDER block of pages
+ * containing the given page is being used for
+ */
+static int get_pageblock_type(struct page *page) {
+ struct zone *zone = page_zone(page);
+ int pageidx = (page - zone->zone_mem_map) >> MAX_ORDER;
+ int bitidx = pageidx * 2;
+
+ /* Bit 1 will be set if the block is kernel reclaimable */
+ if (test_bit(bitidx,zone->free_area_usemap)) return ALLOC_KERNRCLM;
+
+ /* Bit 2 will be set if the block is user reclaimable */
+ if (test_bit(bitidx+1, zone->free_area_usemap)) return ALLOC_USERRCLM;
+
+ return ALLOC_KERNNORCLM;
+}
+
+static void set_pageblock_type(struct page *page, int type) {
+ int bit1, bit2;
+ struct zone *zone = page_zone(page);
+ int pageidx = (page - zone->zone_mem_map) >> MAX_ORDER;
+ int bitidx = pageidx * 2;
+ bit1 = bit2 = 0;
+
+ if (type == ALLOC_KERNRCLM) {
+ set_bit(bitidx, zone->free_area_usemap);
+ clear_bit(bitidx+1, zone->free_area_usemap);
+ return;
+ }
+
+ if (type == ALLOC_USERRCLM) {
+ clear_bit(bitidx, zone->free_area_usemap);
+ set_bit(bitidx+1, zone->free_area_usemap);
+ return;
+ }
+
+ clear_bit(bitidx, zone->free_area_usemap);
+ clear_bit(bitidx+1, zone->free_area_usemap);
+
+}
+
#ifndef CONFIG_HUGETLB_PAGE
#define prep_compound_page(page, order) do { } while (0)
#define destroy_compound_page(page, order) do { } while (0)
@@ -231,6 +295,7 @@ static inline void __free_pages_bulk (st
unsigned long page_idx;
struct page *coalesced;
int order_size = 1 << order;
+ struct free_area *area;
if (unlikely(order))
destroy_compound_page(page, order);
@@ -240,9 +305,12 @@ static inline void __free_pages_bulk (st
BUG_ON(page_idx & (order_size - 1));
BUG_ON(bad_range(zone, page));
+ /* Select the area to use for freeing based on the type */
+ struct free_area *freelist =
+ zone->free_area_lists[get_pageblock_type(page)];
+
zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
- struct free_area *area;
struct page *buddy;
int buddy_idx;
@@ -254,16 +322,29 @@ static inline void __free_pages_bulk (st
break;
/* Move the buddy up one level. */
list_del(&buddy->lru);
- area = zone->free_area + order;
+ area = freelist + order;
area->nr_free--;
rmv_page_order(buddy);
page_idx &= buddy_idx;
order++;
}
+
+ /*
+ * If a MAX_ORDER block of pages is being freed, it is
+ * no longer reserved for a particular type of allocation
+ * so put it in the global list
+ */
+ if (order >= MAX_ORDER-1) {
+ area = &(zone->free_area_global);
+ global_refill++;
+ } else {
+ area = freelist + order;
+ }
+
coalesced = base + page_idx;
set_page_order(coalesced, order);
- list_add(&coalesced->lru, &zone->free_area[order].free_list);
- zone->free_area[order].nr_free++;
+ list_add(&coalesced->lru, &area->free_list);
+ area->nr_free++;
}
static inline void free_pages_check(const char *function, struct page *page)
@@ -310,6 +391,7 @@ free_pages_bulk(struct zone *zone, int c
zone->pages_scanned = 0;
while (!list_empty(list) && count--) {
page = list_entry(list->prev, struct page, lru);
+
/* have to delete it as __free_pages_bulk list manipulates */
list_del(&page->lru);
__free_pages_bulk(page, base, zone, order);
@@ -420,14 +502,36 @@ static void prep_new_page(struct page *p
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static struct page *__rmqueue(struct zone *zone, unsigned int order, int flags)
{
struct free_area * area;
unsigned int current_order;
struct page *page;
+ int global_split=0;
+
+ /* Select area to use based on gfp_flags */
+ int alloctype;
+ int retry_count=0;
+ if (flags & __GFP_USERRCLM) {
+ alloctype = ALLOC_USERRCLM;
+ userrclm_count++;
+ }
+ else if (flags & __GFP_KERNRCLM) {
+ alloctype = ALLOC_KERNRCLM;
+ kernrclm_count++;
+ } else {
+ alloctype = ALLOC_KERNNORCLM;
+ kernnorclm_count++;
+ }
+ /* Ok, pick the fallback order based on the type */
+ int *fallback_list = fallback_allocs[alloctype];
+
+retry:
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
+ alloctype = fallback_list[retry_count];
+ area = zone->free_area_lists[alloctype] + current_order;
+
if (list_empty(&area->free_list))
continue;
@@ -439,6 +543,34 @@ static struct page *__rmqueue(struct zon
return expand(zone, page, order, current_order, area);
}
+ /* Take from the global pool if this is the first attempt */
+ if (!global_split && !list_empty(&(zone->free_area_global.free_list))){
+ /*
+ * Remove a MAX_ORDER block from the global pool and add
+ * it to the list of desired alloc_type
+ */
+ page = list_entry(zone->free_area_global.free_list.next,
+ struct page, lru);
+ list_del(&page->lru);
+ list_add(&page->lru,
+ &(zone->free_area_lists[alloctype][MAX_ORDER-1].free_list));
+ global_steal++;
+ global_split=1;
+
+ /* Mark this block of pages as for use with this alloc type */
+ set_pageblock_type(page, alloctype);
+
+ goto retry;
+ }
+
+ /*
+ * Here, the alloc type lists have been depleted as well as the global
+ * pool, so fallback
+ */
+ retry_count++;
+ fallback_count[alloctype]++;
+ if (retry_count != ALLOC_TYPES) goto retry;
+
return NULL;
}
@@ -448,7 +580,8 @@ static struct page *__rmqueue(struct zon
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list,
+ int gfp_flags)
{
unsigned long flags;
int i;
@@ -457,7 +590,7 @@ static int rmqueue_bulk(struct zone *zon
spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, gfp_flags);
if (page == NULL)
break;
allocated++;
@@ -493,7 +626,7 @@ static void __drain_pages(unsigned int c
void mark_free_pages(struct zone *zone)
{
unsigned long zone_pfn, flags;
- int order;
+ int order, type;
struct list_head *curr;
if (!zone->spanned_pages)
@@ -503,14 +636,17 @@ void mark_free_pages(struct zone *zone)
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
- for (order = MAX_ORDER - 1; order >= 0; --order)
- list_for_each(curr, &zone->free_area[order].free_list) {
- unsigned long start_pfn, i;
+ for (type=0; type < ALLOC_TYPES; type++) {
- start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
+ for (order = MAX_ORDER - 1; order >= 0; --order)
+ list_for_each(curr, &zone->free_area_lists[type][order].free_list) {
+ unsigned long start_pfn, i;
+
+ start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
- for (i=0; i < (1<<order); i++)
- SetPageNosaveFree(pfn_to_page(start_pfn+i));
+ for (i=0; i < (1<<order); i++)
+ SetPageNosaveFree(pfn_to_page(start_pfn+i));
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -612,14 +748,15 @@ buffered_rmqueue(struct zone *zone, int
struct page *page = NULL;
int cold = !!(gfp_flags & __GFP_COLD);
- if (order == 0) {
+ if (order == 0 && (gfp_flags & __GFP_USERRCLM)) {
struct per_cpu_pages *pcp;
pcp = &zone->pageset[get_cpu()].pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list,
+ gfp_flags);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
@@ -631,7 +768,7 @@ buffered_rmqueue(struct zone *zone, int
if (page == NULL) {
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, gfp_flags);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -669,7 +806,11 @@ int zone_watermark_ok(struct zone *z, in
return 0;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
+ free_pages -= (
+ z->free_area_lists[ALLOC_KERNNORCLM][o].nr_free +
+ z->free_area_lists[ALLOC_KERNRCLM][o].nr_free +
+ z->free_area_lists[ALLOC_USERRCLM][o].nr_free
+ ) << o;
/* Require fewer higher order pages to be free */
min >>= 1;
@@ -1124,6 +1265,7 @@ void show_free_areas(void)
unsigned long inactive;
unsigned long free;
struct zone *zone;
+ int type;
for_each_zone(zone) {
show_node(zone);
@@ -1216,8 +1358,10 @@ void show_free_areas(void)
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr = zone->free_area[order].nr_free;
- total += nr << order;
+ for (type=0; type < ALLOC_TYPES; type++) {
+ nr = zone->free_area_lists[type][order].nr_free;
+ total += nr << order;
+ }
printk("%lu*%lukB ", nr, K(1UL) << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -1515,10 +1659,22 @@ void zone_init_free_lists(struct pglist_
unsigned long size)
{
int order;
- for (order = 0; order < MAX_ORDER ; order++) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list);
- zone->free_area[order].nr_free = 0;
+ int type;
+ struct free_area *area;
+
+ /* Initialise the three size-ordered lists of free areas */
+ for (type=0; type < ALLOC_TYPES; type++) {
+ for (order = 0; order < MAX_ORDER; order++) {
+ area = zone->free_area_lists[type];
+
+ INIT_LIST_HEAD(&area[order].free_list);
+ area[order].nr_free = 0;
+ }
}
+
+ /* Initialise the global pool of 2^MAX_ORDER blocks of pages */
+ INIT_LIST_HEAD(&zone->free_area_global.free_list);
+ zone->free_area_global.nr_free=0;
}
#ifndef __HAVE_ARCH_MEMMAP_INIT
@@ -1539,6 +1695,7 @@ static void __init free_area_init_core(s
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
int cpu, nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
+ unsigned long usemapsize;
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
@@ -1637,6 +1794,22 @@ static void __init free_area_init_core(s
zone_start_pfn += size;
zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+ /* Calculate size of required bitmap */
+ /* - Number of MAX_ORDER blocks in the zone */
+ usemapsize = (size + (1 << MAX_ORDER)) >> MAX_ORDER;
+
+ /* - Two bits to record what type of block it is */
+ usemapsize = (usemapsize * 2 + 8) / 8;
+
+ zone->free_area_usemap =
+ (unsigned long *)alloc_bootmem_node(pgdat, usemapsize);
+
+ memset((unsigned long *)zone->free_area_usemap,
+ ALLOC_KERNNORCLM, usemapsize);
+
+ printk(KERN_DEBUG " %s zone: %lu pages, %lu real pages, usemap size:%lu\n",
+ zone_names[j], size, realsize, usemapsize);
}
}
@@ -1714,19 +1887,90 @@ static int frag_show(struct seq_file *m,
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
- int order;
+ int order, type;
+ struct list_head *elem;
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!zone->present_pages)
- continue;
- spin_lock_irqsave(&zone->lock, flags);
- seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
- for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
- spin_unlock_irqrestore(&zone->lock, flags);
- seq_putc(m, '\n');
- }
+ /* Show global fragmentation statistics */
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ unsigned long nr_bufs = 0;
+ for (order = 0; order < MAX_ORDER-1; ++order) {
+ nr_bufs = 0;
+
+ for (type=0; type < ALLOC_TYPES; type++) {
+ list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+ ++nr_bufs;
+ }
+ seq_printf(m, "%6lu ", nr_bufs);
+ }
+
+ /* Scan global list */
+ nr_bufs = 0;
+ list_for_each(elem, &(zone->free_area_global.free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+
+ /* Show statistics for each allocation type */
+ seq_printf(m, "\nPer-allocation-type statistics");
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ unsigned long nr_bufs = 0;
+ for (type=0; type < ALLOC_TYPES; type++) {
+ seq_printf(m, "\nNode %d, zone %8s, type %10s",
+ pgdat->node_id, zone->name,
+ type_names[type]);
+ struct list_head *elem;
+ for (order = 0; order < MAX_ORDER; ++order) {
+ nr_bufs = 0;
+
+ list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+ }
+ }
+
+ /* Scan global list */
+ seq_printf(m, "\n");
+ seq_printf(m, "Node %d, zone %8s, type %10s",
+ pgdat->node_id, zone->name,
+ "MAX_ORDER");
+ nr_bufs = 0;
+ list_for_each(elem, &(zone->free_area_global.free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+
+ /* Show bean counters */
+ seq_printf(m, "\nGlobal beancounters\n");
+ seq_printf(m, "Global steals: %d\n", global_steal);
+ seq_printf(m, "Global refills: %d\n", global_refill);
+ seq_printf(m, "KernNoRclm allocs: %d\n", kernnorclm_count);
+ seq_printf(m, "KernRclm allocs: %d\n", kernrclm_count);
+ seq_printf(m, "UserRclm allocs: %d\n", userrclm_count);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[0],
+ fallback_count[0]);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[1],
+ fallback_count[1]);
+ seq_printf(m, "%-10s Fallback count: %d\n", type_names[2],
+ fallback_count[2]);
+
+
+
return 0;
}
Subject: [PATCH] 2/2 Satisfy high-order allocations with linear scan
From: Mel Gorman @ 2005-01-15 19:46 UTC
To: Linux Memory Management List; +Cc: Linux Kernel Mailing List
This patch must be applied on top of the modified allocator patch.
The purpose of this patch is to linearly scan the address space when a high-order
allocation cannot be satisfied. It introduces try_to_free_highorder_pages(),
which takes the same parameters as try_to_free_pages().
The concept is quite simple. Once a process enters here, it is given a
struct reclaim_task to record its progress and prevent multiple processes
from reclaiming the same memory. It scans the zones in the usual fallback list
and finds a block of memory that is being used for UserRclm or KernRclm
allocations. Once a block is found, it finds all LRU pages in that block,
removes them from the LRU lists and asks shrink_list() to reclaim them.
Once enough pages have been freed, it returns and tries to allocate the high-order
block again. Statistics are maintained on the number of successful and failed linear
scans and printed in /proc/buddyinfo.
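
As a rough, userspace-only sketch of the block-selection step (the wrap-around
search over the usemap that find_startblock() below performs; the check that no
other reclaim task is already scanning the block is omitted):

#include <stdio.h>

enum { KERNNORCLM, KERNRCLM, USERRCLM };

/* Return the index of the next UserRclm/KernRclm block at or after 'start',
 * wrapping around at the end of the zone, or -1 if there is none */
static int find_start(const int *blocktype, int nr_blocks, int start)
{
	int i = start;

	do {
		if (blocktype[i] == USERRCLM || blocktype[i] == KERNRCLM)
			return i;		/* reclaimable block: scan its pages */
		if (++i >= nr_blocks)
			i = 0;			/* wrap around */
	} while (i != start);

	return -1;				/* nothing reclaimable in this zone */
}

int main(void)
{
	int usemap[8] = { KERNNORCLM, KERNNORCLM, USERRCLM, KERNNORCLM,
			  KERNRCLM,   KERNNORCLM, KERNNORCLM, USERRCLM };

	/* starting from the remembered index 3, block 4 (KernRclm) is chosen */
	printf("next block to scan: %d\n", find_start(usemap, 8, 3));
	return 0;
}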
This patch is *highly* experimental and there are all sorts of checks that
I should be making but don't. For example, this patch does not account for
"holes" in the zone when it is scanning, so it could break architectures
that have holes in the middle of the mem_map. Hence, this is proof-of-concept
only.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-mbuddy/include/linux/swap.h linux-2.6.11-rc1-lnscan/include/linux/swap.h
--- linux-2.6.11-rc1-mbuddy/include/linux/swap.h 2005-01-13 10:53:56.000000000 +0000
+++ linux-2.6.11-rc1-lnscan/include/linux/swap.h 2005-01-15 18:33:14.000000000 +0000
@@ -173,6 +173,7 @@ extern void swap_setup(void);
/* linux/mm/vmscan.c */
extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
+extern int try_to_free_highorder_pages(struct zone **, unsigned int, unsigned int);
extern int shrink_all_memory(int);
extern int vm_swappiness;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-mbuddy/mm/page_alloc.c linux-2.6.11-rc1-lnscan/mm/page_alloc.c
--- linux-2.6.11-rc1-mbuddy/mm/page_alloc.c 2005-01-15 18:10:54.000000000 +0000
+++ linux-2.6.11-rc1-lnscan/mm/page_alloc.c 2005-01-15 18:33:14.000000000 +0000
@@ -53,6 +53,8 @@ int global_refill=0;
int kernnorclm_count=0;
int kernrclm_count=0;
int userrclm_count=0;
+int lnscan_success=0;
+int lnscan_fail=0;
EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
@@ -850,6 +852,7 @@ __alloc_pages(unsigned int gfp_mask, uns
int alloc_type;
int do_retry;
int can_try_harder;
+ int tried_highorder=0;
might_sleep_if(wait);
@@ -926,6 +929,7 @@ rebalance:
p->flags &= ~PF_MEMALLOC;
/* go through the zonelist yet one more time */
+realloc:
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_min,
alloc_type, can_try_harder,
@@ -933,8 +937,17 @@ rebalance:
continue;
page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
+ if (page) {
+ if (tried_highorder) lnscan_success++;
goto got_pg;
+ }
+ if (tried_highorder) lnscan_fail++;
+ }
+
+ if (!tried_highorder && order > 0) {
+ try_to_free_highorder_pages(zones, gfp_mask, order);
+ tried_highorder=1;
+ goto realloc;
}
/*
@@ -1962,6 +1975,9 @@ static int frag_show(struct seq_file *m,
seq_printf(m, "KernNoRclm allocs: %d\n", kernnorclm_count);
seq_printf(m, "KernRclm allocs: %d\n", kernrclm_count);
seq_printf(m, "UserRclm allocs: %d\n", userrclm_count);
+ seq_printf(m, "lnscan success: %d\n", lnscan_success);
+ seq_printf(m, "lnscan fail: %d\n", lnscan_fail);
+
seq_printf(m, "%-10s Fallback count: %d\n", type_names[0],
fallback_count[0]);
seq_printf(m, "%-10s Fallback count: %d\n", type_names[1],
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-mbuddy/mm/vmscan.c linux-2.6.11-rc1-lnscan/mm/vmscan.c
--- linux-2.6.11-rc1-mbuddy/mm/vmscan.c 2005-01-13 10:54:05.000000000 +0000
+++ linux-2.6.11-rc1-lnscan/mm/vmscan.c 2005-01-15 18:54:14.000000000 +0000
@@ -372,6 +372,7 @@ static int shrink_list(struct list_head
BUG_ON(PageActive(page));
sc->nr_scanned++;
+
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
@@ -1271,4 +1272,299 @@ static int __init kswapd_init(void)
return 0;
}
+/***
+ * Below this point is a different type of scanner. It is a linear scanner
+ * that tries to find large blocks of pages to be freed. Very experimental
+ * at the moment
+ *
+ * --mel
+ */
+
+/*
+ * Describes a single reclaim task. Only one process is allowed to scan
+ * a single 2^MAX_ORDER block of pages at a time. The active reclaims are
+ * kept on a reclaim_task linked list protected by reclaims_lock
+ */
+struct reclaim_task {
+ struct zone* zone;
+ struct list_head list;
+ int index;
+};
+
+struct list_head active_reclaims;
+spinlock_t reclaims_lock;
+
+/*
+ * Remember the last index scanned in a zone so the same zones do not get
+ * scanned repeatedly. This is not perfect at all as the index used is the
+ * zone index of the fallback list, not the real zone or node ID. Needs
+ * a better solution but will do for this proof of concept
+ */
+int last_reclaim_index[MAX_NR_ZONES];
+
+/*
+ * Find a block to start reclaiming pages from. Returns 1 when a suitable
+ * block is found
+ */
+int find_startblock(int zoneid, struct reclaim_task *rtask) {
+ int startindex;
+ int retval=1;
+
+ /* rtask->index will be -1 the first time a reclaim task is created */
+ if (rtask->index == -1) {
+ startindex = last_reclaim_index[zoneid];
+ rtask->index = startindex;
+ } else {
+ startindex = rtask->index;
+ }
+
+ /* Be sure we do not start past the zone boundary */
+ if (rtask->index >= (rtask->zone->present_pages >> MAX_ORDER)-1) {
+ rtask->index=startindex=0;
+ }
+
+
+ do {
+ int bitidx = rtask->index*2;
+
+ /* Test if this region is for UserRclm or KernRclm pages */
+ if (test_bit(bitidx, rtask->zone->free_area_usemap) ||
+ test_bit(bitidx+1,rtask->zone->free_area_usemap)) {
+ struct list_head *curr;
+ struct reclaim_task *ltask;
+ int success=1;
+
+ /* Make sure no other task is scanning here */
+ spin_lock(&reclaims_lock);
+ list_for_each(curr, &active_reclaims) {
+ ltask = list_entry(curr,
+ struct reclaim_task,
+ list);
+ if (ltask->index == rtask->index &&
+ ltask != rtask) {
+ success=0;
+ break;
+ }
+ }
+ spin_unlock(&reclaims_lock);
+
+ if (success) {
+ last_reclaim_index[zoneid] = rtask->index;
+ goto out;
+ }
+ }
+
+ /* Wrap around when we reach the end of the zone */
+ if (++rtask->index >=
+ (rtask->zone->present_pages >> MAX_ORDER)-1)
+ rtask->index=0;
+
+ } while (rtask->index != startindex);
+
+ /* Failed to find a suitable block */
+ retval = 0;
+out:
+ return retval;
+}
+
+/*
+ * Tries to free 2^MAX_ORDER blocks of pages starting from page
+ */
+int try_to_free_highorder_block(struct reclaim_task *rtask,
+ unsigned int order, int gfp_mask) {
+ struct page *page;
+ struct page *endpage;
+ struct page *endpageblock;
+ LIST_HEAD(free_canditates);
+ int nr_freed=0;
+ struct scan_control sc;
+ struct pagevec pvec;
+ int tryfree;
+ struct zone* zone = rtask->zone;
+
+ page = zone->zone_mem_map + (rtask->index * (1 << MAX_ORDER));
+ endpageblock = page + (1 << MAX_ORDER);
+ if (endpageblock >
+ zone->zone_mem_map + zone->present_pages)
+ endpageblock =
+ zone->zone_mem_map + zone->present_pages;
+
+retry:
+ endpage = page + (1 << order);
+ if (endpage > endpageblock) endpage = endpageblock;
+
+ /* Scan the whole block of pages */
+ tryfree=0;
+ spin_lock_irq(&zone->lru_lock);
+ while (page <= endpage && page <= endpageblock) {
+
+ /*
+ * Only free LRU pages for now. In the future, we would also
+ * want to scan slab pages here. The problem with slab pages
+ * is that the lru list cannot be used as it is already used
+ * to track what slab and cache it belongs to
+ */
+ if (PageLRU(page)) {
+ list_del(&page->lru);
+ if (!TestClearPageLRU(page)) BUG();
+
+ /*
+ * This will disrupt LRU ordering but in this path,
+ * we don't really care
+ */
+ ClearPageActive(page);
+
+ if (get_page_testone(page)) {
+ /*
+ * It is being freed elsewhere. Put back on
+ * LRU, move to next page
+ */
+ __put_page(page);
+ SetPageLRU(page);
+ list_add(&page->lru,
+ &page_zone(page)->inactive_list);
+ page++;
+ continue;
+ }
+
+ list_add(&page->lru, &free_canditates);
+ tryfree++;
+ }
+
+ /* Move to next page to test */
+ page++;
+ }
+ spin_unlock_irq(&zone->lru_lock);
+
+ /* Make sure pages were found */
+ if (list_empty(&free_canditates)) {
+ /* See if it is worth going back again */
+ if (endpageblock - endpage > (1 << order)) goto retry;
+ return nr_freed;
+ }
+
+ /* Construct a scan_control for shrink_list */
+ sc.gfp_mask = gfp_mask;
+ sc.may_writepage = 1;
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
+ sc.priority = 0;
+
+ nr_freed += shrink_list(&free_canditates, &sc);
+ if (list_empty(&free_canditates)) {
+ int tofree;
+ if (nr_freed >= (1 << order)) return nr_freed;
+
+ /* Not enough free, worth going back? */
+ tofree = (1 << order) - nr_freed;
+ if (endpageblock - endpage > tofree) goto retry;
+ return nr_freed;
+ }
+
+ /*
+ * Free up leftover pages
+ */
+ tryfree=0;
+ pagevec_init(&pvec, 1);
+ spin_lock_irq(&zone->lru_lock);
+ while (!list_empty(&free_canditates)) {
+ page = lru_to_page(&free_canditates);
+ tryfree++;
+ if (TestSetPageLRU(page))
+ BUG();
+ list_del(&page->lru);
+ if (PageActive(page))
+ add_page_to_active_list(rtask->zone, page);
+ else
+ add_page_to_inactive_list(rtask->zone, page);
+ if (!pagevec_add(&pvec, page)) {
+ spin_unlock_irq(&rtask->zone->lru_lock);
+ __pagevec_release(&pvec);
+ spin_lock_irq(&rtask->zone->lru_lock);
+ }
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+ return nr_freed;
+
+}
+
+/**
+ * try_to_free_highorder_pages - Linearly scan and reclaim for high orders
+ * zones - Fallback list of zones to scan
+ * gfp_flags - Flags used for the allocation
+ * order - The order-size block of pages to free
+ *
+ * Warning, this is potentially a very long running function that has no
+ * guarantee of success.
+ */
+int try_to_free_highorder_pages(struct zone** zones,
+ unsigned int gfp_flags,
+ unsigned int order) {
+
+ int i, nr_freed;
+ struct reclaim_task *rtask = NULL;
+
+ /* Make sure the right flags are set to do the required work */
+ if (!(gfp_flags & __GFP_WAIT) ||
+ !(gfp_flags & __GFP_IO) ||
+ !(gfp_flags & __GFP_FS)) return 0;
+
+ /*
+ * Create a reclaim task to track our progress. The reclaim task
+ * is added now before we start to make sure there is exclusively
+ * one task scanning a 2^MAX_ORDER block
+ */
+ rtask = kmalloc(sizeof(struct reclaim_task), gfp_flags);
+ memset(rtask, 0, sizeof(struct reclaim_task));
+ spin_lock(&reclaims_lock);
+ list_add(&rtask->list, &active_reclaims);
+ spin_unlock(&reclaims_lock);
+ nr_freed=0;
+
+ for (i = 0; zones[i] != NULL && nr_freed < (1 << order); i++) {
+ int scanblocks;
+
+ /* Reset the index for a new zone */
+ rtask->index=-1;
+ rtask->zone = zones[i];
+
+ /*
+ * Try to scan up to 16 blocks before changing zones. There
+ * is no significance to the number 16
+ */
+ for (scanblocks=16;
+ scanblocks >= 0 && nr_freed<(1 << order) ;
+ scanblocks--) {
+
+ if (!find_startblock(i, rtask)) break;
+
+ /* Try and free the block found by find_startblock */
+ nr_freed += try_to_free_highorder_block(rtask,
+ order,
+ gfp_flags);
+
+ rtask->index++;
+
+ }
+
+ }
+
+ spin_lock(&reclaims_lock);
+ list_del(&rtask->list);
+ spin_unlock(&reclaims_lock);
+ kfree(rtask);
+ return nr_freed;
+
+}
+
+static int __init lnscan_init(void) {
+ printk("Initialising lnscan\n");
+ spin_lock_init(&reclaims_lock);
+ INIT_LIST_HEAD(&active_reclaims);
+ memset(last_reclaim_index, 0, sizeof(last_reclaim_index));
+ return 0;
+}
+
module_init(kswapd_init)
+module_init(lnscan_init);