* [PATCH 1/8] Fragmentation Avoidance V17: 001_antidefrag_flags
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
@ 2005-10-11 15:12 ` Mel Gorman
2005-10-11 15:12 ` [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap Mel Gorman
` (7 subsequent siblings)
8 siblings, 0 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-11 15:12 UTC (permalink / raw)
To: akpm; +Cc: jschopp, Mel Gorman, kravetz, linux-kernel, linux-mm, lhms-devel
This patch adds two flags, __GFP_USER and __GFP_KERNRCLM, that are used to track
the type of allocation the caller is making. Allocations using the __GFP_USER
flag are expected to be easily reclaimed by syncing with backing storage (be
it a file or swap) or cleaning the buffers and discarding. Allocations using
the __GFP_KERNRCLM flag belong to slab caches that can be shrunk by the kernel.
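For example (both lines are taken from the hunks below), a user page is tagged
with __GFP_USER while an object from a shrinkable slab cache is tagged with
__GFP_KERNRCLM:

        /* user page (e.g. an exec argument page): easily reclaimed */
        page = alloc_page(GFP_HIGHUSER | __GFP_USER);

        /* dentry: lives in a slab cache the kernel can shrink on demand */
        dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL | __GFP_KERNRCLM);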
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Joel Schopp <jschopp@austin.ibm.com>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/fs/buffer.c linux-2.6.14-rc3-001_antidefrag_flags/fs/buffer.c
--- linux-2.6.14-rc3-clean/fs/buffer.c 2005-10-04 22:58:33.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/fs/buffer.c 2005-10-11 12:06:46.000000000 +0100
@@ -1119,7 +1119,12 @@ grow_dev_page(struct block_device *bdev,
struct page *page;
struct buffer_head *bh;
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ /*
+ * Mark as __GFP_USER because from a fragmentation avoidance and
+ * reclamation point of view this memory behaves like user memory.
+ */
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS | __GFP_USER);
if (!page)
return NULL;
@@ -3047,7 +3052,8 @@ static void recalc_bh_state(void)
struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+ gfp_flags|__GFP_KERNRCLM);
if (ret) {
get_cpu_var(bh_accounting).nr++;
recalc_bh_state();
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/fs/compat.c linux-2.6.14-rc3-001_antidefrag_flags/fs/compat.c
--- linux-2.6.14-rc3-clean/fs/compat.c 2005-10-04 22:58:33.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/fs/compat.c 2005-10-11 12:06:46.000000000 +0100
@@ -1352,7 +1352,7 @@ static int compat_copy_strings(int argc,
page = bprm->page[i];
new = 0;
if (!page) {
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_HIGHUSER|__GFP_USER);
bprm->page[i] = page;
if (!page) {
ret = -ENOMEM;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/fs/dcache.c linux-2.6.14-rc3-001_antidefrag_flags/fs/dcache.c
--- linux-2.6.14-rc3-clean/fs/dcache.c 2005-10-04 22:58:33.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/fs/dcache.c 2005-10-11 12:06:46.000000000 +0100
@@ -714,7 +714,7 @@ struct dentry *d_alloc(struct dentry * p
struct dentry *dentry;
char *dname;
- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL|__GFP_KERNRCLM);
if (!dentry)
return NULL;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/fs/exec.c linux-2.6.14-rc3-001_antidefrag_flags/fs/exec.c
--- linux-2.6.14-rc3-clean/fs/exec.c 2005-10-04 22:58:33.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/fs/exec.c 2005-10-11 12:06:46.000000000 +0100
@@ -237,7 +237,7 @@ static int copy_strings(int argc, char _
page = bprm->page[i];
new = 0;
if (!page) {
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_HIGHUSER|__GFP_USER);
bprm->page[i] = page;
if (!page) {
ret = -ENOMEM;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/fs/ext2/super.c linux-2.6.14-rc3-001_antidefrag_flags/fs/ext2/super.c
--- linux-2.6.14-rc3-clean/fs/ext2/super.c 2005-10-04 22:58:33.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/fs/ext2/super.c 2005-10-11 12:06:46.000000000 +0100
@@ -141,7 +141,8 @@ static kmem_cache_t * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+ ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep,
+ SLAB_KERNEL|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/fs/ext3/super.c linux-2.6.14-rc3-001_antidefrag_flags/fs/ext3/super.c
--- linux-2.6.14-rc3-clean/fs/ext3/super.c 2005-10-04 22:58:33.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/fs/ext3/super.c 2005-10-11 12:06:46.000000000 +0100
@@ -441,7 +441,7 @@ static struct inode *ext3_alloc_inode(st
{
struct ext3_inode_info *ei;
- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/fs/inode.c linux-2.6.14-rc3-001_antidefrag_flags/fs/inode.c
--- linux-2.6.14-rc3-clean/fs/inode.c 2005-10-04 22:58:33.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/fs/inode.c 2005-10-11 12:06:46.000000000 +0100
@@ -146,7 +146,7 @@ static struct inode *alloc_inode(struct
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
- mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+ mapping_set_gfp_mask(mapping, GFP_HIGHUSER|__GFP_USER);
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/fs/ntfs/inode.c linux-2.6.14-rc3-001_antidefrag_flags/fs/ntfs/inode.c
--- linux-2.6.14-rc3-clean/fs/ntfs/inode.c 2005-10-04 22:58:33.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/fs/ntfs/inode.c 2005-10-11 12:06:46.000000000 +0100
@@ -317,7 +317,7 @@ struct inode *ntfs_alloc_big_inode(struc
ntfs_inode *ni;
ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS);
+ ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -342,7 +342,7 @@ static inline ntfs_inode *ntfs_alloc_ext
ntfs_inode *ni;
ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+ ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return ni;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/include/asm-i386/page.h linux-2.6.14-rc3-001_antidefrag_flags/include/asm-i386/page.h
--- linux-2.6.14-rc3-clean/include/asm-i386/page.h 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/include/asm-i386/page.h 2005-10-11 12:06:46.000000000 +0100
@@ -36,7 +36,7 @@
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | __GFP_USER, vma, vaddr)
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
/*
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/include/linux/gfp.h linux-2.6.14-rc3-001_antidefrag_flags/include/linux/gfp.h
--- linux-2.6.14-rc3-clean/include/linux/gfp.h 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/include/linux/gfp.h 2005-10-11 12:06:46.000000000 +0100
@@ -42,14 +42,26 @@ struct vm_area_struct;
#define __GFP_NORECLAIM 0x20000u /* No realy zone reclaim during allocation */
#define __GFP_HARDWALL 0x40000u /* Enforce hardwall cpuset memory allocs */
-#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
+/*
+ * Allocation type modifiers, these are required to be adjacent
+ * __GFP_USER: Allocation for user page or a buffer page
+ * __GFP_KERNRCLM: Short-lived or reclaimable kernel allocation
+ * Both bits off: Kernel non-reclaimable or very hard to reclaim
+ * RCLM_SHIFT (defined elsewhere) depends on the location of these bits
+ */
+#define __GFP_USER 0x80000u /* User and other easily reclaimed pages */
+#define __GFP_KERNRCLM 0x100000u /* Kernel page that is reclaimable */
+#define __GFP_RCLM_BITS (__GFP_USER|__GFP_KERNRCLM)
+
+#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
/* if you forget to add the bitmask here kernel will crash, period */
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
- __GFP_NOMEMALLOC|__GFP_NORECLAIM|__GFP_HARDWALL)
+ __GFP_NOMEMALLOC|__GFP_NORECLAIM|__GFP_HARDWALL | \
+ __GFP_USER | __GFP_KERNRCLM)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/include/linux/highmem.h linux-2.6.14-rc3-001_antidefrag_flags/include/linux/highmem.h
--- linux-2.6.14-rc3-clean/include/linux/highmem.h 2005-08-29 00:41:01.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/include/linux/highmem.h 2005-10-11 12:06:46.000000000 +0100
@@ -47,7 +47,7 @@ static inline void clear_user_highpage(s
static inline struct page *
alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
{
- struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
+ struct page *page = alloc_page_vma(GFP_HIGHUSER|__GFP_USER, vma, vaddr);
if (page)
clear_user_highpage(page, vaddr);
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/mm/memory.c linux-2.6.14-rc3-001_antidefrag_flags/mm/memory.c
--- linux-2.6.14-rc3-clean/mm/memory.c 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/mm/memory.c 2005-10-11 12:06:46.000000000 +0100
@@ -1296,7 +1296,8 @@ static int do_wp_page(struct mm_struct *
if (!new_page)
goto no_new_page;
} else {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ new_page = alloc_page_vma(GFP_HIGHUSER|__GFP_USER,
+ vma, address);
if (!new_page)
goto no_new_page;
copy_user_highpage(new_page, old_page, address);
@@ -1871,7 +1872,7 @@ retry:
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+ page = alloc_page_vma(GFP_HIGHUSER | __GFP_USER, vma, address);
if (!page)
goto oom;
copy_user_highpage(page, new_page, address);
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/mm/shmem.c linux-2.6.14-rc3-001_antidefrag_flags/mm/shmem.c
--- linux-2.6.14-rc3-clean/mm/shmem.c 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/mm/shmem.c 2005-10-11 12:06:46.000000000 +0100
@@ -908,7 +908,7 @@ shmem_alloc_page(unsigned long gfp, stru
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
pvma.vm_pgoff = idx;
pvma.vm_end = PAGE_SIZE;
- page = alloc_page_vma(gfp | __GFP_ZERO, &pvma, 0);
+ page = alloc_page_vma(gfp | __GFP_ZERO | __GFP_USER, &pvma, 0);
mpol_free(pvma.vm_policy);
return page;
}
@@ -924,7 +924,7 @@ static inline struct page *
shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info,
unsigned long idx)
{
- return alloc_page(gfp | __GFP_ZERO);
+ return alloc_page(gfp | __GFP_ZERO | __GFP_USER);
}
#endif
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-clean/mm/swap_state.c linux-2.6.14-rc3-001_antidefrag_flags/mm/swap_state.c
--- linux-2.6.14-rc3-clean/mm/swap_state.c 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-001_antidefrag_flags/mm/swap_state.c 2005-10-11 12:06:46.000000000 +0100
@@ -335,7 +335,8 @@ struct page *read_swap_cache_async(swp_e
* Get a new page to read into from swap.
*/
if (!new_page) {
- new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ new_page = alloc_page_vma(GFP_HIGHUSER | __GFP_USER,
+ vma, addr);
if (!new_page)
break; /* Out of memory */
}
* [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
2005-10-11 15:12 ` [PATCH 1/8] Fragmentation Avoidance V17: 001_antidefrag_flags Mel Gorman
@ 2005-10-11 15:12 ` Mel Gorman
2005-10-13 13:56 ` Dave Hansen
2005-10-11 15:12 ` [PATCH 3/8] Fragmentation Avoidance V17: 003_fragcore Mel Gorman
` (6 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: Mel Gorman @ 2005-10-11 15:12 UTC (permalink / raw)
To: akpm; +Cc: jschopp, Mel Gorman, kravetz, linux-kernel, linux-mm, lhms-devel
This patch adds a "usemap" to the allocator. When a PAGE_PER_MAXORDER block
of pages (i.e. 2^(MAX_ORDER-1)) is split, the usemap is updated with the
type of allocation when splitting. This information is used in an
anti-fragmentation patch to group related allocation types together.
The __GFP_USER and __GFP_KERNRCLM bits are used to enumerate three allocation
types:
RCLM_NORCLM: These are kernel allocations that cannot be reclaimed
on demand.
RCLM_USER: These are pages allocated with the __GFP_USER flag set. They are
considered to be user pages and other easily reclaimed pages such
as buffers.
RCLM_KERN: Allocated for the kernel but for caches that can be reclaimed
on demand.
gfpflags_to_rclmtype() converts a gfp mask to the corresponding RCLM_TYPE
by masking out the irrelevant bits and shifting the result right by RCLM_SHIFT.
Compile-time checks are made on RCLM_SHIFT so that gfpflags_to_rclmtype()
keeps working if the GFP flags are ever moved. ffz() could be used to avoid
the static checks, but that would add runtime overhead for what is a
compile-time constant.
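As a worked example using the values defined in this series (GFP_HIGHUSER and
GFP_KERNEL carry neither of the new bits themselves):

        /* __GFP_USER = 0x80000 = 1 << 19, __GFP_KERNRCLM = 0x100000 = 1 << 20, RCLM_SHIFT = 19 */
        gfpflags_to_rclmtype(GFP_HIGHUSER | __GFP_USER);   /* 0x80000  >> 19 == 1 == RCLM_USER   */
        gfpflags_to_rclmtype(GFP_KERNEL | __GFP_KERNRCLM); /* 0x100000 >> 19 == 2 == RCLM_KERN   */
        gfpflags_to_rclmtype(GFP_ATOMIC);                  /* no RCLM bits   == 0 == RCLM_NORCLM */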
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Joel Schopp <jschopp@austin.ibm.com>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-001_antidefrag_flags/include/linux/mm.h linux-2.6.14-rc3-002_usemap/include/linux/mm.h
--- linux-2.6.14-rc3-001_antidefrag_flags/include/linux/mm.h 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-002_usemap/include/linux/mm.h 2005-10-11 12:07:26.000000000 +0100
@@ -522,6 +522,13 @@ static inline void set_page_links(struct
extern struct page *mem_map;
#endif
+/*
+ * Return what type of page is being allocated from this 2^MAX_ORDER-1 block
+ * of pages.
+ */
+extern int get_pageblock_type(struct zone *zone,
+ struct page *page);
+
static inline void *lowmem_page_address(struct page *page)
{
return __va(page_to_pfn(page) << PAGE_SHIFT);
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-001_antidefrag_flags/include/linux/mmzone.h linux-2.6.14-rc3-002_usemap/include/linux/mmzone.h
--- linux-2.6.14-rc3-001_antidefrag_flags/include/linux/mmzone.h 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-002_usemap/include/linux/mmzone.h 2005-10-11 12:07:26.000000000 +0100
@@ -20,6 +20,17 @@
#else
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
+#define PAGES_PER_MAXORDER (1 << (MAX_ORDER-1))
+
+/*
+ * The two-bit field __GFP_RCLM_BITS enumerates the following kinds of
+ * page reclaimability.
+ */
+#define RCLM_NORCLM 0
+#define RCLM_USER 1
+#define RCLM_KERN 2
+#define RCLM_TYPES 3
+#define BITS_PER_RCLM_TYPE 2
struct free_area {
struct list_head free_list;
@@ -139,6 +150,13 @@ struct zone {
spinlock_t lock;
struct free_area free_area[MAX_ORDER];
+#ifndef CONFIG_SPARSEMEM
+ /*
+ * The map tracks what each 2^MAX_ORDER-1 sized block is being used for.
+ * Each 2^MAX_ORDER block of pages uses BITS_PER_RCLM_TYPE bits
+ */
+ unsigned long *free_area_usemap;
+#endif
ZONE_PADDING(_pad1_)
@@ -473,6 +491,15 @@ extern struct pglist_data contig_page_da
#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif
+#if ((SECTION_SIZE_BITS - MAX_ORDER) * BITS_PER_RCLM_TYPE) > 64
+#error free_area_usemap is not big enough
+#endif
+
+/* Usemap initialisation */
+#ifdef CONFIG_SPARSEMEM
+static inline void setup_usemap(struct pglist_data *pgdat,
+ struct zone *zone, unsigned long zonesize) {}
+#endif /* CONFIG_SPARSEMEM */
struct page;
struct mem_section {
@@ -485,6 +512,7 @@ struct mem_section {
* before using it wrong.
*/
unsigned long section_mem_map;
+ DECLARE_BITMAP(free_area_usemap,64);
};
#ifdef CONFIG_SPARSEMEM_EXTREME
@@ -552,6 +580,17 @@ static inline struct mem_section *__pfn_
return __nr_to_section(pfn_to_section_nr(pfn));
}
+static inline unsigned long *pfn_to_usemap(struct zone *zone, unsigned long pfn)
+{
+ return &__pfn_to_section(pfn)->free_area_usemap[0];
+}
+
+static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+{
+ pfn &= (PAGES_PER_SECTION-1);
+ return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
+}
+
#define pfn_to_page(pfn) \
({ \
unsigned long __pfn = (pfn); \
@@ -589,6 +628,16 @@ void sparse_init(void);
#else
#define sparse_init() do {} while (0)
#define sparse_index_init(_sec, _nid) do {} while (0)
+static inline unsigned long *pfn_to_usemap(struct zone *zone,
+ unsigned long pfn)
+{
+ return (zone->free_area_usemap);
+}
+static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+{
+ pfn = pfn - zone->zone_start_pfn;
+ return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
+}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-001_antidefrag_flags/mm/page_alloc.c linux-2.6.14-rc3-002_usemap/mm/page_alloc.c
--- linux-2.6.14-rc3-001_antidefrag_flags/mm/page_alloc.c 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-002_usemap/mm/page_alloc.c 2005-10-11 12:07:26.000000000 +0100
@@ -66,6 +66,88 @@ EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
/*
+ * RCLM_SHIFT is the number of bits that a gfp_mask has to be shifted right
+ * to have just the __GFP_USER and __GFP_KERNRCLM bits. The static check is
+ * made afterwards in case the GFP flags are not updated without updating
+ * this number
+ */
+#define RCLM_SHIFT 19
+#if (__GFP_USER >> RCLM_SHIFT) != RCLM_USER
+#error __GFP_USER not mapping to RCLM_USER
+#endif
+#if (__GFP_KERNRCLM >> RCLM_SHIFT) != RCLM_KERN
+#error __GFP_KERNRCLM not mapping to RCLM_KERN
+#endif
+
+/*
+ * This function maps gfpflags to their RCLM_TYPE. It makes assumptions
+ * on the location of the GFP flags
+ */
+static inline unsigned long gfpflags_to_rclmtype(unsigned long gfp_flags) {
+ return (gfp_flags & __GFP_RCLM_BITS) >> RCLM_SHIFT;
+}
+
+/*
+ * copy_bits - Copy bits between bitmaps
+ * @dstaddr: The destination bitmap to copy to
+ * @srcaddr: The source bitmap to copy from
+ * @sindex_dst: The start bit index within the destination map to copy to
+ * @sindex_src: The start bit index within the source map to copy from
+ * @nr: The number of bits to copy
+ */
+static inline void copy_bits(unsigned long *dstaddr,
+ unsigned long *srcaddr,
+ int sindex_dst,
+ int sindex_src,
+ int nr)
+{
+ /*
+ * Written like this to take advantage of arch-specific
+ * set_bit() and clear_bit() functions
+ */
+ for (nr = nr-1; nr >= 0; nr--) {
+ int bit = test_bit(sindex_src + nr, srcaddr);
+ if (bit)
+ set_bit(sindex_dst + nr, dstaddr);
+ else
+ clear_bit(sindex_dst + nr, dstaddr);
+ }
+}
+
+int get_pageblock_type(struct zone *zone,
+ struct page *page)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long type = 0;
+ unsigned long *usemap;
+ int bitidx;
+
+ bitidx = pfn_to_bitidx(zone, pfn);
+ usemap = pfn_to_usemap(zone, pfn);
+
+ copy_bits(&type, usemap, 0, bitidx, BITS_PER_RCLM_TYPE);
+
+ return (int)type;
+}
+
+/*
+ * Reserve a block of pages for an allocation type & enforce function
+ * being changed if more bits are added to keep track of additional types
+ */
+static inline void set_pageblock_type(struct zone *zone, struct page *page,
+ unsigned long type)
+{
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long *usemap;
+ int bitidx;
+
+ bitidx = pfn_to_bitidx(zone, pfn);
+ usemap = pfn_to_usemap(zone, pfn);
+
+ copy_bits(usemap, &type, bitidx, 0, BITS_PER_RCLM_TYPE);
+}
+
+/*
* Used by page_zone() to look up the address of the struct zone whose
* id is encoded in the upper bits of page->flags
*/
@@ -470,7 +552,8 @@ static void prep_new_page(struct page *p
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static struct page *__rmqueue(struct zone *zone, unsigned int order,
+ int alloctype)
{
struct free_area * area;
unsigned int current_order;
@@ -486,6 +569,15 @@ static struct page *__rmqueue(struct zon
rmv_page_order(page);
area->nr_free--;
zone->free_pages -= 1UL << order;
+
+ /*
+ * If splitting a large block, record what the block is being
+ * used for in the usemap
+ */
+ if (current_order == MAX_ORDER-1)
+ set_pageblock_type(zone, page,
+ (unsigned long)alloctype);
+
return expand(zone, page, order, current_order, area);
}
@@ -498,7 +590,8 @@ static struct page *__rmqueue(struct zon
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
- unsigned long count, struct list_head *list)
+ unsigned long count, struct list_head *list,
+ int alloctype)
{
unsigned long flags;
int i;
@@ -507,7 +600,7 @@ static int rmqueue_bulk(struct zone *zon
spin_lock_irqsave(&zone->lock, flags);
for (i = 0; i < count; ++i) {
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, alloctype);
if (page == NULL)
break;
allocated++;
@@ -691,7 +784,8 @@ buffered_rmqueue(struct zone *zone, int
unsigned long flags;
struct page *page = NULL;
int cold = !!(gfp_flags & __GFP_COLD);
-
+ int alloctype = (int)gfpflags_to_rclmtype(gfp_flags);
+
if (order == 0) {
struct per_cpu_pages *pcp;
@@ -699,7 +793,8 @@ buffered_rmqueue(struct zone *zone, int
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list);
+ pcp->batch, &pcp->list,
+ alloctype);
if (pcp->count) {
page = list_entry(pcp->list.next, struct page, lru);
list_del(&page->lru);
@@ -711,7 +806,7 @@ buffered_rmqueue(struct zone *zone, int
if (page == NULL) {
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order);
+ page = __rmqueue(zone, order, alloctype);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -1870,6 +1965,36 @@ void __init setup_per_cpu_pageset()
#endif
+#ifndef CONFIG_SPARSEMEM
+#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
+/*
+ * Calculate the size of the zone->usemap in bytes rounded to an unsigned long
+ * Start by making sure zonesize is a multiple of MAX_ORDER-1 by rounding up
+ * Then figure 1 RCLM_TYPE worth of bits per MAX_ORDER-1, finally round up
+ * what is now in bits to nearest long in bits, then return it in bytes.
+ */
+static unsigned long __init usemap_size(unsigned long zonesize)
+{
+ unsigned long usemapsize;
+
+ usemapsize = roundup(zonesize, PAGES_PER_MAXORDER);
+ usemapsize = usemapsize >> (MAX_ORDER-1);
+ usemapsize *= BITS_PER_RCLM_TYPE;
+ usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
+
+ return usemapsize / 8;
+}
+
+static void __init setup_usemap(struct pglist_data *pgdat,
+ struct zone *zone, unsigned long zonesize)
+{
+ unsigned long usemapsize = usemap_size(zonesize);
+ zone->free_area_usemap = alloc_bootmem_node(pgdat, usemapsize);
+ memset(zone->free_area_usemap, RCLM_NORCLM, usemapsize);
+}
+#endif /* CONFIG_SPARSEMEM */
+
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1955,6 +2080,7 @@ static void __init free_area_init_core(s
memmap_init(size, nid, j, zone_start_pfn);
zonetable_add(zone, nid, j, zone_start_pfn, size);
+ setup_usemap(pgdat, zone, size);
zone_start_pfn += size;
* Re: [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap
2005-10-11 15:12 ` [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap Mel Gorman
@ 2005-10-13 13:56 ` Dave Hansen
2005-10-13 14:10 ` Mel Gorman
0 siblings, 1 reply; 27+ messages in thread
From: Dave Hansen @ 2005-10-13 13:56 UTC (permalink / raw)
To: Mel Gorman
Cc: Andrew Morton, jschopp, kravetz, Linux Kernel Mailing List,
linux-mm, lhms
On Tue, 2005-10-11 at 16:12 +0100, Mel Gorman wrote:
> +extern int get_pageblock_type(struct zone *zone,
> + struct page *page);
That indenting is pretty funky.
> @@ -473,6 +491,15 @@ extern struct pglist_data contig_page_da
> #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
> #error Allocator MAX_ORDER exceeds SECTION_SIZE
> #endif
> +#if ((SECTION_SIZE_BITS - MAX_ORDER) * BITS_PER_RCLM_TYPE) > 64
> +#error free_area_usemap is not big enough
> +#endif
Every time I look at these patches, I see this #if, and I don't remember
what that '64' means. Can it please get a real name?
> +/* Usemap initialisation */
> +#ifdef CONFIG_SPARSEMEM
> +static inline void setup_usemap(struct pglist_data *pgdat,
> + struct zone *zone, unsigned long zonesize) {}
> +#endif /* CONFIG_SPARSEMEM */
>
> struct page;
> struct mem_section {
> @@ -485,6 +512,7 @@ struct mem_section {
> * before using it wrong.
> */
> unsigned long section_mem_map;
> + DECLARE_BITMAP(free_area_usemap,64);
> };
There's that '64' again! You need a space after the comma, too.
> #ifdef CONFIG_SPARSEMEM_EXTREME
> @@ -552,6 +580,17 @@ static inline struct mem_section *__pfn_
> return __nr_to_section(pfn_to_section_nr(pfn));
> }
>
> +static inline unsigned long *pfn_to_usemap(struct zone *zone, unsigned long pfn)
> +{
> + return &__pfn_to_section(pfn)->free_area_usemap[0];
> +}
> +
> +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
> +{
> + pfn &= (PAGES_PER_SECTION-1);
> + return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
> +}
Why does that return int? Should it be "unsigned long", maybe? Also,
that cast is implicit in the return and shouldn't be needed.
> #define pfn_to_page(pfn) \
> ({ \
> unsigned long __pfn = (pfn); \
> @@ -589,6 +628,16 @@ void sparse_init(void);
> #else
> #define sparse_init() do {} while (0)
> #define sparse_index_init(_sec, _nid) do {} while (0)
> +static inline unsigned long *pfn_to_usemap(struct zone *zone,
> + unsigned long pfn)
> +{
> + return (zone->free_area_usemap);
> +}
"return" is not a function. The parenthesis can go away.
> +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
> +{
> + pfn = pfn - zone->zone_start_pfn;
> + return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
> +}
Again, why the cast?
> #endif /* CONFIG_SPARSEMEM */
>
> #ifdef CONFIG_NODES_SPAN_OTHER_NODES
> diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-001_antidefrag_flags/mm/page_alloc.c linux-2.6.14-rc3-002_usemap/mm/page_alloc.c
> --- linux-2.6.14-rc3-001_antidefrag_flags/mm/page_alloc.c 2005-10-04 22:58:34.000000000 +0100
> +++ linux-2.6.14-rc3-002_usemap/mm/page_alloc.c 2005-10-11 12:07:26.000000000 +0100
> @@ -66,6 +66,88 @@ EXPORT_SYMBOL(totalram_pages);
> EXPORT_SYMBOL(nr_swap_pages);
>
> /*
> + * RCLM_SHIFT is the number of bits that a gfp_mask has to be shifted right
> + * to have just the __GFP_USER and __GFP_KERNRCLM bits. The static check is
> + * made afterwards in case the GFP flags are not updated without updating
> + * this number
> + */
> +#define RCLM_SHIFT 19
> +#if (__GFP_USER >> RCLM_SHIFT) != RCLM_USER
> +#error __GFP_USER not mapping to RCLM_USER
> +#endif
> +#if (__GFP_KERNRCLM >> RCLM_SHIFT) != RCLM_KERN
> +#error __GFP_KERNRCLM not mapping to RCLM_KERN
> +#endif
Should this really be in page_alloc.c, or should it be close to the
RCLM_* definitions?
> +/*
> + * This function maps gfpflags to their RCLM_TYPE. It makes assumptions
> + * on the location of the GFP flags
> + */
> +static inline unsigned long gfpflags_to_rclmtype(unsigned long gfp_flags) {
> + return (gfp_flags & __GFP_RCLM_BITS) >> RCLM_SHIFT;
> +}
First brace on a new line, please.
> +/*
> + * copy_bits - Copy bits between bitmaps
> + * @dstaddr: The destination bitmap to copy to
> + * @srcaddr: The source bitmap to copy from
> + * @sindex_dst: The start bit index within the destination map to copy to
> + * @sindex_src: The start bit index within the source map to copy from
> + * @nr: The number of bits to copy
> + */
> +static inline void copy_bits(unsigned long *dstaddr,
> + unsigned long *srcaddr,
> + int sindex_dst,
> + int sindex_src,
> + int nr)
> +{
> + /*
> + * Written like this to take advantage of arch-specific
> + * set_bit() and clear_bit() functions
> + */
> + for (nr = nr-1; nr >= 0; nr--) {
> + int bit = test_bit(sindex_src + nr, srcaddr);
> + if (bit)
> + set_bit(sindex_dst + nr, dstaddr);
> + else
> + clear_bit(sindex_dst + nr, dstaddr);
> + }
> +}
We might want to make a note that this is slow, and doesn't have any
atomicity guarantees, either. But, it's functional, and certainly
describes the operation that we wanted. It's an improvement over what
was there.
BTW, this could probably be done with some fancy bitmask operations, and
be more efficient.
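Something like this, perhaps (an untested sketch, no atomicity guarantees
either; the helper names are hypothetical, and it relies on a field never
straddling an unsigned long, which is true for the current two-bit fields
because bitidx is always even):

static inline unsigned long usemap_get_rclm(unsigned long *usemap, int bitidx)
{
        unsigned long mask = (1UL << BITS_PER_RCLM_TYPE) - 1;

        /* read the whole field in one shift/mask instead of bit by bit */
        return (usemap[bitidx / BITS_PER_LONG] >> (bitidx % BITS_PER_LONG)) & mask;
}

static inline void usemap_set_rclm(unsigned long *usemap, int bitidx,
                                unsigned long type)
{
        unsigned long mask = (1UL << BITS_PER_RCLM_TYPE) - 1;
        unsigned long *p = usemap + bitidx / BITS_PER_LONG;
        int shift = bitidx % BITS_PER_LONG;

        /* clear the old field, then OR in the new type */
        *p = (*p & ~(mask << shift)) | ((type & mask) << shift);
}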
> +int get_pageblock_type(struct zone *zone,
> + struct page *page)
> +{
> + unsigned long pfn = page_to_pfn(page);
> + unsigned long type = 0;
> + unsigned long *usemap;
> + int bitidx;
> +
> + bitidx = pfn_to_bitidx(zone, pfn);
> + usemap = pfn_to_usemap(zone, pfn);
> +
> + copy_bits(&type, usemap, 0, bitidx, BITS_PER_RCLM_TYPE);
> +
> + return (int)type;
> +}
Where did all these casts come from?
> -static struct page *__rmqueue(struct zone *zone, unsigned int order)
> +static struct page *__rmqueue(struct zone *zone, unsigned int order,
> + int alloctype)
> {
> struct free_area * area;
> unsigned int current_order;
> @@ -486,6 +569,15 @@ static struct page *__rmqueue(struct zon
> rmv_page_order(page);
> area->nr_free--;
> zone->free_pages -= 1UL << order;
> +
> + /*
> + * If splitting a large block, record what the block is being
> + * used for in the usemap
> + */
> + if (current_order == MAX_ORDER-1)
> + set_pageblock_type(zone, page,
> + (unsigned long)alloctype);
This cast makes that line wrap, and makes it quite a bit less readable.
> return expand(zone, page, order, current_order, area);
> }
>
> @@ -498,7 +590,8 @@ static struct page *__rmqueue(struct zon
> * Returns the number of new pages which were placed at *list.
> */
> static int rmqueue_bulk(struct zone *zone, unsigned int order,
> - unsigned long count, struct list_head *list)
> + unsigned long count, struct list_head *list,
> + int alloctype)
> {
> unsigned long flags;
> int i;
> @@ -507,7 +600,7 @@ static int rmqueue_bulk(struct zone *zon
>
> spin_lock_irqsave(&zone->lock, flags);
> for (i = 0; i < count; ++i) {
> - page = __rmqueue(zone, order);
> + page = __rmqueue(zone, order, alloctype);
> if (page == NULL)
> break;
> allocated++;
> @@ -691,7 +784,8 @@ buffered_rmqueue(struct zone *zone, int
> unsigned long flags;
> struct page *page = NULL;
> int cold = !!(gfp_flags & __GFP_COLD);
> -
> + int alloctype = (int)gfpflags_to_rclmtype(gfp_flags);
> +
Just a note: that inserts whitespace.
You can avoid these kinds of mistakes by keeping incremental patches.
Start with your last set of work, and keep all changes to _that_ work in
separate patches on top of the old. You'll see only what you change and
little tiny mishaps like this will tend to jump out more than in the
hundreds of lines of the whole thing.
-- Dave
* Re: [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap
2005-10-13 13:56 ` Dave Hansen
@ 2005-10-13 14:10 ` Mel Gorman
2005-10-13 14:18 ` Dave Hansen
2005-10-13 16:33 ` Joel Schopp
0 siblings, 2 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-13 14:10 UTC (permalink / raw)
To: Dave Hansen
Cc: Andrew Morton, jschopp, kravetz, Linux Kernel Mailing List,
linux-mm, lhms
On Thu, 13 Oct 2005, Dave Hansen wrote:
> On Tue, 2005-10-11 at 16:12 +0100, Mel Gorman wrote:
> > + extern int get_pageblock_type(struct zone *zone,
> > + struct page *page);
>
> That indenting is pretty funky.
>
Unnecessary as well.
> > @@ -473,6 +491,15 @@ extern struct pglist_data contig_page_da
> > #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
> > #error Allocator MAX_ORDER exceeds SECTION_SIZE
> > #endif
> > +#if ((SECTION_SIZE_BITS - MAX_ORDER) * BITS_PER_RCLM_TYPE) > 64
> > +#error free_area_usemap is not big enough
> > +#endif
>
> Every time I look at these patches, I see this #if, and I don't remember
> what that '64' means. Can it please get a real name?
>
Joel, suggestions?
> > +/* Usemap initialisation */
> > +#ifdef CONFIG_SPARSEMEM
> > +static inline void setup_usemap(struct pglist_data *pgdat,
> > + struct zone *zone, unsigned long zonesize) {}
> > +#endif /* CONFIG_SPARSEMEM */
> >
> > struct page;
> > struct mem_section {
> > @@ -485,6 +512,7 @@ struct mem_section {
> > * before using it wrong.
> > */
> > unsigned long section_mem_map;
> > + DECLARE_BITMAP(free_area_usemap,64);
> > };
>
> There's that '64' again! You need a space after the comma, too.
>
> > #ifdef CONFIG_SPARSEMEM_EXTREME
> > @@ -552,6 +580,17 @@ static inline struct mem_section *__pfn_
> > return __nr_to_section(pfn_to_section_nr(pfn));
> > }
> >
> > +static inline unsigned long *pfn_to_usemap(struct zone *zone, unsigned long pfn)
> > +{
> > + return &__pfn_to_section(pfn)->free_area_usemap[0];
> > +}
> > +
> > +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
> > +{
> > + pfn &= (PAGES_PER_SECTION-1);
> > + return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
> > +}
>
> Why does that return int? Should it be "unsigned long", maybe? Also,
> that cast is implicit in the return and shouldn't be needed.
>
It returns int because the bit functions like assign_bit() expect an int
for the bit index, not an unsigned long or anything else.
> > #define pfn_to_page(pfn) \
> > ({ \
> > unsigned long __pfn = (pfn); \
> > @@ -589,6 +628,16 @@ void sparse_init(void);
> > #else
> > #define sparse_init() do {} while (0)
> > #define sparse_index_init(_sec, _nid) do {} while (0)
> > +static inline unsigned long *pfn_to_usemap(struct zone *zone,
> > + unsigned long pfn)
> > +{
> > + return (zone->free_area_usemap);
> > +}
>
> "return" is not a function. The parenthesis can go away.
>
> > +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
> > +{
> > + pfn = pfn - zone->zone_start_pfn;
> > + return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
> > +}
>
> Again, why the cast?
>
Same reason as above, bit functions expect an int for the index.
> > #endif /* CONFIG_SPARSEMEM */
> >
> > #ifdef CONFIG_NODES_SPAN_OTHER_NODES
> > diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-001_antidefrag_flags/mm/page_alloc.c linux-2.6.14-rc3-002_usemap/mm/page_alloc.c
> > --- linux-2.6.14-rc3-001_antidefrag_flags/mm/page_alloc.c 2005-10-04 22:58:34.000000000 +0100
> > +++ linux-2.6.14-rc3-002_usemap/mm/page_alloc.c 2005-10-11 12:07:26.000000000 +0100
> > @@ -66,6 +66,88 @@ EXPORT_SYMBOL(totalram_pages);
> > EXPORT_SYMBOL(nr_swap_pages);
> >
> > /*
> > + * RCLM_SHIFT is the number of bits that a gfp_mask has to be shifted right
> > + * to have just the __GFP_USER and __GFP_KERNRCLM bits. The static check is
> > + * made afterwards in case the GFP flags are not updated without updating
> > + * this number
> > + */
> > +#define RCLM_SHIFT 19
> > +#if (__GFP_USER >> RCLM_SHIFT) != RCLM_USER
> > +#error __GFP_USER not mapping to RCLM_USER
> > +#endif
> > +#if (__GFP_KERNRCLM >> RCLM_SHIFT) != RCLM_KERN
> > +#error __GFP_KERNRCLM not mapping to RCLM_KERN
> > +#endif
>
> Should this really be in page_alloc.c, or should it be close to the
> RCLM_* definitions?
>
I can't test it right now, but I think the reason it is here is because
RCLM_* and __GFP_* are in different headers that are not aware of each
other. This is the place a static compile-time check can be made.
> > +/*
> > + * This function maps gfpflags to their RCLM_TYPE. It makes assumptions
> > + * on the location of the GFP flags
> > + */
> > +static inline unsigned long gfpflags_to_rclmtype(unsigned long gfp_flags) {
> > + return (gfp_flags & __GFP_RCLM_BITS) >> RCLM_SHIFT;
> > +}
>
> First brace on a new line, please.
>
> > +/*
> > + * copy_bits - Copy bits between bitmaps
> > + * @dstaddr: The destination bitmap to copy to
> > + * @srcaddr: The source bitmap to copy from
> > + * @sindex_dst: The start bit index within the destination map to copy to
> > + * @sindex_src: The start bit index within the source map to copy from
> > + * @nr: The number of bits to copy
> > + */
> > +static inline void copy_bits(unsigned long *dstaddr,
> > + unsigned long *srcaddr,
> > + int sindex_dst,
> > + int sindex_src,
> > + int nr)
> > +{
> > + /*
> > + * Written like this to take advantage of arch-specific
> > + * set_bit() and clear_bit() functions
> > + */
> > + for (nr = nr-1; nr >= 0; nr--) {
> > + int bit = test_bit(sindex_src + nr, srcaddr);
> > + if (bit)
> > + set_bit(sindex_dst + nr, dstaddr);
> > + else
> > + clear_bit(sindex_dst + nr, dstaddr);
> > + }
> > +}
>
> We might want to make a note that this is slow, and doesn't have any
> atomicity guarantees, either.
The comment can be put in, but this is only called with a spinlock held.
No arguments about it being slow though.
> But, it's functional, and certainly
> describes the operation that we wanted. It's an improvement over what
> was there.
>
> BTW, this could probably be done with some fancy bitmask operations, and
> be more efficient.
>
Very likely, but I was worried about corner cases if the value of
BITS_PER_RCLM_TYPE changed later to 3 or something. This time, I was going
for the safe choice.
> > +int get_pageblock_type(struct zone *zone,
> > + struct page *page)
> > +{
> > + unsigned long pfn = page_to_pfn(page);
> > + unsigned long type = 0;
> > + unsigned long *usemap;
> > + int bitidx;
> > +
> > + bitidx = pfn_to_bitidx(zone, pfn);
> > + usemap = pfn_to_usemap(zone, pfn);
> > +
> > + copy_bits(&type, usemap, 0, bitidx, BITS_PER_RCLM_TYPE);
> > +
> > + return (int)type;
> > +}
>
> Where did all these casts come from?
>
It was pointed out that the types used with the bit functions should
all be unsigned long, not int as they were previously. However, I found that if
I used unsigned long throughout the code, including for array operations,
there was a 10-12% slowdown in AIM9. These casts were the compromise.
alloctype is unsigned long when used with the functions like assign_bit()
but int every other time.
In this case, there is an implicit cast so the cast is redundant if that
is the problem you are pointing out. I can remove the explicit casts that
are dotted around the place.
> > -static struct page *__rmqueue(struct zone *zone, unsigned int order)
> > +static struct page *__rmqueue(struct zone *zone, unsigned int order,
> > + int alloctype)
> > {
> > struct free_area * area;
> > unsigned int current_order;
> > @@ -486,6 +569,15 @@ static struct page *__rmqueue(struct zon
> > rmv_page_order(page);
> > area->nr_free--;
> > zone->free_pages -= 1UL << order;
> > +
> > + /*
> > + * If splitting a large block, record what the block is being
> > + * used for in the usemap
> > + */
> > + if (current_order == MAX_ORDER-1)
> > + set_pageblock_type(zone, page,
> > + (unsigned long)alloctype);
>
> This cast makes that line wrap, and makes it quite a bit less readable.
>
> > return expand(zone, page, order, current_order, area);
> > }
> >
> > @@ -498,7 +590,8 @@ static struct page *__rmqueue(struct zon
> > * Returns the number of new pages which were placed at *list.
> > */
> > static int rmqueue_bulk(struct zone *zone, unsigned int order,
> > - unsigned long count, struct list_head *list)
> > + unsigned long count, struct list_head *list,
> > + int alloctype)
> > {
> > unsigned long flags;
> > int i;
> > @@ -507,7 +600,7 @@ static int rmqueue_bulk(struct zone *zon
> >
> > spin_lock_irqsave(&zone->lock, flags);
> > for (i = 0; i < count; ++i) {
> > - page = __rmqueue(zone, order);
> > + page = __rmqueue(zone, order, alloctype);
> > if (page == NULL)
> > break;
> > allocated++;
> > @@ -691,7 +784,8 @@ buffered_rmqueue(struct zone *zone, int
> > unsigned long flags;
> > struct page *page = NULL;
> > int cold = !!(gfp_flags & __GFP_COLD);
> > -
> > + int alloctype = (int)gfpflags_to_rclmtype(gfp_flags);
> > +
>
> Just a note: that inserts whitespace.
>
> You can avoid these kinds of mistakes by keeping incremental patches.
> Start with your last set of work, and keep all changes to _that_ work in
> separate patches on top of the old. You'll see only what you change and
> little tiny mishaps like this will tend to jump out more than in the
> hundreds of lines of the whole thing.
>
Understood.
--
Mel Gorman
Part-time Phd Student Java Applications Developer
University of Limerick IBM Dublin Software Lab
* Re: [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap
2005-10-13 14:10 ` Mel Gorman
@ 2005-10-13 14:18 ` Dave Hansen
2005-10-13 14:35 ` Mel Gorman
2005-10-16 20:44 ` Mel Gorman
2005-10-13 16:33 ` Joel Schopp
1 sibling, 2 replies; 27+ messages in thread
From: Dave Hansen @ 2005-10-13 14:18 UTC (permalink / raw)
To: Mel Gorman
Cc: Andrew Morton, jschopp, kravetz, Linux Kernel Mailing List,
linux-mm, lhms
On Thu, 2005-10-13 at 15:10 +0100, Mel Gorman wrote:
> On Thu, 13 Oct 2005, Dave Hansen wrote:
> > > +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
> > > +{
> > > + pfn &= (PAGES_PER_SECTION-1);
> > > + return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
> > > +}
> >
> > Why does that return int? Should it be "unsigned long", maybe? Also,
> > that cast is implicit in the return and shouldn't be needed.
> >
>
> It returns int because the bit functions like assign_bit() expect an int
> for the bit index, not an unsigned long or anything else.
You don't need to explicitly cast between int and unsigned long. It'll
probably hide more bugs than it reveals.
> > > /*
> > > + * RCLM_SHIFT is the number of bits that a gfp_mask has to be shifted right
> > > + * to have just the __GFP_USER and __GFP_KERNRCLM bits. The static check is
> > > + * made afterwards in case the GFP flags are not updated without updating
> > > + * this number
> > > + */
> > > +#define RCLM_SHIFT 19
> > > +#if (__GFP_USER >> RCLM_SHIFT) != RCLM_USER
> > > +#error __GFP_USER not mapping to RCLM_USER
> > > +#endif
> > > +#if (__GFP_KERNRCLM >> RCLM_SHIFT) != RCLM_KERN
> > > +#error __GFP_KERNRCLM not mapping to RCLM_KERN
> > > +#endif
> >
> > Should this really be in page_alloc.c, or should it be close to the
> > RCLM_* definitions?
>
> I can't test it right now, but I think the reason it is here is because
> RCLM_* and __GFP_* are in different headers that are not aware of each
> other. This is the place a static compile-time check can be made.
Well, they're pretty intricately linked, so maybe they should go in the
same header, no?
> It was pointed out that type used for use with the bit functions should
> all be unsigned long, not int as they were previously. However, I found if
> I used unsigned long throughout the code, including for array operations,
> there was a 10-12% slowdown in AIM9. These casts were the compromise.
> alloctype is unsigned long when used with the functions like assign_bit()
> but int every other time.
Why does it slow down? Do you have any detailed profiles?
> In this case, there is an implicit cast so the cast is redundent if that
> is the problem you are pointing out. I can remove the explicit casts that
> are dotted around the place.
There needs to be a reason for the casts. They certainly don't help
readability or correctness, so there needs to be some justification. If
there are performance reasons somehow, they need to be analyzed as well.
-- Dave
* Re: [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap
2005-10-13 14:18 ` Dave Hansen
@ 2005-10-13 14:35 ` Mel Gorman
2005-10-16 20:44 ` Mel Gorman
1 sibling, 0 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-13 14:35 UTC (permalink / raw)
To: Dave Hansen
Cc: Andrew Morton, jschopp, kravetz, Linux Kernel Mailing List,
linux-mm, lhms
On Thu, 13 Oct 2005, Dave Hansen wrote:
> On Thu, 2005-10-13 at 15:10 +0100, Mel Gorman wrote:
> > On Thu, 13 Oct 2005, Dave Hansen wrote:
> > > > +static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
> > > > +{
> > > > + pfn &= (PAGES_PER_SECTION-1);
> > > > + return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
> > > > +}
> > >
> > > Why does that return int? Should it be "unsigned long", maybe? Also,
> > > that cast is implicit in the return and shouldn't be needed.
> > >
> >
> > It returns int because the bit functions like assign_bit() expect an int
> > for the bit index, not an unsigned long or anything else.
>
> You don't need to explicitly cast between int and unsigned long. It'll
> probably hide more bugs than it reveals.
>
Ok
> > > > /*
> > > > + * RCLM_SHIFT is the number of bits that a gfp_mask has to be shifted right
> > > > + * to have just the __GFP_USER and __GFP_KERNRCLM bits. The static check is
> > > > + * made afterwards in case the GFP flags are not updated without updating
> > > > + * this number
> > > > + */
> > > > +#define RCLM_SHIFT 19
> > > > +#if (__GFP_USER >> RCLM_SHIFT) != RCLM_USER
> > > > +#error __GFP_USER not mapping to RCLM_USER
> > > > +#endif
> > > > +#if (__GFP_KERNRCLM >> RCLM_SHIFT) != RCLM_KERN
> > > > +#error __GFP_KERNRCLM not mapping to RCLM_KERN
> > > > +#endif
> > >
> > > Should this really be in page_alloc.c, or should it be close to the
> > > RCLM_* definitions?
> >
> > I can't test it right now, but I think the reason it is here is because
> > RCLM_* and __GFP_* are in different headers that are not aware of each
> > other. This is the place a static compile-time check can be made.
>
> Well, they're pretty intricately linked, so maybe they should go in the
> same header, no?
>
Will investigate. I can't at the moment.
> > It was pointed out that type used for use with the bit functions should
> > all be unsigned long, not int as they were previously. However, I found if
> > I used unsigned long throughout the code, including for array operations,
> > there was a 10-12% slowdown in AIM9. These casts were the compromise.
> > alloctype is unsigned long when used with the functions like assign_bit()
> > but int every other time.
>
> Why does it slow down? Do you have any detailed profiles?
>
I have no idea; it made no sense to me at all. I did find that it was
only the pcpu code that really suffered, but I didn't figure out why.
Next time I am testing (probably Monday), I'll gather the profiles.
> > In this case, there is an implicit cast so the cast is redundent if that
> > is the problem you are pointing out. I can remove the explicit casts that
> > are dotted around the place.
>
> There needs to be a reason for the casts. They certainly don't help
> readability or correctness, so there needs to be some justification. If
> there are performance reasons somehow, they need to be analyzed as well.
>
I'll recheck it.
--
Mel Gorman
Part-time Phd Student Java Applications Developer
University of Limerick IBM Dublin Software Lab
* Re: [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap
2005-10-13 14:18 ` Dave Hansen
2005-10-13 14:35 ` Mel Gorman
@ 2005-10-16 20:44 ` Mel Gorman
1 sibling, 0 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-16 20:44 UTC (permalink / raw)
To: Dave Hansen; +Cc: jschopp, kravetz, Linux Kernel Mailing List, linux-mm, lhms
On Thu, 13 Oct 2005, Dave Hansen wrote:
> > > > + * RCLM_SHIFT is the number of bits that a gfp_mask has to be shifted right
> > > > + * to have just the __GFP_USER and __GFP_KERNRCLM bits. The static check is
> > > > + * made afterwards in case the GFP flags are not updated without updating
> > > > + * this number
> > > > + */
> > > > +#define RCLM_SHIFT 19
> > > > +#if (__GFP_USER >> RCLM_SHIFT) != RCLM_USER
> > > > +#error __GFP_USER not mapping to RCLM_USER
> > > > +#endif
> > > > +#if (__GFP_KERNRCLM >> RCLM_SHIFT) != RCLM_KERN
> > > > +#error __GFP_KERNRCLM not mapping to RCLM_KERN
> > > > +#endif
> > >
> > > Should this really be in page_alloc.c, or should it be close to the
> > > RCLM_* definitions?
> >
> > I can't test it right now, but I think the reason it is here is because
> > RCLM_* and __GFP_* are in different headers that are not aware of each
> > other. This is the place a static compile-time check can be made.
>
> Well, they're pretty intricately linked, so maybe they should go in the
> same header, no?
>
I looked into this more and I did have a good reason for putting them in
different headers. The __GFP_* flags have to be defined with the other GFP
flags, it just does not make sense otherwise. The RCLM_* flags must be
with the definition of struct zone because they determine the number of
free lists that exist in the zone. There is no obvious way to have the
RCLM_* and __GFP_* flags in the same place unless mmzone.h includes gfp.h
which I seriously doubt we want.
The value of RCLM_SHIFT depends on both __GFP_* and RCLM_* so it needs to
be defined in a place that can see both gfp.h and mmzone.h . As
page_alloc.c is the only user of RCLM_SHIFT, it made sense to define it
there.
Is there a clearer way of doing this?
* Re: [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap
2005-10-13 14:10 ` Mel Gorman
2005-10-13 14:18 ` Dave Hansen
@ 2005-10-13 16:33 ` Joel Schopp
1 sibling, 0 replies; 27+ messages in thread
From: Joel Schopp @ 2005-10-13 16:33 UTC (permalink / raw)
To: Mel Gorman
Cc: Dave Hansen, Andrew Morton, kravetz, Linux Kernel Mailing List,
linux-mm, lhms
>>>@@ -473,6 +491,15 @@ extern struct pglist_data contig_page_da
>>> #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
>>> #error Allocator MAX_ORDER exceeds SECTION_SIZE
>>> #endif
>>>+#if ((SECTION_SIZE_BITS - MAX_ORDER) * BITS_PER_RCLM_TYPE) > 64
>>>+#error free_area_usemap is not big enough
>>>+#endif
>>
>>Every time I look at these patches, I see this #if, and I don't remember
>>what that '64' means. Can it please get a real name?
>>
>
>
> Joel, suggestions?
Oh yeah, blame it on me just because I wrote that bit of code. How about
#define FREE_AREA_USEMAP_SIZE 64
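so the two places would then become (sketch):

#if ((SECTION_SIZE_BITS - MAX_ORDER) * BITS_PER_RCLM_TYPE) > FREE_AREA_USEMAP_SIZE
#error free_area_usemap is not big enough
#endif

        /* and in struct mem_section */
        DECLARE_BITMAP(free_area_usemap, FREE_AREA_USEMAP_SIZE);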
>
>
>>>+/* Usemap initialisation */
>>>+#ifdef CONFIG_SPARSEMEM
>>>+static inline void setup_usemap(struct pglist_data *pgdat,
>>>+ struct zone *zone, unsigned long zonesize) {}
>>>+#endif /* CONFIG_SPARSEMEM */
>>>
>>> struct page;
>>> struct mem_section {
>>>@@ -485,6 +512,7 @@ struct mem_section {
>>> * before using it wrong.
>>> */
>>> unsigned long section_mem_map;
>>>+ DECLARE_BITMAP(free_area_usemap,64);
>>> };
>>
>>There's that '64' again! You need a space after the comma, too.
Ditto.
>>>+ * RCLM_SHIFT is the number of bits that a gfp_mask has to be shifted right
>>>+ * to have just the __GFP_USER and __GFP_KERNRCLM bits. The static check is
>>>+ * made afterwards in case the GFP flags are not updated without updating
>>>+ * this number
>>>+ */
>>>+#define RCLM_SHIFT 19
>>>+#if (__GFP_USER >> RCLM_SHIFT) != RCLM_USER
>>>+#error __GFP_USER not mapping to RCLM_USER
>>>+#endif
>>>+#if (__GFP_KERNRCLM >> RCLM_SHIFT) != RCLM_KERN
>>>+#error __GFP_KERNRCLM not mapping to RCLM_KERN
>>>+#endif
>>
>>Should this really be in page_alloc.c, or should it be close to the
>>RCLM_* definitions?
I had the same first impression, but concluded this was the best place. The
compile time checks should keep things from getting out of sync.
* [PATCH 3/8] Fragmentation Avoidance V17: 003_fragcore
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
2005-10-11 15:12 ` [PATCH 1/8] Fragmentation Avoidance V17: 001_antidefrag_flags Mel Gorman
2005-10-11 15:12 ` [PATCH 2/8] Fragmentation Avoidance V17: 002_usemap Mel Gorman
@ 2005-10-11 15:12 ` Mel Gorman
2005-10-11 15:12 ` [PATCH 4/8] Fragmentation Avoidance V17: 004_markfree Mel Gorman
` (5 subsequent siblings)
8 siblings, 0 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-11 15:12 UTC (permalink / raw)
To: akpm; +Cc: jschopp, Mel Gorman, kravetz, linux-kernel, linux-mm, lhms-devel
This patch adds the core of the anti-fragmentation strategy. It works by
grouping related allocation types together. The idea is that large groups of
pages that may be reclaimed are placed near each other. The zone->free_area
list is broken into three free lists, one for each RCLM_TYPE.
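As a sketch of the new layout (the names are those introduced earlier in the
series; the local variable is illustrative only), free pages of a given order
now sit on one of three lists depending on what the enclosing 2^(MAX_ORDER-1)
block was last split for:

        struct free_area free_area_lists[RCLM_TYPES][MAX_ORDER];

        /* e.g. the order-0 free list for user-reclaimable groupings */
        struct free_area *area = &zone->free_area_lists[RCLM_USER][0];

__rmqueue() searches the lists of the allocation's own type first and falls
back to taking a 2^(MAX_ORDER-1) block from another type via steal_largepage().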
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Joel Schopp <jschopp@austin.ibm.com>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-002_usemap/include/linux/mmzone.h linux-2.6.14-rc3-003_fragcore/include/linux/mmzone.h
--- linux-2.6.14-rc3-002_usemap/include/linux/mmzone.h 2005-10-11 12:07:26.000000000 +0100
+++ linux-2.6.14-rc3-003_fragcore/include/linux/mmzone.h 2005-10-11 12:08:07.000000000 +0100
@@ -32,6 +32,10 @@
#define RCLM_TYPES 3
#define BITS_PER_RCLM_TYPE 2
+#define for_each_rclmtype_order(type, order) \
+ for (order = 0; order < MAX_ORDER; order++) \
+ for (type = 0; type < RCLM_TYPES; type++)
+
struct free_area {
struct list_head free_list;
unsigned long nr_free;
@@ -148,7 +152,6 @@ struct zone {
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];
#ifndef CONFIG_SPARSEMEM
/*
@@ -158,6 +161,8 @@ struct zone {
unsigned long *free_area_usemap;
#endif
+ struct free_area free_area_lists[RCLM_TYPES][MAX_ORDER];
+
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-002_usemap/mm/page_alloc.c linux-2.6.14-rc3-003_fragcore/mm/page_alloc.c
--- linux-2.6.14-rc3-002_usemap/mm/page_alloc.c 2005-10-11 12:07:26.000000000 +0100
+++ linux-2.6.14-rc3-003_fragcore/mm/page_alloc.c 2005-10-11 12:08:07.000000000 +0100
@@ -314,6 +314,16 @@ __find_combined_index(unsigned long page
}
/*
+ * Return the free list for a given page within a zone
+ */
+static inline struct free_area *__page_find_freelist(struct zone *zone,
+ struct page *page)
+{
+ int type = (int)get_pageblock_type(zone, page);
+ return zone->free_area_lists[type];
+}
+
+/*
* This function checks whether a page is free && is the buddy
* we can do coalesce a page and its buddy if
* (a) the buddy is free &&
@@ -361,6 +371,8 @@ static inline void __free_pages_bulk (st
{
unsigned long page_idx;
int order_size = 1 << order;
+ struct free_area *area;
+ struct free_area *freelist;
if (unlikely(order))
destroy_compound_page(page, order);
@@ -370,10 +382,11 @@ static inline void __free_pages_bulk (st
BUG_ON(page_idx & (order_size - 1));
BUG_ON(bad_range(zone, page));
+ freelist = __page_find_freelist(zone, page);
+
zone->free_pages += order_size;
while (order < MAX_ORDER-1) {
unsigned long combined_idx;
- struct free_area *area;
struct page *buddy;
combined_idx = __find_combined_index(page_idx, order);
@@ -384,7 +397,7 @@ static inline void __free_pages_bulk (st
if (!page_is_buddy(buddy, order))
break; /* Move the buddy up one level. */
list_del(&buddy->lru);
- area = zone->free_area + order;
+ area = &(freelist[order]);
area->nr_free--;
rmv_page_order(buddy);
page = page + (combined_idx - page_idx);
@@ -392,8 +405,8 @@ static inline void __free_pages_bulk (st
order++;
}
set_page_order(page, order);
- list_add(&page->lru, &zone->free_area[order].free_list);
- zone->free_area[order].nr_free++;
+ list_add_tail(&page->lru, &freelist[order].free_list);
+ freelist[order].nr_free++;
}
static inline void free_pages_check(const char *function, struct page *page)
@@ -548,6 +561,47 @@ static void prep_new_page(struct page *p
kernel_map_pages(page, 1 << order, 1);
}
+/*
+ * Find a list that has a 2^(MAX_ORDER-1) block of pages available and
+ * return it
+ */
+struct page* steal_largepage(struct zone *zone, int alloctype)
+{
+ struct page *page;
+ struct free_area *area = NULL;
+ int i = 0;
+
+ for(i = 0; i < RCLM_TYPES; i++) {
+ if (i == alloctype)
+ continue;
+
+ area = &zone->free_area_lists[i][MAX_ORDER-1];
+ if (!list_empty(&area->free_list))
+ break;
+ }
+ if (i == RCLM_TYPES)
+ return NULL;
+
+ page = list_entry(area->free_list.next, struct page, lru);
+ area->nr_free--;
+
+ set_pageblock_type(zone, page, (unsigned long)alloctype);
+
+ return page;
+}
+
+
+static inline struct page *
+remove_page(struct zone *zone, struct page *page, unsigned int order,
+ unsigned int current_order, struct free_area *area)
+{
+ list_del(&page->lru);
+ rmv_page_order(page);
+ zone->free_pages -= 1UL << order;
+ return expand(zone, page, order, current_order, area);
+
+}
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
@@ -555,32 +609,25 @@ static void prep_new_page(struct page *p
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int alloctype)
{
- struct free_area * area;
+ struct free_area * area = NULL;
unsigned int current_order;
struct page *page;
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
- area = zone->free_area + current_order;
+ area = &(zone->free_area_lists[alloctype][current_order]);
if (list_empty(&area->free_list))
continue;
page = list_entry(area->free_list.next, struct page, lru);
- list_del(&page->lru);
- rmv_page_order(page);
area->nr_free--;
- zone->free_pages -= 1UL << order;
-
- /*
- * If splitting a large block, record what the block is being
- * used for in the usemap
- */
- if (current_order == MAX_ORDER-1)
- set_pageblock_type(zone, page,
- (unsigned long)alloctype);
-
- return expand(zone, page, order, current_order, area);
+ return remove_page(zone, page, order, current_order, area);
}
+ /* Allocate a MAX_ORDER block */
+ page = steal_largepage(zone, alloctype);
+ if (page != NULL)
+ return remove_page(zone, page, order, MAX_ORDER-1, area);
+
return NULL;
}
@@ -666,9 +713,9 @@ static void __drain_pages(unsigned int c
void mark_free_pages(struct zone *zone)
{
unsigned long zone_pfn, flags;
- int order;
+ int order, t;
+ unsigned long start_pfn, i;
struct list_head *curr;
-
if (!zone->spanned_pages)
return;
@@ -676,14 +723,12 @@ void mark_free_pages(struct zone *zone)
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
- for (order = MAX_ORDER - 1; order >= 0; --order)
- list_for_each(curr, &zone->free_area[order].free_list) {
- unsigned long start_pfn, i;
-
+ for_each_rclmtype_order(t, order) {
+ list_for_each(curr,&zone->free_area_lists[t][order].free_list){
start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
-
for (i=0; i < (1<<order); i++)
SetPageNosaveFree(pfn_to_page(start_pfn+i));
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -834,6 +879,7 @@ int zone_watermark_ok(struct zone *z, in
/* free_pages my go negative - that's OK */
long min = mark, free_pages = z->free_pages - (1 << order) + 1;
int o;
+ struct free_area *kernnorclm, *kernrclm, *userrclm;
if (gfp_high)
min -= min / 2;
@@ -842,15 +888,21 @@ int zone_watermark_ok(struct zone *z, in
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
return 0;
+ kernnorclm = z->free_area_lists[RCLM_NORCLM];
+ kernrclm = z->free_area_lists[RCLM_KERN];
+ userrclm = z->free_area_lists[RCLM_USER];
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
-
+ free_pages -= (kernnorclm->nr_free + kernrclm->nr_free +
+ userrclm->nr_free) << o;
/* Require fewer higher order pages to be free */
min >>= 1;
if (free_pages <= min)
return 0;
+ kernnorclm++;
+ kernrclm++;
+ userrclm++;
}
return 1;
}
@@ -1389,6 +1441,7 @@ void show_free_areas(void)
unsigned long inactive;
unsigned long free;
struct zone *zone;
+ int type;
for_each_zone(zone) {
show_node(zone);
@@ -1471,7 +1524,9 @@ void show_free_areas(void)
}
for_each_zone(zone) {
- unsigned long nr, flags, order, total = 0;
+ unsigned long nr = 0;
+ unsigned long total = 0;
+ unsigned long flags,order;
show_node(zone);
printk("%s: ", zone->name);
@@ -1481,10 +1536,18 @@ void show_free_areas(void)
}
spin_lock_irqsave(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++) {
- nr = zone->free_area[order].nr_free;
+ for_each_rclmtype_order(type, order) {
+ nr += zone->free_area_lists[type][order].nr_free;
total += nr << order;
- printk("%lu*%lukB ", nr, K(1UL) << order);
+
+ /*
+ * if type has reached RCLM_TYPES-1, the free pages
+ * for this order have been summed up
+ */
+ if (type == RCLM_TYPES-1) {
+ printk("%lu*%lukB ", nr, K(1UL) << order);
+ nr = 0;
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
printk("= %lukB\n", K(total));
@@ -1784,9 +1847,14 @@ void zone_init_free_lists(struct pglist_
unsigned long size)
{
int order;
- for (order = 0; order < MAX_ORDER ; order++) {
- INIT_LIST_HEAD(&zone->free_area[order].free_list);
- zone->free_area[order].nr_free = 0;
+ int type;
+ struct free_area *area;
+
+ /* Initialise the three size-ordered lists of free_areas */
+ for_each_rclmtype_order(type, order) {
+ area = &(zone->free_area_lists[type][order]);
+ INIT_LIST_HEAD(&area->free_list);
+ area->nr_free = 0;
}
}
@@ -2178,16 +2246,26 @@ static int frag_show(struct seq_file *m,
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
- int order;
+ int order, type;
+ struct free_area *area;
+ unsigned long nr_bufs = 0;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
if (!zone->present_pages)
continue;
spin_lock_irqsave(&zone->lock, flags);
- seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
- for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ for_each_rclmtype_order(type, order) {
+ area = &(zone->free_area_lists[type][order]);
+ nr_bufs += area->nr_free;
+
+ if (type == RCLM_TYPES-1) {
+ seq_printf(m, "%6lu ", nr_bufs);
+ nr_bufs = 0;
+ }
+ }
+
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
* [PATCH 4/8] Fragmentation Avoidance V17: 004_markfree
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
` (2 preceding siblings ...)
2005-10-11 15:12 ` [PATCH 3/8] Fragmentation Avoidance V17: 003_fragcore Mel Gorman
@ 2005-10-11 15:12 ` Mel Gorman
2005-10-11 15:12 ` [PATCH 5/8] Fragmentation Avoidance V17: 005_fallback Mel Gorman
` (4 subsequent siblings)
8 siblings, 0 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-11 15:12 UTC (permalink / raw)
To: akpm; +Cc: jschopp, Mel Gorman, kravetz, linux-kernel, linux-mm, lhms-devel
This patch alters show_free_areas() to print out the number of free pages
for each reclaim type. Without this patch, only an aggregate number is
displayed. Before this patch, the output of show_free_areas() would include
something like:
DMA: 2*4kB 1*8kB 5*16kB 3*32kB 3*64kB 3*128kB 2*256kB 0*512kB 1*1024kB 1*2048kB 2*4096kB = 12544kB
Normal: 34*4kB 57*8kB 14*16kB 4*32kB 4*64kB 2*128kB 2*256kB 2*512kB 2*1024kB 2*2048kB 210*4096kB = 869296kB
HighMem: 1*4kB 0*8kB 15*16kB 23*32kB 11*64kB 10*128kB 2*256kB 2*512kB 1*1024kB 1*2048kB 153*4096kB = 634260kB
After this patch, it shows something like:
DMA: (2+0+0+0)2*4kB (1+0+0+0)1*8kB (5+0+0+0)5*16kB (3+0+0+0)3*32kB (3+0+0+0)3*64kB (3+0+0+0)3*128kB (2+0+0+0)2*256kB (0+0+0+0)0*512kB (1+0+0+0)1*1024kB (1+0+0+0)1*2048kB (2+0+0+0)2*4096kB = 12544kB
Normal: (21+0+13+0)34*4kB (52+1+4+0)57*8kB (12+0+2+0)14*16kB (2+1+1+0)4*32kB (3+1+0+0)4*64kB (1+0+1+0)2*128kB (1+1+0+0)2*256kB (1+1+0+0)2*512kB (1+0+1+0)2*1024kB (1+0+1+0)2*2048kB (210+0+0+0)210*4096kB = 869296kB
HighMem: (1+0+0+0)1*4kB (0+0+0+0)0*8kB (0+15+0+0)15*16kB (1+22+0+0)23*32kB (0+11+0+0)11*64kB (2+8+0+0)10*128kB (0+2+0+0)2*256kB (0+2+0+0)2*512kB (0+1+0+0)1*1024kB (1+0+0+0)1*2048kB (153+0+0+0)153*4096kB = 634260kB
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-003_fragcore/mm/page_alloc.c linux-2.6.14-rc3-004_markfree/mm/page_alloc.c
--- linux-2.6.14-rc3-003_fragcore/mm/page_alloc.c 2005-10-11 12:08:07.000000000 +0100
+++ linux-2.6.14-rc3-004_markfree/mm/page_alloc.c 2005-10-11 12:08:47.000000000 +0100
@@ -1524,12 +1524,14 @@ void show_free_areas(void)
}
for_each_zone(zone) {
- unsigned long nr = 0;
+ unsigned long tnr = 0;
unsigned long total = 0;
- unsigned long flags,order;
+ unsigned long nr,flags,order;
+
+
show_node(zone);
- printk("%s: ", zone->name);
+ printk("%s: (", zone->name);
if (!zone->present_pages) {
printk("empty\n");
continue;
@@ -1537,17 +1539,21 @@ void show_free_areas(void)
spin_lock_irqsave(&zone->lock, flags);
for_each_rclmtype_order(type, order) {
- nr += zone->free_area_lists[type][order].nr_free;
+ nr = zone->free_area_lists[type][order].nr_free;
+ tnr += nr;
total += nr << order;
+ printk("%lu", nr);
/*
* if type has reached RCLM_TYPES-1, the free pages
* for this order have been summed up
*/
if (type == RCLM_TYPES-1) {
- printk("%lu*%lukB ", nr, K(1UL) << order);
+ printk(")%lu*%lukB %s", tnr, K(1UL) << order,
+ order == MAX_ORDER-1 ? "" : "(");
nr = 0;
- }
+ } else
+ printk("+");
}
spin_unlock_irqrestore(&zone->lock, flags);
printk("= %lukB\n", K(total));
* [PATCH 5/8] Fragmentation Avoidance V17: 005_fallback
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
` (3 preceding siblings ...)
2005-10-11 15:12 ` [PATCH 4/8] Fragmentation Avoidance V17: 004_markfree Mel Gorman
@ 2005-10-11 15:12 ` Mel Gorman
2005-10-12 16:43 ` mike kravetz
2005-10-11 15:12 ` [PATCH 6/8] Fragmentation Avoidance V17: 006_largealloc_tryharder Mel Gorman
` (3 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: Mel Gorman @ 2005-10-11 15:12 UTC (permalink / raw)
To: akpm; +Cc: jschopp, Mel Gorman, kravetz, linux-kernel, linux-mm, lhms-devel
This patch implements fallback logic. In the event there are no 2^(MAX_ORDER-1)
blocks of pages left, this will help the system decide which list to use. The
highlights of the patch are:
o Define a RCLM_FALLBACK type for fallbacks
o Use a percentage of each zone for fallbacks. When a reserved pool of pages
is depleted, it will try to use RCLM_FALLBACK before using anything else.
This greatly reduces the number of fallbacks causing fragmentation without
needing complex balancing algorithms
o Add a fallback_reserve and fallback_balance so that the system knows how
many 2^(MAX_ORDER-1) blocks are being used for fallbacks and whether more need
to be reserved.
o Add a fallback_allocs[] array that determines the order in which free lists
are used for each allocation type (a rough sketch of the walk follows below)
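As a rough sketch (not the patch itself; the search of a type's free lists is
reduced to a callback), fallback_allocs[] is walked in order until a type that
still has free pages is found:

#define RCLM_NORCLM   0
#define RCLM_USER     1
#define RCLM_KERN     2
#define RCLM_FALLBACK 3
#define RCLM_TYPES    4	/* also used to terminate each row */

/* fallback order for each allocation type, copied from the patch below */
static const int fallback_allocs[RCLM_TYPES][RCLM_TYPES + 1] = {
	{ RCLM_NORCLM,   RCLM_FALLBACK, RCLM_KERN,   RCLM_USER, RCLM_TYPES },
	{ RCLM_KERN,     RCLM_FALLBACK, RCLM_NORCLM, RCLM_USER, RCLM_TYPES },
	{ RCLM_USER,     RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, RCLM_TYPES },
	{ RCLM_FALLBACK, RCLM_NORCLM,   RCLM_KERN,   RCLM_USER, RCLM_TYPES },
};

/*
 * Return the first type in the fallback order that still has free pages.
 * "has_free" stands in for searching that type's free_area lists.
 */
static int pick_fallback_type(int alloctype, int (*has_free)(int type))
{
	const int *list = fallback_allocs[alloctype];
	int i;

	for (i = 0; list[i] != RCLM_TYPES; i++)
		if (has_free(list[i]))
			return list[i];

	return -1;	/* every pool is depleted */
}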
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Joel Schopp <jschopp@austin.ibm.com>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-004_markfree/include/linux/mmzone.h linux-2.6.14-rc3-005_fallback/include/linux/mmzone.h
--- linux-2.6.14-rc3-004_markfree/include/linux/mmzone.h 2005-10-11 12:08:07.000000000 +0100
+++ linux-2.6.14-rc3-005_fallback/include/linux/mmzone.h 2005-10-11 12:09:27.000000000 +0100
@@ -29,7 +29,8 @@
#define RCLM_NORCLM 0
#define RCLM_USER 1
#define RCLM_KERN 2
-#define RCLM_TYPES 3
+#define RCLM_FALLBACK 3
+#define RCLM_TYPES 4
#define BITS_PER_RCLM_TYPE 2
#define for_each_rclmtype_order(type, order) \
@@ -163,6 +164,13 @@ struct zone {
struct free_area free_area_lists[RCLM_TYPES][MAX_ORDER];
+ /*
+ * Track what percentage of the zone should be used for fallbacks and
+ * how much is being currently used
+ */
+ unsigned long fallback_reserve;
+ long fallback_balance;
+
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
@@ -275,6 +283,17 @@ struct zonelist {
struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
};
+static inline void inc_reserve_count(struct zone* zone, int type)
+{
+ if (type == RCLM_FALLBACK)
+ zone->fallback_reserve++;
+}
+
+static inline void dec_reserve_count(struct zone* zone, int type)
+{
+ if (type == RCLM_FALLBACK && zone->fallback_reserve)
+ zone->fallback_reserve--;
+}
/*
* The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-004_markfree/mm/page_alloc.c linux-2.6.14-rc3-005_fallback/mm/page_alloc.c
--- linux-2.6.14-rc3-004_markfree/mm/page_alloc.c 2005-10-11 12:08:47.000000000 +0100
+++ linux-2.6.14-rc3-005_fallback/mm/page_alloc.c 2005-10-11 12:09:27.000000000 +0100
@@ -53,6 +53,40 @@ unsigned long totalhigh_pages __read_mos
long nr_swap_pages;
/*
+ * fallback_allocs contains the fallback types for low memory conditions
+ * where the preferred allocation type is not available.
+ */
+int fallback_allocs[RCLM_TYPES][RCLM_TYPES+1] = {
+ {RCLM_NORCLM, RCLM_FALLBACK, RCLM_KERN, RCLM_USER, RCLM_TYPES},
+ {RCLM_KERN, RCLM_FALLBACK, RCLM_NORCLM, RCLM_USER, RCLM_TYPES},
+ {RCLM_USER, RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, RCLM_TYPES},
+ {RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, RCLM_USER, RCLM_TYPES}
+};
+
+/*
+ * Returns 1 if the required percentage of the zone is reserved for fallbacks
+ *
+ * fallback_balance and fallback_reserve are used to detect when the required
+ * percentage is reserved. fallback_balance is decremented when a
+ * 2^(MAX_ORDER-1) block is split and incremented when coalesced.
+ * fallback_reserve is incremented when a block is reserved for fallbacks
+ * and decremented when reassigned elsewhere.
+ *
+ */
+static inline int min_fallback_reserved(struct zone *zone) {
+ /* If fallback_balance is positive, we do not need to reserve */
+ if (zone->fallback_balance > 0)
+ return 1;
+
+ /*
+ * When fallback_balance is negative, a reserve is required. The number
+ * of reserved blocks required is related to the negative value of
+ * fallback_balance
+ */
+ return -(zone->fallback_balance) < zone->fallback_reserve;
+}
+
+/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
* 1G machine -> (16M dma, 784M normal, 224M high)
@@ -404,6 +438,8 @@ static inline void __free_pages_bulk (st
page_idx = combined_idx;
order++;
}
+ if (unlikely(order == MAX_ORDER-1))
+ zone->fallback_balance++;
set_page_order(page, order);
list_add_tail(&page->lru, &freelist[order].free_list);
freelist[order].nr_free++;
@@ -585,7 +621,12 @@ struct page* steal_largepage(struct zone
page = list_entry(area->free_list.next, struct page, lru);
area->nr_free--;
+ if (!min_fallback_reserved(zone))
+ alloctype = RCLM_FALLBACK;
+
set_pageblock_type(zone, page, (unsigned long)alloctype);
+ dec_reserve_count(zone, i);
+ inc_reserve_count(zone, alloctype);
return page;
}
@@ -595,6 +636,8 @@ static inline struct page *
remove_page(struct zone *zone, struct page *page, unsigned int order,
unsigned int current_order, struct free_area *area)
{
+ if (unlikely(current_order == MAX_ORDER-1))
+ zone->fallback_balance--;
list_del(&page->lru);
rmv_page_order(page);
zone->free_pages -= 1UL << order;
@@ -602,6 +645,83 @@ remove_page(struct zone *zone, struct pa
}
+/*
+ * If we are falling back, and the allocation is KERNNORCLM,
+ * then reserve any buddies for the KERNNORCLM pool. These
+ * allocations fragment the worst so this helps keep them
+ * in the one place
+ */
+static inline struct free_area *
+fallback_buddy_reserve(int start_alloctype, struct zone *zone,
+ unsigned int current_order, struct page *page,
+ struct free_area *area)
+{
+ if (start_alloctype != RCLM_NORCLM)
+ return area;
+
+ area = &(zone->free_area_lists[RCLM_NORCLM][current_order]);
+
+ /* Reserve the whole block if this is a large split */
+ if (current_order >= MAX_ORDER / 2) {
+ int reserve_type = RCLM_NORCLM;
+ dec_reserve_count(zone, get_pageblock_type(zone,page));
+
+ /*
+ * Use this block for fallbacks if the
+ * minimum reserve is not being met
+ */
+ if (!min_fallback_reserved(zone))
+ reserve_type = RCLM_FALLBACK;
+
+ set_pageblock_type(zone, page, (unsigned long)reserve_type);
+ inc_reserve_count(zone, reserve_type);
+ }
+ return area;
+}
+
+static struct page *
+fallback_alloc(int alloctype, struct zone *zone, unsigned int order)
+{
+ int *fallback_list;
+ int start_alloctype = alloctype;
+ struct free_area *area;
+ unsigned int current_order;
+ struct page *page;
+ int i;
+
+ /* Ok, pick the fallback order based on the type */
+ BUG_ON(alloctype >= RCLM_TYPES);
+ fallback_list = fallback_allocs[alloctype];
+
+ /*
+ * Here, the alloc type lists have been depleted as well as the global
+ * pool, so fallback. When falling back, the largest possible block
+ * will be taken to keep the fallbacks clustered if possible
+ */
+ for (i = 0; fallback_list[i] != RCLM_TYPES; i++) {
+ alloctype = fallback_list[i];
+
+ /* Find a block to allocate */
+ area = &(zone->free_area_lists[alloctype][MAX_ORDER-1]);
+ for (current_order = MAX_ORDER - 1; current_order > order;
+ current_order--, area--) {
+ if (list_empty(&area->free_list))
+ continue;
+
+ page = list_entry(area->free_list.next,
+ struct page, lru);
+ area->nr_free--;
+ area = fallback_buddy_reserve(start_alloctype, zone,
+ current_order, page, area);
+ return remove_page(zone, page, order,
+ current_order, area);
+
+ }
+ }
+
+ return NULL;
+}
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
@@ -628,7 +748,8 @@ static struct page *__rmqueue(struct zon
if (page != NULL)
return remove_page(zone, page, order, MAX_ORDER-1, area);
- return NULL;
+ /* Try falling back */
+ return fallback_alloc(alloctype, zone, order);
}
/*
@@ -2106,6 +2227,10 @@ static void __init free_area_init_core(s
spin_lock_init(&zone->lru_lock);
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->fallback_reserve = 0;
+
+ /* Set the balance so about 12.5% will be used for fallbacks */
+ zone->fallback_balance = -(realsize >> (MAX_ORDER+2));
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
* Re: [PATCH 5/8] Fragmentation Avoidance V17: 005_fallback
2005-10-11 15:12 ` [PATCH 5/8] Fragmentation Avoidance V17: 005_fallback Mel Gorman
@ 2005-10-12 16:43 ` mike kravetz
2005-10-12 17:21 ` Mel Gorman
0 siblings, 1 reply; 27+ messages in thread
From: mike kravetz @ 2005-10-12 16:43 UTC (permalink / raw)
To: Mel Gorman; +Cc: akpm, jschopp, linux-kernel, linux-mm, lhms-devel
On Tue, Oct 11, 2005 at 04:12:47PM +0100, Mel Gorman wrote:
> This patch implements fallback logic. In the event there are no 2^(MAX_ORDER-1)
> blocks of pages left, this will help the system decide which list to use. The
> highlights of the patch are:
>
> o Define a RCLM_FALLBACK type for fallbacks
> o Use a percentage of each zone for fallbacks. When a reserved pool of pages
> is depleted, it will try to use RCLM_FALLBACK before using anything else.
> This greatly reduces the number of fallbacks causing fragmentation without
> needing complex balancing algorithms
I'm having a little trouble seeing how adding a new type (RCLM_FALLBACK)
helps. Seems to me that pages put into the RCLM_FALLBACK area would have
gone to the global free list and available to anyone. I must be missing
something here.
> +int fallback_allocs[RCLM_TYPES][RCLM_TYPES+1] = {
> + {RCLM_NORCLM, RCLM_FALLBACK, RCLM_KERN, RCLM_USER, RCLM_TYPES},
> + {RCLM_KERN, RCLM_FALLBACK, RCLM_NORCLM, RCLM_USER, RCLM_TYPES},
> + {RCLM_USER, RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, RCLM_TYPES},
> + {RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, RCLM_USER, RCLM_TYPES}
Do you really need that last line? Can an allocation of type RCLM_FALLBACK
really be made?
--
Mike
* Re: [PATCH 5/8] Fragmentation Avoidance V17: 005_fallback
2005-10-12 16:43 ` mike kravetz
@ 2005-10-12 17:21 ` Mel Gorman
2005-10-12 17:29 ` [Lhms-devel] " Joel Schopp
0 siblings, 1 reply; 27+ messages in thread
From: Mel Gorman @ 2005-10-12 17:21 UTC (permalink / raw)
To: mike kravetz; +Cc: akpm, jschopp, linux-kernel, linux-mm, lhms-devel
On Wed, 12 Oct 2005, mike kravetz wrote:
> On Tue, Oct 11, 2005 at 04:12:47PM +0100, Mel Gorman wrote:
> > This patch implements fallback logic. In the event there are no 2^(MAX_ORDER-1)
> > blocks of pages left, this will help the system decide which list to use. The
> > highlights of the patch are:
> >
> > o Define a RCLM_FALLBACK type for fallbacks
> > o Use a percentage of each zone for fallbacks. When a reserved pool of pages
> > is depleted, it will try to use RCLM_FALLBACK before using anything else.
> > This greatly reduces the number of fallbacks causing fragmentation without
> > needing complex balancing algorithms
>
> I'm having a little trouble seeing how adding a new type (RCLM_FALLBACK)
> helps.
When a pool for allocations is depleted, it has to fall back to somewhere.
The pools will never be just the right size, and balancing them would be
*very* difficult. The RCLM_FALLBACK area acts as a buffer for fallbacks to
give page reclaim a chance to free up pages in the proper pools.
With stats enabled, you can see the fallback counts. Right now
inc_fallback_count() counts a "real" fallback when the requested pool and
RCLM_FALLBACK are depleted. If you alter inc_fallback_count() to always
count, you'll get an idea of how often RCLM_FALLBACK is used. Without the
fallback area, the strategy suffers pretty badly.
> Seems to me that pages put into the RCLM_FALLBACK area would have
> gone to the global free list and available to anyone.
Not quite, they would have gone to the pool they were first reserved as.
This could mean that the USERRCLM pool could end up with a lot of free
pages that kernel allocations then fallback to. This would make a mess of
the whole strategy.
> I must be missing
> something here.
>
> > +int fallback_allocs[RCLM_TYPES][RCLM_TYPES+1] = {
> > + {RCLM_NORCLM, RCLM_FALLBACK, RCLM_KERN, RCLM_USER, RCLM_TYPES},
> > + {RCLM_KERN, RCLM_FALLBACK, RCLM_NORCLM, RCLM_USER, RCLM_TYPES},
> > + {RCLM_USER, RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, RCLM_TYPES},
> > + {RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, RCLM_USER, RCLM_TYPES}
>
> Do you really need that last line? Can an allocation of type RCLM_FALLBACK
> really be made?
>
In reality, no and it would only happen if a caller had specified both
__GFP_USER and __GFP_KERNRCLM in the call to alloc_pages() or friends. It
makes *no* sense for someone to do this, but if they did, an oops would be
thrown during an interrupt. The alternative is to get rid of this last
element and put a BUG_ON() check before the spinlock is taken.
This way, a stupid caller will damage the fragmentation strategy (which is
bad). The alternative, the kernel will call BUG() (which is bad). The
question is, which is worse?
--
Mel Gorman
Part-time Phd Student Java Applications Developer
University of Limerick IBM Dublin Software Lab
* Re: [Lhms-devel] Re: [PATCH 5/8] Fragmentation Avoidance V17: 005_fallback
2005-10-12 17:21 ` Mel Gorman
@ 2005-10-12 17:29 ` Joel Schopp
2005-10-14 5:12 ` Dave Hansen
0 siblings, 1 reply; 27+ messages in thread
From: Joel Schopp @ 2005-10-12 17:29 UTC (permalink / raw)
To: Mel Gorman; +Cc: mike kravetz, akpm, linux-kernel, linux-mm, lhms-devel
> In reality, no and it would only happen if a caller had specified both
> __GFP_USER and __GFP_KERNRCLM in the call to alloc_pages() or friends. It
> makes *no* sense for someone to do this, but if they did, an oops would be
> thrown during an interrupt. The alternative is to get rid of this last
> element and put a BUG_ON() check before the spinlock is taken.
>
> This way, a stupid caller will damage the fragmentation strategy (which is
> bad). The alternative, the kernel will call BUG() (which is bad). The
> question is, which is worse?
>
If in the future we hypothetically have code that damages the fragmentation
strategy we want to find it sooner rather than never. I'd rather some kernels
BUG() than we have bugs which go unnoticed.
* Re: [Lhms-devel] Re: [PATCH 5/8] Fragmentation Avoidance V17: 005_fallback
2005-10-12 17:29 ` [Lhms-devel] " Joel Schopp
@ 2005-10-14 5:12 ` Dave Hansen
0 siblings, 0 replies; 27+ messages in thread
From: Dave Hansen @ 2005-10-14 5:12 UTC (permalink / raw)
To: Joel Schopp
Cc: Mel Gorman, Mike Kravetz, Andrew Morton,
Linux Kernel Mailing List, linux-mm, lhms
On Wed, 2005-10-12 at 12:29 -0500, Joel Schopp wrote:
> > In reality, no and it would only happen if a caller had specified both
> > __GFP_USER and __GFP_KERNRCLM in the call to alloc_pages() or friends. It
> > makes *no* sense for someone to do this, but if they did, an oops would be
> > thrown during an interrupt. The alternative is to get rid of this last
> > element and put a BUG_ON() check before the spinlock is taken.
> >
> > This way, a stupid caller will damage the fragmentation strategy (which is
> > bad). The alternative, the kernel will call BUG() (which is bad). The
> > question is, which is worse?
> >
>
> If in the future we hypothetically have code that damages the fragmentation
> strategy we want to find it sooner rather than never. I'd rather some kernels
> BUG() than we have bugs which go unnoticed.
It isn't a bug. It's a normal
let-the-stupid-user-shoot-themselves-in-the-foot situation. Let's
explicitly document the fact that you can't pass both flags, then maybe
add a WARN_ON() or another printk. Or, we just fail the allocation.
Failing the allocation seems like the simplest and most effective
solution. A developer will run into it when they're developing, it
won't be killing off processes or locking things up like a BUG(), and it
doesn't ruin any of the fragmentation strategy. It also fits with the
current behavior if someone asks the allocator to do something silly
like give them memory from a non-present zone.
-- Dave
* [PATCH 6/8] Fragmentation Avoidance V17: 006_largealloc_tryharder
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
` (4 preceding siblings ...)
2005-10-11 15:12 ` [PATCH 5/8] Fragmentation Avoidance V17: 005_fallback Mel Gorman
@ 2005-10-11 15:12 ` Mel Gorman
2005-10-13 19:07 ` Joel Schopp
2005-10-11 15:12 ` [PATCH 7/8] Fragmentation Avoidance V17: 007_percpu Mel Gorman
` (2 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: Mel Gorman @ 2005-10-11 15:12 UTC (permalink / raw)
To: akpm; +Cc: jschopp, Mel Gorman, kravetz, linux-kernel, linux-mm, lhms-devel
The fragmentation avoidance patches increase our chances of satisfying
high-order allocations, so this patch makes more than one attempt at
fulfilling those allocations because, unlike before, the extra iterations
are often useful.
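Reduced to a sketch (the reclaim machinery is omitted, the names are
illustrative and only the decision itself is shown), the logic added to
__alloc_pages() behaves roughly like this after a failed attempt:

#define MAX_ORDER 11	/* assumed typical configuration value */

enum next_step {
	DECLARE_OOM,	/* low order: this really is an OOM condition */
	RETRY_HARDER,	/* high order: another reclaim pass is worthwhile */
	NORMAL_PATH	/* high-order retries exhausted: back to the usual path */
};

/* highorder_retry starts at 3, as in the patch */
static enum next_step after_failed_attempt(unsigned int order,
					   int *highorder_retry)
{
	if (order < MAX_ORDER / 2)
		return DECLARE_OOM;

	if (--(*highorder_retry) > 0)
		return RETRY_HARDER;

	return NORMAL_PATH;
}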
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Mike Kravetz <kravetz@us.ibm.com>
Signed-off-by: Joel Schopp <jschopp@austin.ibm.com>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-005_fallback/mm/page_alloc.c linux-2.6.14-rc3-006_largealloc_tryharder/mm/page_alloc.c
--- linux-2.6.14-rc3-005_fallback/mm/page_alloc.c 2005-10-11 12:09:27.000000000 +0100
+++ linux-2.6.14-rc3-006_largealloc_tryharder/mm/page_alloc.c 2005-10-11 12:11:31.000000000 +0100
@@ -1055,6 +1055,7 @@ __alloc_pages(unsigned int __nocast gfp_
int do_retry;
int can_try_harder;
int did_some_progress;
+ int highorder_retry = 3;
might_sleep_if(wait);
@@ -1203,8 +1204,19 @@ rebalance:
goto got_pg;
}
- out_of_memory(gfp_mask, order);
+ if (order < MAX_ORDER / 2)
+ out_of_memory(gfp_mask, order);
+
+ /*
+ * Due to low fragmentation efforts, we try a little
+ * harder to satisfy high order allocations and only
+ * go OOM for low-order allocations
+ */
+ if (order >= MAX_ORDER/2 && --highorder_retry > 0)
+ goto rebalance;
+
goto restart;
+
}
/*
@@ -1220,6 +1232,8 @@ rebalance:
do_retry = 1;
if (gfp_mask & __GFP_NOFAIL)
do_retry = 1;
+ if (order >= MAX_ORDER/2 && --highorder_retry > 0)
+ do_retry = 1;
}
if (do_retry) {
blk_congestion_wait(WRITE, HZ/50);
* Re: [PATCH 6/8] Fragmentation Avoidance V17: 006_largealloc_tryharder
2005-10-11 15:12 ` [PATCH 6/8] Fragmentation Avoidance V17: 006_largealloc_tryharder Mel Gorman
@ 2005-10-13 19:07 ` Joel Schopp
2005-10-14 5:36 ` Dave Hansen
0 siblings, 1 reply; 27+ messages in thread
From: Joel Schopp @ 2005-10-13 19:07 UTC (permalink / raw)
To: Mel Gorman; +Cc: akpm, kravetz, linux-kernel, linux-mm, lhms-devel
This is version 17 which, counting the several versions I did while Mel was
preoccupied with his day job, makes well over 20 times this has been posted to
lkml, linux-mm, and the memory hotplug list.
All objections/feedback/suggestions that have been brought up on the lists are
fixed in the following version. It's starting to become almost silent when a
new version gets posted, possibly because everybody accepts the code as perfect,
possibly because they have grown bored with it. Probably a combination of both.
I'm guessing the reason this code hasn't been merged yet is because nobody has
really enumerated the benefits in a while. Here's my try at it.
Benefits of merging:
1. Reduced fragmentation
2. Better able to fulfill large allocations (see 1)
3. Fewer out-of-memory conditions (see 1)
4. Prerequisite for memory hotplug remove
5. Would be helpful for future development of active defragmentation
6. Also helpful for future development of demand-fault allocation of large pages
Downsides of merging:
It's been well tested on multiple architectures in multiple configurations, but
non-trivial changes to core subsystems should not be done lightly.
> @@ -1203,8 +1204,19 @@ rebalance:
> goto got_pg;
> }
>
> - out_of_memory(gfp_mask, order);
> + if (order < MAX_ORDER / 2)
> + out_of_memory(gfp_mask, order);
> +
> + /*
> + * Due to low fragmentation efforts, we try a little
> + * harder to satisfy high order allocations and only
> + * go OOM for low-order allocations
> + */
> + if (order >= MAX_ORDER/2 && --highorder_retry > 0)
> + goto rebalance;
> +
> goto restart;
> +
> }
If order >= MAX_ORDER/2 it doesn't call out_of_memory(). The logic behind it is
that we shouldn't go OOM for large-order allocations, because we aren't really
OOM. And if we can't satisfy these large allocations then killing processes
should have little chance of helping. Mel and I had discussed this privately,
agreed, and it is reflected in the comment.
But it's a bit of a behavior change and I didn't want it to go unnoticed. I
guess the question is, should existing behavior of going OOM even for large
order allocations be maintained? Or is this change a better way, especially in
light of the lower fragmentation and increased attempts, like we think it is?
* Re: [PATCH 6/8] Fragmentation Avoidance V17: 006_largealloc_tryharder
2005-10-13 19:07 ` Joel Schopp
@ 2005-10-14 5:36 ` Dave Hansen
0 siblings, 0 replies; 27+ messages in thread
From: Dave Hansen @ 2005-10-14 5:36 UTC (permalink / raw)
To: Joel Schopp
Cc: Mel Gorman, Andrew Morton, kravetz, Linux Kernel Mailing List,
linux-mm, lhms
On Thu, 2005-10-13 at 14:07 -0500, Joel Schopp wrote:
> This is version 17, plus several versions I did while Mel was preoccupied with
> his day job, makes well over 20 times this has been posted to the mailing lists
> that are lkml, linux-mm, and memory hotplug.
>
> All objections/feedback/suggestions that have been brought up on the lists are
> fixed in the following version. It's starting to become almost silent when a
> new version gets posted, possibly because everybody accepts the code as perfect,
> possibly because they have grown bored with it. Probably a combination of both.
I don't think it's shown signs of stabilizing quite yet. Each revision
has new code that needs to be cleaned up, and new obvious CodingStyle
issues. Let's see a couple of incrementally cleaner releases go by,
which don't have renewed old issues, and then we can think about asking
to get it merged elsewhere.
-- Dave
* [PATCH 7/8] Fragmentation Avoidance V17: 007_percpu
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
` (5 preceding siblings ...)
2005-10-11 15:12 ` [PATCH 6/8] Fragmentation Avoidance V17: 006_largealloc_tryharder Mel Gorman
@ 2005-10-11 15:12 ` Mel Gorman
2005-10-11 15:13 ` [PATCH 8/8] Fragmentation Avoidance V17: 008_stats Mel Gorman
2005-10-16 2:52 ` [PATCH 0/8] Fragmentation Avoidance V17 Paul Jackson
8 siblings, 0 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-11 15:12 UTC (permalink / raw)
To: akpm; +Cc: jschopp, Mel Gorman, kravetz, linux-kernel, linux-mm, lhms-devel
The free lists for each allocation type can slowly become corrupted because of
the per-cpu lists. Consider the following sequence of events:
1. A 2^(MAX_ORDER-1) block is reserved for __GFP_USER pages
2. An order-0 page is allocated from the newly reserved block
3. The page is freed and placed on the per-cpu list
4. alloc_page() is called with GFP_KERNEL as the gfp_mask
5. The per-cpu list is used to satisfy the allocation
Now, a kernel page sits in the middle of a __GFP_USER block. This means that
over long periods of time, the anti-fragmentation scheme slowly degrades to
the standard allocator.
This patch divides the per-cpu lists into kernel and user lists. RCLM_NORCLM
and RCLM_KERN use the kernel list and RCLM_USER uses the user list. Strictly
speaking, there should be three lists but, as little effort is made to reclaim
RCLM_KERN pages, it is not worth the overhead *yet*.
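As a rough sketch (the list_head type is a stand-in and only the fields
touched by this patch are shown), the per-cpu structure and the list
selection look like:

/* minimal stand-in for the kernel's struct list_head */
struct list_head {
	struct list_head *next, *prev;
};

/* Indices into the per-cpu lists */
#define PCPU_KERNEL 0
#define PCPU_USER   1
#define PCPU_TYPES  2

#define RCLM_USER   1	/* value taken from mmzone.h in this series */

struct per_cpu_pages {
	int count[PCPU_TYPES];			/* number of pages on each list */
	int low;				/* low watermark, refill needed */
	int high;				/* high watermark, emptying needed */
	int batch;				/* chunk size for buddy add/remove */
	struct list_head list[PCPU_TYPES];	/* the lists of pages */
};

/* RCLM_USER pages use the user list; everything else shares the kernel list */
static inline int pcpu_list_index(int rclm_type)
{
	return rclm_type == RCLM_USER ? PCPU_USER : PCPU_KERNEL;
}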
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-006_largealloc_tryharder/include/linux/mmzone.h linux-2.6.14-rc3-007_percpu/include/linux/mmzone.h
--- linux-2.6.14-rc3-006_largealloc_tryharder/include/linux/mmzone.h 2005-10-11 12:09:27.000000000 +0100
+++ linux-2.6.14-rc3-007_percpu/include/linux/mmzone.h 2005-10-11 12:12:12.000000000 +0100
@@ -59,12 +59,17 @@ struct zone_padding {
#define ZONE_PADDING(name)
#endif
+/* Indices into pcpu_list */
+#define PCPU_KERNEL 0
+#define PCPU_USER 1
+#define PCPU_TYPES 2
+
struct per_cpu_pages {
- int count; /* number of pages in the list */
+ int count[PCPU_TYPES]; /* Number of pages on each list */
int low; /* low watermark, refill needed */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
- struct list_head list; /* the list of pages */
+ struct list_head list[PCPU_TYPES]; /* the lists of pages */
};
struct per_cpu_pageset {
@@ -79,6 +84,10 @@ struct per_cpu_pageset {
#endif
} ____cacheline_aligned_in_smp;
+/* Helpers for per_cpu_pages */
+#define pset_count(pset) (pset.count[PCPU_KERNEL] + pset.count[PCPU_USER])
+#define for_each_pcputype(pindex) \
+ for (pindex=0; pindex < PCPU_TYPES; pindex++)
#ifdef CONFIG_NUMA
#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
#else
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-006_largealloc_tryharder/mm/page_alloc.c linux-2.6.14-rc3-007_percpu/mm/page_alloc.c
--- linux-2.6.14-rc3-006_largealloc_tryharder/mm/page_alloc.c 2005-10-11 12:11:31.000000000 +0100
+++ linux-2.6.14-rc3-007_percpu/mm/page_alloc.c 2005-10-11 12:12:12.000000000 +0100
@@ -783,7 +783,7 @@ static int rmqueue_bulk(struct zone *zon
void drain_remote_pages(void)
{
struct zone *zone;
- int i;
+ int i, pindex;
unsigned long flags;
local_irq_save(flags);
@@ -799,9 +799,16 @@ void drain_remote_pages(void)
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
- if (pcp->count)
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
+ for_each_pcputype(pindex) {
+ if (!pcp->count[pindex])
+ continue;
+
+ /* Try remove all pages from the pcpu list */
+ pcp->count[pindex] -=
+ free_pages_bulk(zone,
+ pcp->count[pindex],
+ &pcp->list[pindex], 0);
+ }
}
}
local_irq_restore(flags);
@@ -812,7 +819,7 @@ void drain_remote_pages(void)
static void __drain_pages(unsigned int cpu)
{
struct zone *zone;
- int i;
+ int i, pindex;
for_each_zone(zone) {
struct per_cpu_pageset *pset;
@@ -822,8 +829,16 @@ static void __drain_pages(unsigned int c
struct per_cpu_pages *pcp;
pcp = &pset->pcp[i];
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
+ for_each_pcputype(pindex) {
+ if (!pcp->count[pindex])
+ continue;
+
+ /* Try remove all pages from the pcpu list */
+ pcp->count[pindex] -=
+ free_pages_bulk(zone,
+ pcp->count[pindex],
+ &pcp->list[pindex], 0);
+ }
}
}
}
@@ -902,6 +917,7 @@ static void fastcall free_hot_cold_page(
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
+ int pindex;
arch_free_page(page, 0);
@@ -911,11 +927,21 @@ static void fastcall free_hot_cold_page(
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+
+ /*
+ * Strictly speaking, we should not be accessing the zone information
+ * here. In this case, it does not matter if the read is incorrect
+ */
+ if (get_pageblock_type(zone, page) == RCLM_USER)
+ pindex = PCPU_USER;
+ else
+ pindex = PCPU_KERNEL;
local_irq_save(flags);
- list_add(&page->lru, &pcp->list);
- pcp->count++;
- if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ list_add(&page->lru, &pcp->list[pindex]);
+ pcp->count[pindex]++;
+ if (pcp->count[pindex] >= pcp->high)
+ pcp->count[pindex] -= free_pages_bulk(zone, pcp->batch,
+ &pcp->list[pindex], 0);
local_irq_restore(flags);
put_cpu();
}
@@ -954,18 +980,25 @@ buffered_rmqueue(struct zone *zone, int
if (order == 0) {
struct per_cpu_pages *pcp;
-
+ int pindex = PCPU_KERNEL;
+ if (alloctype == RCLM_USER)
+ pindex = PCPU_USER;
+
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if (pcp->count <= pcp->low)
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list,
- alloctype);
- if (pcp->count) {
- page = list_entry(pcp->list.next, struct page, lru);
+ if (pcp->count[pindex] <= pcp->low)
+ pcp->count[pindex] += rmqueue_bulk(zone,
+ 0, pcp->batch,
+ &(pcp->list[pindex]),
+ alloctype);
+
+ if (pcp->count[pindex]) {
+ page = list_entry(pcp->list[pindex].next,
+ struct page, lru);
list_del(&page->lru);
- pcp->count--;
+ pcp->count[pindex]--;
}
+
local_irq_restore(flags);
put_cpu();
}
@@ -1603,7 +1636,7 @@ void show_free_areas(void)
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
pageset->pcp[temperature].batch,
- pageset->pcp[temperature].count);
+ pset_count(pageset->pcp[temperature]));
}
}
@@ -2055,18 +2088,22 @@ inline void setup_pageset(struct per_cpu
struct per_cpu_pages *pcp;
pcp = &p->pcp[0]; /* hot */
- pcp->count = 0;
+ pcp->count[PCPU_KERNEL] = 0;
+ pcp->count[PCPU_USER] = 0;
pcp->low = 2 * batch;
- pcp->high = 6 * batch;
+ pcp->high = 3 * batch;
pcp->batch = max(1UL, 1 * batch);
- INIT_LIST_HEAD(&pcp->list);
+ INIT_LIST_HEAD(&pcp->list[PCPU_KERNEL]);
+ INIT_LIST_HEAD(&pcp->list[PCPU_USER]);
pcp = &p->pcp[1]; /* cold*/
- pcp->count = 0;
+ pcp->count[PCPU_KERNEL] = 0;
+ pcp->count[PCPU_USER] = 0;
pcp->low = 0;
- pcp->high = 2 * batch;
+ pcp->high = batch;
pcp->batch = max(1UL, 1 * batch);
- INIT_LIST_HEAD(&pcp->list);
+ INIT_LIST_HEAD(&pcp->list[PCPU_KERNEL]);
+ INIT_LIST_HEAD(&pcp->list[PCPU_USER]);
}
#ifdef CONFIG_NUMA
@@ -2476,7 +2513,7 @@ static int zoneinfo_show(struct seq_file
pageset = zone_pcp(zone, i);
for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
- if (pageset->pcp[j].count)
+ if (pset_count(pageset->pcp[j]))
break;
}
if (j == ARRAY_SIZE(pageset->pcp))
@@ -2489,7 +2526,7 @@ static int zoneinfo_show(struct seq_file
"\n high: %i"
"\n batch: %i",
i, j,
- pageset->pcp[j].count,
+ pset_count(pageset->pcp[j]),
pageset->pcp[j].low,
pageset->pcp[j].high,
pageset->pcp[j].batch);
* [PATCH 8/8] Fragmentation Avoidance V17: 008_stats
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
` (6 preceding siblings ...)
2005-10-11 15:12 ` [PATCH 7/8] Fragmentation Avoidance V17: 007_percpu Mel Gorman
@ 2005-10-11 15:13 ` Mel Gorman
2005-10-12 11:57 ` [Lhms-devel] " Dave Hansen
2005-10-16 2:52 ` [PATCH 0/8] Fragmentation Avoidance V17 Paul Jackson
8 siblings, 1 reply; 27+ messages in thread
From: Mel Gorman @ 2005-10-11 15:13 UTC (permalink / raw)
To: akpm; +Cc: jschopp, Mel Gorman, kravetz, linux-kernel, linux-mm, lhms-devel
It is not necessary to apply this patch to get all the anti-fragmentation
code. This patch adds a new config option called CONFIG_ALLOCSTATS. If
set, a number of new bean counters are added that are related to the
anti-fragmentation code. The information is exported via /proc/buddyinfo. This
is very useful when debugging why high-order pages are not available for
allocation.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-007_percpu/include/linux/mmzone.h linux-2.6.14-rc3-008_stats/include/linux/mmzone.h
--- linux-2.6.14-rc3-007_percpu/include/linux/mmzone.h 2005-10-11 12:12:12.000000000 +0100
+++ linux-2.6.14-rc3-008_stats/include/linux/mmzone.h 2005-10-11 12:12:57.000000000 +0100
@@ -180,6 +180,19 @@ struct zone {
unsigned long fallback_reserve;
long fallback_balance;
+#ifdef CONFIG_ALLOCSTATS
+ /*
+ * These are beancounters that track how the placement policy
+ * of the buddy allocator is performing
+ */
+ unsigned long fallback_count[RCLM_TYPES];
+ unsigned long alloc_count[RCLM_TYPES];
+ unsigned long reserve_count[RCLM_TYPES];
+ unsigned long kernnorclm_full_steal;
+ unsigned long kernnorclm_partial_steal;
+#endif
+
+
ZONE_PADDING(_pad1_)
/* Fields commonly accessed by the page reclaim scanner */
@@ -269,6 +282,17 @@ struct zone {
char *name;
} ____cacheline_maxaligned_in_smp;
+#ifdef CONFIG_ALLOCSTATS
+#define inc_fallback_count(zone, type) zone->fallback_count[type]++
+#define inc_alloc_count(zone, type) zone->alloc_count[type]++
+#define inc_kernnorclm_partial_steal(zone) zone->kernnorclm_partial_steal++
+#define inc_kernnorclm_full_steal(zone) zone->kernnorclm_full_steal++
+#else
+#define inc_fallback_count(zone, type) do {} while (0)
+#define inc_alloc_count(zone, type) do {} while (0)
+#define inc_kernnorclm_partial_steal(zone) do {} while (0)
+#define inc_kernnorclm_full_steal(zone) do {} while (0)
+#endif
/*
* The "priority" of VM scanning is how much of the queues we will scan in one
@@ -296,12 +320,19 @@ static inline void inc_reserve_count(str
{
if (type == RCLM_FALLBACK)
zone->fallback_reserve++;
+#ifdef CONFIG_ALLOCSTATS
+ zone->reserve_count[type]++;
+#endif
}
static inline void dec_reserve_count(struct zone* zone, int type)
{
if (type == RCLM_FALLBACK && zone->fallback_reserve)
zone->fallback_reserve--;
+#ifdef CONFIG_ALLOCSTATS
+ if (zone->reserve_count[type] > 0)
+ zone->reserve_count[type]--;
+#endif
}
/*
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-007_percpu/lib/Kconfig.debug linux-2.6.14-rc3-008_stats/lib/Kconfig.debug
--- linux-2.6.14-rc3-007_percpu/lib/Kconfig.debug 2005-10-04 22:58:34.000000000 +0100
+++ linux-2.6.14-rc3-008_stats/lib/Kconfig.debug 2005-10-11 12:12:57.000000000 +0100
@@ -77,6 +77,17 @@ config SCHEDSTATS
application, you can say N to avoid the very slight overhead
this adds.
+config ALLOCSTATS
+ bool "Collection buddy allocator statistics"
+ depends on DEBUG_KERNEL && PROC_FS
+ help
+ If you say Y here, additional code will be inserted into the
+ page allocator routines to collect statistics on the allocator
+ behavior and provide them in /proc/buddyinfo. These stats are
+ useful for measuring fragmentation in the buddy allocator. If
+ you are not debugging or measuring the allocator, you can say N
+ to avoid the slight overhead this adds.
+
config DEBUG_SLAB
bool "Debug memory allocations"
depends on DEBUG_KERNEL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.14-rc3-007_percpu/mm/page_alloc.c linux-2.6.14-rc3-008_stats/mm/page_alloc.c
--- linux-2.6.14-rc3-007_percpu/mm/page_alloc.c 2005-10-11 12:12:12.000000000 +0100
+++ linux-2.6.14-rc3-008_stats/mm/page_alloc.c 2005-10-11 12:12:57.000000000 +0100
@@ -191,6 +191,11 @@ EXPORT_SYMBOL(zone_table);
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
int min_free_kbytes = 1024;
+#ifdef CONFIG_ALLOCSTATS
+static char *type_names[RCLM_TYPES] = { "KernNoRclm", "UserRclm",
+ "KernRclm", "Fallback"};
+#endif /* CONFIG_ALLOCSTATS */
+
unsigned long __initdata nr_kernel_pages;
unsigned long __initdata nr_all_pages;
@@ -675,6 +680,9 @@ fallback_buddy_reserve(int start_allocty
set_pageblock_type(zone, page, (unsigned long)reserve_type);
inc_reserve_count(zone, reserve_type);
+ inc_kernnorclm_full_steal(zone);
+ } else {
+ inc_kernnorclm_partial_steal(zone);
}
return area;
}
@@ -717,6 +725,15 @@ fallback_alloc(int alloctype, struct zon
current_order, area);
}
+
+ /*
+ * If the current alloctype is RCLM_FALLBACK, it means
+ * that the requested pool and fallback pool are both
+ * depleted and we are falling back to other pools.
+ * At this point, pools are starting to get fragmented
+ */
+ if (alloctype == RCLM_FALLBACK)
+ inc_fallback_count(zone, start_alloctype);
}
return NULL;
@@ -733,6 +750,8 @@ static struct page *__rmqueue(struct zon
unsigned int current_order;
struct page *page;
+ inc_alloc_count(zone, alloctype);
+
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area_lists[alloctype][current_order]);
if (list_empty(&area->free_list))
@@ -2335,6 +2354,20 @@ static void __init free_area_init_core(s
zone_start_pfn += size;
zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+#ifdef CONFIG_ALLOCSTAT
+ memset((unsigned long *)zone->fallback_count, 0,
+ sizeof(zone->fallback_count));
+ memset((unsigned long *)zone->alloc_count, 0,
+ sizeof(zone->alloc_count));
+ memset((unsigned long *)zone->alloc_count, 0,
+ sizeof(zone->alloc_count));
+ zone->kernnorclm_partial_steal=0;
+ zone->kernnorclm_full_steal=0;
+ zone->reserve_count[RCLM_NORCLM] =
+ realsize >> (MAX_ORDER-1);
+#endif
+
}
}
@@ -2431,6 +2464,18 @@ static int frag_show(struct seq_file *m,
int order, type;
struct free_area *area;
unsigned long nr_bufs = 0;
+#ifdef CONFIG_ALLOCSTATS
+ int i;
+ unsigned long kernnorclm_full_steal=0;
+ unsigned long kernnorclm_partial_steal=0;
+ unsigned long reserve_count[RCLM_TYPES];
+ unsigned long fallback_count[RCLM_TYPES];
+ unsigned long alloc_count[RCLM_TYPES];
+
+ memset(reserve_count, 0, sizeof(reserve_count));
+ memset(fallback_count, 0, sizeof(fallback_count));
+ memset(alloc_count, 0, sizeof(alloc_count));
+#endif
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
if (!zone->present_pages)
@@ -2451,6 +2496,79 @@ static int frag_show(struct seq_file *m,
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
+
+#ifdef CONFIG_ALLOCSTATS
+ /* Show statistics for each allocation type */
+ seq_printf(m, "\nPer-allocation-type statistics");
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (type=0; type < RCLM_TYPES; type++) {
+ struct list_head *elem;
+ seq_printf(m, "\nNode %d, zone %8s, type %10s ",
+ pgdat->node_id, zone->name,
+ type_names[type]);
+ for (order = 0; order < MAX_ORDER; ++order) {
+ nr_bufs = 0;
+
+ list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+ ++nr_bufs;
+ seq_printf(m, "%6lu ", nr_bufs);
+ }
+ }
+
+ /* Scan global list */
+ seq_printf(m, "\n");
+ seq_printf(m, "Node %d, zone %8s, type %10s",
+ pgdat->node_id, zone->name,
+ "MAX_ORDER");
+ nr_bufs = 0;
+ for (type=0; type < RCLM_TYPES; type++) {
+ nr_bufs += zone->free_area_lists[type][MAX_ORDER-1].nr_free;
+ }
+ seq_printf(m, "%6lu ", nr_bufs);
+ seq_printf(m, "\n");
+
+ seq_printf(m, "%s Zone beancounters\n", zone->name);
+ seq_printf(m, "Fallback balance: %d\n", (int)zone->fallback_balance);
+ seq_printf(m, "Fallback reserve: %d\n", (int)zone->fallback_reserve);
+ seq_printf(m, "Partial steal: %lu\n", zone->kernnorclm_partial_steal);
+ seq_printf(m, "Full steal: %lu\n", zone->kernnorclm_full_steal);
+
+ kernnorclm_partial_steal += zone->kernnorclm_partial_steal;
+ kernnorclm_full_steal += zone->kernnorclm_full_steal;
+ seq_putc(m, '\n');
+
+ for (i=0; i< RCLM_TYPES; i++) {
+ seq_printf(m, "%-10s Allocs: %-10lu Reserve: %-10lu Fallbacks: %-10lu\n",
+ type_names[i],
+ zone->alloc_count[i],
+ zone->reserve_count[i],
+ zone->fallback_count[i]);
+ alloc_count[i] += zone->alloc_count[i];
+ reserve_count[i] += zone->reserve_count[i];
+ fallback_count[i] += zone->fallback_count[i];
+ }
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+
+ /* Show bean counters */
+ seq_printf(m, "\nGlobal beancounters\n");
+ seq_printf(m, "Partial steal: %lu\n", kernnorclm_partial_steal);
+ seq_printf(m, "Full steal: %lu\n", kernnorclm_full_steal);
+
+ for (i=0; i< RCLM_TYPES; i++) {
+ seq_printf(m, "%-10s Allocs: %-10lu Reserve: %-10lu Fallbacks: %-10lu\n",
+ type_names[i],
+ alloc_count[i],
+ reserve_count[i],
+ fallback_count[i]);
+ }
+#endif /* CONFIG_ALLOCSTATS */
return 0;
}
* Re: [Lhms-devel] [PATCH 8/8] Fragmentation Avoidance V17: 008_stats
2005-10-11 15:13 ` [PATCH 8/8] Fragmentation Avoidance V17: 008_stats Mel Gorman
@ 2005-10-12 11:57 ` Dave Hansen
2005-10-12 12:19 ` Mel Gorman
0 siblings, 1 reply; 27+ messages in thread
From: Dave Hansen @ 2005-10-12 11:57 UTC (permalink / raw)
To: Mel Gorman
Cc: Andrew Morton, jschopp, kravetz, Linux Kernel Mailing List,
linux-mm, lhms
On Tue, 2005-10-11 at 16:13 +0100, Mel Gorman wrote:
> +#ifdef CONFIG_ALLOCSTAT
> + memset((unsigned long *)zone->fallback_count, 0,
> + sizeof(zone->fallback_count));
> + memset((unsigned long *)zone->alloc_count, 0,
> + sizeof(zone->alloc_count));
> + memset((unsigned long *)zone->alloc_count, 0,
> + sizeof(zone->alloc_count));
> + zone->kernnorclm_partial_steal=0;
> + zone->kernnorclm_full_steal=0;
> + zone->reserve_count[RCLM_NORCLM] =
> + realsize >> (MAX_ORDER-1);
> +#endif
The struct zone is part of the pgdat which is zeroed at boot-time on all
architectures and configuration that I have ever audited. Re-zeroing
parts of it here is unnecessary.
BTW, that '=0' with no spaces is anti-CodingStyle.
-- Dave
* Re: [Lhms-devel] [PATCH 8/8] Fragmentation Avoidance V17: 008_stats
2005-10-12 11:57 ` [Lhms-devel] " Dave Hansen
@ 2005-10-12 12:19 ` Mel Gorman
0 siblings, 0 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-12 12:19 UTC (permalink / raw)
To: Dave Hansen
Cc: Andrew Morton, jschopp, kravetz, Linux Kernel Mailing List,
linux-mm, lhms
On Wed, 12 Oct 2005, Dave Hansen wrote:
> On Tue, 2005-10-11 at 16:13 +0100, Mel Gorman wrote:
> > +#ifdef CONFIG_ALLOCSTAT
> > + memset((unsigned long *)zone->fallback_count, 0,
> > + sizeof(zone->fallback_count));
> > + memset((unsigned long *)zone->alloc_count, 0,
> > + sizeof(zone->alloc_count));
> > + memset((unsigned long *)zone->alloc_count, 0,
> > + sizeof(zone->alloc_count));
> > + zone->kernnorclm_partial_steal=0;
> > + zone->kernnorclm_full_steal=0;
> > + zone->reserve_count[RCLM_NORCLM] =
> > + realsize >> (MAX_ORDER-1);
> > +#endif
>
> The struct zone is part of the pgdat which is zeroed at boot-time on all
> architectures and configuration that I have ever audited. Re-zeroing
> parts of it here is unnecessary.
>
> BTW, that '=0' with no spaces is anti-CodingStyle.
>
Blast, true. However, the whole block of code can be simply removed which
I prefer. I didn't like the #ifdef in the middle of the function.
--
Mel Gorman
Part-time Phd Student Java Applications Developer
University of Limerick IBM Dublin Software Lab
* Re: [PATCH 0/8] Fragmentation Avoidance V17
2005-10-11 15:12 [PATCH 0/8] Fragmentation Avoidance V17 Mel Gorman
` (7 preceding siblings ...)
2005-10-11 15:13 ` [PATCH 8/8] Fragmentation Avoidance V17: 008_stats Mel Gorman
@ 2005-10-16 2:52 ` Paul Jackson
2005-10-16 11:59 ` Mel Gorman
8 siblings, 1 reply; 27+ messages in thread
From: Paul Jackson @ 2005-10-16 2:52 UTC (permalink / raw)
To: Mel Gorman; +Cc: akpm, jschopp, kravetz, linux-kernel, linux-mm, lhms-devel
Mel wrote:
> +#define __GFP_USER 0x80000u /* User and other easily reclaimed pages */
> +#define __GFP_KERNRCLM 0x100000u /* Kernel page that is reclaimable */
Sorry, but that __GFP_USER name is still sticking in my craw.
I won't try to reopen my quest to get it named __GFP_REALLY_REALLY_EASY_RCLM
or whatever it was, but instead will venture on a new quest.
Can we get the 'RCLM' in there? Especially since this term appears
naked in such code as:
> - page = alloc_page(GFP_HIGHUSER);
> + page = alloc_page(GFP_HIGHUSER|__GFP_USER);
where it is not at all obvious to the reader of this file (fs/exec.c)
that the __GFP_USER term is commenting on the reclaim behaviour of
the page to be allocated.
I'd be happier with:
> +#define __GFP_USERRCLM 0x80000u /* User and other easily reclaimed pages */
> +#define __GFP_KERNRCLM 0x100000u /* Kernel page that is reclaimable */
and:
> - page = alloc_page(GFP_HIGHUSER);
> + page = alloc_page(GFP_HIGHUSER|__GFP_USERRCLM);
Also, the bold asymmetry of these two #defines seems to be without motivation:
one with the 'RCLM', and the other with '    ' (four spaces) in its place.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
* Re: [PATCH 0/8] Fragmentation Avoidance V17
2005-10-16 2:52 ` [PATCH 0/8] Fragmentation Avoidance V17 Paul Jackson
@ 2005-10-16 11:59 ` Mel Gorman
2005-10-16 17:53 ` Paul Jackson
0 siblings, 1 reply; 27+ messages in thread
From: Mel Gorman @ 2005-10-16 11:59 UTC (permalink / raw)
To: Paul Jackson
Cc: akpm, jschopp, kravetz, Linux Kernel Mailing List,
Linux Memory Management List, lhms-devel
On Sat, 15 Oct 2005, Paul Jackson wrote:
> Mel wrote:
> > +#define __GFP_USER 0x80000u /* User and other easily reclaimed pages */
> > +#define __GFP_KERNRCLM 0x100000u /* Kernel page that is reclaimable */
>
> Sorry, but that __GFP_USER name is still sticking in my craw.
>
> I won't try to reopen my quest to get it named __GFP_REALLY_REALLY_EASY_RCLM
> or whatever it was, but instead will venture on a new quest.
>
> Can we get the 'RCLM' in there? Especially since this term appears
> naked in such code as:
>
__GFP_USERRCLM is the original name, and the motivation for changing it to
__GFP_USER no longer exists. However, __GFP_EASYRCLM would also make sense,
as it flags pages that are (surprise surprise) easily reclaimed. Would
__GFP_EASYRCLM be the best choice? __GFP_USERRCLM may imply to some
readers that the page is somehow reclaimed by the user. The flags would then be:
__GFP_EASYRCLM - Allocations for pages that are easily reclaimed, such as
userspace and buffer pages
__GFP_KERNRCLM - Allocations for pages that may be reclaimed by the kernel,
such as caches
No flag - Page cannot be easily reclaimed by any mechanism.
I would be happy with __GFP_USERRCLM but __GFP_EASYRCLM may be more
obvious?
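As a rough sketch of how the allocator would act on that three-way split, the
helper below is purely illustrative (the function name and the RCLM_EASY and
RCLM_KERN spellings are not lifted from the patches; only RCLM_NORCLM and the
two __GFP bits appear in the posted code):

/* Illustrative only: classify an allocation by its reclaim hints. */
static inline int rclm_type(unsigned int gfp_flags)
{
	if (gfp_flags & __GFP_EASYRCLM)		/* userspace and buffer pages */
		return RCLM_EASY;
	if (gfp_flags & __GFP_KERNRCLM)		/* shrinkable kernel caches */
		return RCLM_KERN;
	return RCLM_NORCLM;			/* not easily reclaimed */
}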
> > - page = alloc_page(GFP_HIGHUSER);
> > + page = alloc_page(GFP_HIGHUSER|__GFP_USER);
>
> where it is not at all obvious to the reader of this file (fs/exec.c)
> that the __GFP_USER term is commenting on the reclaim behaviour of
> the page to be allocated.
>
> I'd be happier with:
>
> > +#define __GFP_USERRCLM 0x80000u /* User and other easily reclaimed pages */
> > +#define __GFP_KERNRCLM 0x100000u /* Kernel page that is reclaimable */
>
> and:
>
> > - page = alloc_page(GFP_HIGHUSER);
> > + page = alloc_page(GFP_HIGHUSER|__GFP_USERRCLM);
>
> Also, the bold asymmetry of these two #defines seems to be without motivation:
> one with the 'RCLM', and the other with '    ' (four spaces) in its place.
>
>
--
Mel Gorman
Part-time PhD Student, University of Limerick
Java Applications Developer, IBM Dublin Software Lab
* Re: [PATCH 0/8] Fragmentation Avoidance V17
2005-10-16 11:59 ` Mel Gorman
@ 2005-10-16 17:53 ` Paul Jackson
2005-10-16 18:03 ` Mel Gorman
0 siblings, 1 reply; 27+ messages in thread
From: Paul Jackson @ 2005-10-16 17:53 UTC (permalink / raw)
To: Mel Gorman; +Cc: akpm, jschopp, kravetz, linux-kernel, linux-mm, lhms-devel
Mel wrote:
> I would be happy with __GFP_USERRCLM but __GFP_EASYRCLM may be more
> obvious?
I would be delighted with either one. Yes, __GFP_EASYRCLM is more obvious.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
* Re: [PATCH 0/8] Fragmentation Avoidance V17
2005-10-16 17:53 ` Paul Jackson
@ 2005-10-16 18:03 ` Mel Gorman
0 siblings, 0 replies; 27+ messages in thread
From: Mel Gorman @ 2005-10-16 18:03 UTC (permalink / raw)
To: Paul Jackson
Cc: akpm, jschopp, kravetz, Linux Kernel Mailing List,
Linux Memory Management List, lhms-devel
On Sun, 16 Oct 2005, Paul Jackson wrote:
> Mel wrote:
> > I would be happy with __GFP_USERRCLM but __GFP_EASYRCLM may be more
> > obvious?
>
> I would be delighted with either one. Yes, __GFP_EASYRCLM is more obvious.
>
__GFP_EASYRCLM it is then, unless someone has an objection. For
consistency, RCLM_USER will change to RCLM_EASY as well; changing one and
not the other makes no sense to me.
>
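So the end state would presumably look something like the following sketch
(bit values unchanged from V17; the call site is the fs/exec.c example
discussed earlier):

#define __GFP_EASYRCLM	0x80000u  /* User and other easily reclaimed pages */
#define __GFP_KERNRCLM	0x100000u /* Kernel page that is reclaimable */

	/* ...and call sites such as the one in fs/exec.c become: */
	page = alloc_page(GFP_HIGHUSER | __GFP_EASYRCLM);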
--
Mel Gorman
Part-time PhD Student, University of Limerick
Java Applications Developer, IBM Dublin Software Lab