From mboxrd@z Thu Jan 1 00:00:00 1970
Date: Sat, 15 Jan 2005 19:45:41 +0000 (GMT)
From: Mel Gorman
Subject: Re: [PATCH] 1/2 Reducing fragmentation through better allocation
In-Reply-To:
Message-ID:
References:
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII
Sender: owner-linux-mm@kvack.org
Return-Path:
To: Linux Memory Management List
Cc: Linux Kernel Mailing List
List-ID:

Changelog since V2
o Do not interfere with the "min" decay
o Update __GFP_BITS_SHIFT properly. The old value broke fsync and probably
  a lot more besides

This patch divides allocations into three different types:

UserReclaimable - These are userspace pages that are easily reclaimable.
  Right now, I'm putting all allocations of GFP_USER and GFP_HIGHUSER, as
  well as disk-buffer pages, into this category. These pages are trivially
  reclaimed by writing the page out to swap or syncing with backing storage.

KernelReclaimable - These are pages allocated by the kernel that are easily
  reclaimed. This is stuff like inode caches, dcache, buffer_heads etc.
  These types of pages could potentially be reclaimed by dumping the caches
  and reaping the slabs (drastic, but you get the idea).

KernelNonReclaimable - These are pages that are allocated by the kernel that
  are not trivially reclaimed. For example, the memory allocated for a
  loaded module would be in this category. By default, allocations are
  considered to be of this type.

Instead of having one global MAX_ORDER-sized array of free lists, there are
three, one for each type of allocation. Finally, there is a list of pages of
size 2^MAX_ORDER which is a global pool of the largest pages the kernel
deals with.

Once a 2^MAX_ORDER block of pages is split for a type of allocation, it is
added to the free lists for that type, in effect reserving it. Hence, over
time, pages of the different types can be clustered together. This means
that if we wanted 2^MAX_ORDER pages, we could linearly scan a block of pages
allocated for UserReclaimable and page each of them out.

Fallback is used when there are no 2^MAX_ORDER pages available and there are
no free pages of the desired type. The fallback lists were chosen in a way
that keeps the most easily reclaimable pages together.
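[Editor's note: the sketch below is illustration only and is not part of the
patch. It is a standalone userspace C program that mimics the bookkeeping
described above: mapping a gfp mask to one of the three allocation types,
locating a page's two usemap bits, and looking up the fallback order. The
fallback table and flag values mirror the patch; MAX_ORDER and the page
index are arbitrary example values.]

	/* Illustration only -- not part of the patch. */
	#include <stdio.h>

	#define __GFP_KERNRCLM	0x10000
	#define __GFP_USERRCLM	0x20000

	#define MAX_ORDER	 11	/* example value */
	#define ALLOC_KERNNORCLM 0
	#define ALLOC_KERNRCLM	 1
	#define ALLOC_USERRCLM	 2

	/* Same fallback order as fallback_allocs[] in the patch */
	static const int fallback_allocs[3][3] = {
		{ ALLOC_KERNNORCLM, ALLOC_KERNRCLM,   ALLOC_USERRCLM },
		{ ALLOC_KERNRCLM,   ALLOC_KERNNORCLM, ALLOC_USERRCLM },
		{ ALLOC_USERRCLM,   ALLOC_KERNNORCLM, ALLOC_KERNRCLM }
	};

	/* Type selection as done at the top of __rmqueue() */
	static int alloc_type(unsigned int gfp_flags)
	{
		if (gfp_flags & __GFP_USERRCLM)
			return ALLOC_USERRCLM;
		if (gfp_flags & __GFP_KERNRCLM)
			return ALLOC_KERNRCLM;
		return ALLOC_KERNNORCLM;	/* default: hardest to reclaim */
	}

	int main(void)
	{
		unsigned int gfp = __GFP_USERRCLM;	/* e.g. part of GFP_USER */
		int type = alloc_type(gfp);
		unsigned long page_idx = 123456;	/* offset into the zone mem_map */

		/* Each 2^MAX_ORDER block owns two bits in the usemap */
		unsigned long bitidx = (page_idx >> MAX_ORDER) * 2;

		printf("type %d, usemap bits %lu and %lu, next fallback %d\n",
		       type, bitidx, bitidx + 1, fallback_allocs[type][1]);
		return 0;
	}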
Signed-off-by: Mel Gorman

diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/buffer.c linux-2.6.11-rc1-mbuddy/fs/buffer.c
--- linux-2.6.11-rc1-clean/fs/buffer.c	2005-01-12 04:01:23.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/buffer.c	2005-01-13 10:56:30.000000000 +0000
@@ -1134,7 +1134,8 @@ grow_dev_page(struct block_device *bdev,
 	struct page *page;
 	struct buffer_head *bh;
 
-	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	page = find_or_create_page(inode->i_mapping, index,
+				GFP_NOFS | __GFP_USERRCLM);
 	if (!page)
 		return NULL;
 
@@ -2997,7 +2998,8 @@ static void recalc_bh_state(void)
 
 struct buffer_head *alloc_buffer_head(int gfp_flags)
 {
-	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+	struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+						gfp_flags|__GFP_KERNRCLM);
 	if (ret) {
 		preempt_disable();
 		__get_cpu_var(bh_accounting).nr++;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/dcache.c linux-2.6.11-rc1-mbuddy/fs/dcache.c
--- linux-2.6.11-rc1-clean/fs/dcache.c	2005-01-12 04:00:09.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/dcache.c	2005-01-13 10:56:30.000000000 +0000
@@ -715,7 +715,8 @@ struct dentry *d_alloc(struct dentry * p
 	struct dentry *dentry;
 	char *dname;
 
-	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+	dentry = kmem_cache_alloc(dentry_cache,
+				GFP_KERNEL|__GFP_KERNRCLM);
 	if (!dentry)
 		return NULL;
 
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ext2/super.c linux-2.6.11-rc1-mbuddy/fs/ext2/super.c
--- linux-2.6.11-rc1-clean/fs/ext2/super.c	2005-01-12 04:01:24.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ext2/super.c	2005-01-13 10:56:30.000000000 +0000
@@ -137,7 +137,7 @@ static kmem_cache_t * ext2_inode_cachep;
 static struct inode *ext2_alloc_inode(struct super_block *sb)
 {
 	struct ext2_inode_info *ei;
-	ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+	ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL|__GFP_KERNRCLM);
 	if (!ei)
 		return NULL;
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ext3/super.c linux-2.6.11-rc1-mbuddy/fs/ext3/super.c
--- linux-2.6.11-rc1-clean/fs/ext3/super.c	2005-01-12 04:02:11.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ext3/super.c	2005-01-13 10:56:30.000000000 +0000
@@ -434,7 +434,7 @@ static struct inode *ext3_alloc_inode(st
 {
 	struct ext3_inode_info *ei;
 
-	ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+	ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
 	if (!ei)
 		return NULL;
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/fs/ntfs/inode.c linux-2.6.11-rc1-mbuddy/fs/ntfs/inode.c
--- linux-2.6.11-rc1-clean/fs/ntfs/inode.c	2005-01-12 04:01:45.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/fs/ntfs/inode.c	2005-01-13 10:56:30.000000000 +0000
@@ -318,7 +318,7 @@ struct inode *ntfs_alloc_big_inode(struc
 	ntfs_debug("Entering.");
 	ni = (ntfs_inode *)kmem_cache_alloc(ntfs_big_inode_cache,
-			SLAB_NOFS);
+			SLAB_NOFS|__GFP_KERNRCLM);
 	if (likely(ni != NULL)) {
 		ni->state = 0;
 		return VFS_I(ni);
@@ -343,7 +343,8 @@ static inline ntfs_inode *ntfs_alloc_ext
 	ntfs_inode *ni;
 
 	ntfs_debug("Entering.");
-	ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+	ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache,
+					SLAB_NOFS|__GFP_KERNRCLM);
 	if (likely(ni != NULL)) {
 		ni->state = 0;
 		return ni;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/include/linux/gfp.h linux-2.6.11-rc1-mbuddy/include/linux/gfp.h
--- linux-2.6.11-rc1-clean/include/linux/gfp.h	2005-01-12 04:00:35.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/include/linux/gfp.h	2005-01-15 18:16:47.000000000 +0000
@@ -38,21 +38,24 @@ struct vm_area_struct;
 #define __GFP_NO_GROW	0x2000	/* Slab internal usage */
 #define __GFP_COMP	0x4000	/* Add compound page metadata */
 #define __GFP_ZERO	0x8000	/* Return zeroed page on success */
+#define __GFP_KERNRCLM	0x10000	/* Kernel page that is easily reclaimable */
+#define __GFP_USERRCLM	0x20000	/* Easily reclaimable userspace page */
 
-#define __GFP_BITS_SHIFT 16	/* Room for 16 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 18	/* Room for 18 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
 
 /* if you forget to add the bitmask here kernel will crash, period */
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
-			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
+			__GFP_USERRCLM|__GFP_KERNRCLM)
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
 #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
 #define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_USERRCLM)
+#define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | __GFP_USERRCLM)
 
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/include/linux/mmzone.h linux-2.6.11-rc1-mbuddy/include/linux/mmzone.h
--- linux-2.6.11-rc1-clean/include/linux/mmzone.h	2005-01-12 04:01:17.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/include/linux/mmzone.h	2005-01-13 14:24:27.000000000 +0000
@@ -19,6 +19,10 @@
 #else
 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
 #endif
+#define ALLOC_TYPES	 3
+#define ALLOC_KERNNORCLM 0
+#define ALLOC_KERNRCLM	 1
+#define ALLOC_USERRCLM	 2
 
 struct free_area {
 	struct list_head	free_list;
@@ -131,8 +135,37 @@ struct zone {
 	 * free areas of different sizes
 	 */
 	spinlock_t		lock;
-	struct free_area	free_area[MAX_ORDER];
+
+	/*
+	 * There are ALLOC_TYPES sets of MAX_ORDER free lists. Once a
+	 * MAX_ORDER block of pages has been split for an allocation type,
+	 * the whole block is reserved for that type of allocation. The
+	 * types are User Reclaimable, Kernel Reclaimable and Kernel
+	 * Non-reclaimable. The objective is to reduce fragmentation
+	 * overall
+	 */
+	struct free_area	free_area_lists[ALLOC_TYPES][MAX_ORDER];
+
+	/*
+	 * This is a list of page blocks of 2^MAX_ORDER. Once one of
+	 * these is split, the buddy is added to the appropriate
+	 * free_area_lists. When the buddies are later merged, they
+	 * are placed back here
+	 */
+	struct free_area	free_area_global;
+
+	/*
+	 * This map tracks what each 2^MAX_ORDER sized block has been used for.
+	 * Each 2^MAX_ORDER block of pages has 2 bits in this map to remember
+	 * what the block is for. When a page is freed, its index within this
+	 * bitmap is calculated using (address >> MAX_ORDER) * 2. This means
+	 * that pages will always be freed into the correct list in
+	 * free_area_lists
+	 *
+	 * The bits are set when a 2^MAX_ORDER block of pages is split
+	 */
+
+	unsigned long		*free_area_usemap;
 
 	ZONE_PADDING(_pad1_)
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.11-rc1-clean/mm/page_alloc.c linux-2.6.11-rc1-mbuddy/mm/page_alloc.c
--- linux-2.6.11-rc1-clean/mm/page_alloc.c	2005-01-12 04:00:02.000000000 +0000
+++ linux-2.6.11-rc1-mbuddy/mm/page_alloc.c	2005-01-15 18:10:54.000000000 +0000
@@ -46,9 +46,30 @@ unsigned long totalhigh_pages;
 long nr_swap_pages;
 int sysctl_lower_zone_protection = 0;
 
+/* Bean counters for the per-type buddy allocator */
+int fallback_count[ALLOC_TYPES] = { 0, 0, 0};
+int global_steal=0;
+int global_refill=0;
+int kernnorclm_count=0;
+int kernrclm_count=0;
+int userrclm_count=0;
+
 EXPORT_SYMBOL(totalram_pages);
 EXPORT_SYMBOL(nr_swap_pages);
 
+/**
+ * The allocator tries to put allocations of the same type in the
+ * same 2^MAX_ORDER blocks of pages. When memory is low, this may
+ * not be possible so this describes what order they should fall
+ * back on
+ */
+int fallback_allocs[ALLOC_TYPES][ALLOC_TYPES] = {
+	{ ALLOC_KERNNORCLM, ALLOC_KERNRCLM,   ALLOC_USERRCLM },
+	{ ALLOC_KERNRCLM,   ALLOC_KERNNORCLM, ALLOC_USERRCLM },
+	{ ALLOC_USERRCLM,   ALLOC_KERNNORCLM, ALLOC_KERNRCLM }
+};
+
+
 /*
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
@@ -57,6 +78,7 @@ struct zone *zone_table[1 << (ZONES_SHIF
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *type_names[ALLOC_TYPES] = { "KernNoRclm", "KernRclm", "UserRclm"};
 int min_free_kbytes = 1024;
 
 unsigned long __initdata nr_kernel_pages;
@@ -103,6 +125,48 @@ static void bad_page(const char *functio
 	tainted |= TAINT_BAD_PAGE;
 }
 
+/*
+ * Return what type of allocation the 2^MAX_ORDER block of pages
+ * containing the given page is being used for
+ */
+static int get_pageblock_type(struct page *page) {
+	struct zone *zone = page_zone(page);
+	int pageidx = (page - zone->zone_mem_map) >> MAX_ORDER;
+	int bitidx = pageidx * 2;
+
+	/* Bit 1 will be set if the block is kernel reclaimable */
+	if (test_bit(bitidx,zone->free_area_usemap)) return ALLOC_KERNRCLM;
+
+	/* Bit 2 will be set if the block is user reclaimable */
+	if (test_bit(bitidx+1, zone->free_area_usemap)) return ALLOC_USERRCLM;
+
+	return ALLOC_KERNNORCLM;
+}
+
+static void set_pageblock_type(struct page *page, int type) {
+	int bit1, bit2;
+	struct zone *zone = page_zone(page);
+	int pageidx = (page - zone->zone_mem_map) >> MAX_ORDER;
+	int bitidx = pageidx * 2;
+	bit1 = bit2 = 0;
+
+	if (type == ALLOC_KERNRCLM) {
+		set_bit(bitidx, zone->free_area_usemap);
+		clear_bit(bitidx+1, zone->free_area_usemap);
+		return;
+	}
+
+	if (type == ALLOC_USERRCLM) {
+		clear_bit(bitidx, zone->free_area_usemap);
+		set_bit(bitidx+1, zone->free_area_usemap);
+		return;
+	}
+
+	clear_bit(bitidx, zone->free_area_usemap);
+	clear_bit(bitidx+1, zone->free_area_usemap);
+
+}
+
 #ifndef CONFIG_HUGETLB_PAGE
 #define prep_compound_page(page, order) do { } while (0)
 #define destroy_compound_page(page, order) do { } while (0)
@@ -231,6 +295,7 @@ static inline void __free_pages_bulk (st
 	unsigned long page_idx;
 	struct page *coalesced;
 	int order_size = 1 << order;
+	struct free_area *area;
 
 	if (unlikely(order))
 		destroy_compound_page(page, order);
@@ -240,9 +305,12 @@ static inline void __free_pages_bulk (st
 	BUG_ON(page_idx & (order_size - 1));
 	BUG_ON(bad_range(zone, page));
 
+	/* Select the area to use for freeing based on the type */
+	struct free_area *freelist =
+		zone->free_area_lists[get_pageblock_type(page)];
+
 	zone->free_pages += order_size;
 	while (order < MAX_ORDER-1) {
-		struct free_area *area;
 		struct page *buddy;
 		int buddy_idx;
 
@@ -254,16 +322,29 @@ static inline void __free_pages_bulk (st
 			break;
 		/* Move the buddy up one level. */
 		list_del(&buddy->lru);
-		area = zone->free_area + order;
+		area = freelist + order;
 		area->nr_free--;
 		rmv_page_order(buddy);
 		page_idx &= buddy_idx;
 		order++;
 	}
+
+	/*
+	 * If a MAX_ORDER block of pages is being freed, it is
+	 * no longer reserved for a particular type of allocation
+	 * so put it in the global list
+	 */
+	if (order >= MAX_ORDER-1) {
+		area = &(zone->free_area_global);
+		global_refill++;
+	} else {
+		area = freelist + order;
+	}
+
 	coalesced = base + page_idx;
 	set_page_order(coalesced, order);
-	list_add(&coalesced->lru, &zone->free_area[order].free_list);
-	zone->free_area[order].nr_free++;
+	list_add(&coalesced->lru, &area->free_list);
+	area->nr_free++;
 }
 
 static inline void free_pages_check(const char *function, struct page *page)
@@ -310,6 +391,7 @@ free_pages_bulk(struct zone *zone, int c
 		zone->pages_scanned = 0;
 	while (!list_empty(list) && count--) {
 		page = list_entry(list->prev, struct page, lru);
+		/* have to delete it as __free_pages_bulk list manipulates */
 		list_del(&page->lru);
 		__free_pages_bulk(page, base, zone, order);
@@ -420,14 +502,36 @@ static void prep_new_page(struct page *p
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static struct page *__rmqueue(struct zone *zone, unsigned int order, int flags)
 {
 	struct free_area * area;
 	unsigned int current_order;
 	struct page *page;
+	int global_split=0;
+
+	/* Select area to use based on gfp_flags */
+	int alloctype;
+	int retry_count=0;
+	if (flags & __GFP_USERRCLM) {
+		alloctype = ALLOC_USERRCLM;
+		userrclm_count++;
+	}
+	else if (flags & __GFP_KERNRCLM) {
+		alloctype = ALLOC_KERNRCLM;
+		kernrclm_count++;
+	} else {
+		alloctype = ALLOC_KERNNORCLM;
+		kernnorclm_count++;
+	}
+
+	/* Ok, pick the fallback order based on the type */
+	int *fallback_list = fallback_allocs[alloctype];
+
+retry:
 	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
-		area = zone->free_area + current_order;
+		alloctype = fallback_list[retry_count];
+		area = zone->free_area_lists[alloctype] + current_order;
+
 		if (list_empty(&area->free_list))
 			continue;
 
@@ -439,6 +543,34 @@ static struct page *__rmqueue(struct zon
 		return expand(zone, page, order, current_order, area);
 	}
 
+	/* Take from the global pool if this is the first attempt */
+	if (!global_split && !list_empty(&(zone->free_area_global.free_list))){
+		/*
+		 * Remove a MAX_ORDER block from the global pool and add
+		 * it to the list of desired alloc_type
+		 */
+		page = list_entry(zone->free_area_global.free_list.next,
+				struct page, lru);
+		list_del(&page->lru);
+		list_add(&page->lru,
+			&(zone->free_area_lists[alloctype][MAX_ORDER-1].free_list));
+		global_steal++;
+		global_split=1;
+
+		/* Mark this block of pages as for use with this alloc type */
+		set_pageblock_type(page, alloctype);
+
+		goto retry;
+	}
+
+	/*
+	 * Here, the alloc type lists have been depleted as well as the global
+	 * pool, so fallback
+	 */
+	retry_count++;
+	fallback_count[alloctype]++;
+	if (retry_count != ALLOC_TYPES) goto retry;
+
 	return NULL;
 }
 
@@ -448,7 +580,8 @@ static struct page *__rmqueue(struct zon
 * Returns the number of new pages which were placed at *list.
 */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
-			unsigned long count, struct list_head *list)
+			unsigned long count, struct list_head *list,
+			int gfp_flags)
 {
 	unsigned long flags;
 	int i;
@@ -457,7 +590,7 @@ static int rmqueue_bulk(struct zone *zon
 	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
-		page = __rmqueue(zone, order);
+		page = __rmqueue(zone, order, gfp_flags);
 		if (page == NULL)
 			break;
 		allocated++;
@@ -493,7 +626,7 @@ static void __drain_pages(unsigned int c
 void mark_free_pages(struct zone *zone)
 {
 	unsigned long zone_pfn, flags;
-	int order;
+	int order, type;
 	struct list_head *curr;
 
 	if (!zone->spanned_pages)
@@ -503,14 +636,17 @@ void mark_free_pages(struct zone *zone)
 	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
 		ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
 
-	for (order = MAX_ORDER - 1; order >= 0; --order)
-		list_for_each(curr, &zone->free_area[order].free_list) {
-			unsigned long start_pfn, i;
+	for (type=0; type < ALLOC_TYPES; type++) {
 
-			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
+	for (order = MAX_ORDER - 1; order >= 0; --order)
+		list_for_each(curr, &zone->free_area_lists[type][order].free_list) {
+			unsigned long start_pfn, i;
+
+			start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
 
-			for (i=0; i < (1<<order); i++)
-				SetPageNosaveFree(pfn_to_page(start_pfn+i));
+			for (i=0; i < (1<<order); i++)
+				SetPageNosaveFree(pfn_to_page(start_pfn+i));
 		}
+	}
 
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
@@ -612,14 +748,15 @@ buffered_rmqueue(struct zone *zone, int
 	struct page *page = NULL;
 	int cold = !!(gfp_flags & __GFP_COLD);
 
-	if (order == 0) {
+	if (order == 0 && (gfp_flags & __GFP_USERRCLM)) {
 		struct per_cpu_pages *pcp;
 
 		pcp = &zone->pageset[get_cpu()].pcp[cold];
 		local_irq_save(flags);
 		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
-						pcp->batch, &pcp->list);
+						pcp->batch, &pcp->list,
+						gfp_flags);
 		if (pcp->count) {
 			page = list_entry(pcp->list.next, struct page, lru);
 			list_del(&page->lru);
@@ -631,7 +768,7 @@ buffered_rmqueue(struct zone *zone, int
 
 	if (page == NULL) {
 		spin_lock_irqsave(&zone->lock, flags);
-		page = __rmqueue(zone, order);
+		page = __rmqueue(zone, order, gfp_flags);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 
@@ -669,7 +806,11 @@ int zone_watermark_ok(struct zone *z, in
 		return 0;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
-		free_pages -= z->free_area[o].nr_free << o;
+		free_pages -= (
+			z->free_area_lists[ALLOC_KERNNORCLM][o].nr_free +
+			z->free_area_lists[ALLOC_KERNRCLM][o].nr_free +
+			z->free_area_lists[ALLOC_USERRCLM][o].nr_free
+			) << o;
 
 		/* Require fewer higher order pages to be free */
 		min >>= 1;
@@ -1124,6 +1265,7 @@ void show_free_areas(void)
 	unsigned long inactive;
 	unsigned long free;
 	struct zone *zone;
+	int type;
 
 	for_each_zone(zone) {
 		show_node(zone);
@@ -1216,8 +1358,10 @@ void show_free_areas(void)
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < MAX_ORDER; order++) {
-			nr = zone->free_area[order].nr_free;
-			total += nr << order;
+			for (type=0; type < ALLOC_TYPES; type++) {
+				nr = zone->free_area_lists[type][order].nr_free;
+				total += nr << order;
+			}
 
 			printk("%lu*%lukB ", nr, K(1UL) << order);
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -1515,10 +1659,22 @@ void zone_init_free_lists(struct pglist_
 		unsigned long size)
 {
 	int order;
-	for (order = 0; order < MAX_ORDER ; order++) {
-		INIT_LIST_HEAD(&zone->free_area[order].free_list);
-		zone->free_area[order].nr_free = 0;
+	int type;
+	struct free_area *area;
+
+	/* Initialise the three size-ordered lists of free_areas */
+	for (type=0; type < ALLOC_TYPES; type++) {
+		for (order = 0; order < MAX_ORDER; order++) {
+			area = zone->free_area_lists[type];
+
+			INIT_LIST_HEAD(&area[order].free_list);
+			area[order].nr_free = 0;
+		}
 	}
+
+	/* Initialise the global pool of 2^size pages */
+	INIT_LIST_HEAD(&zone->free_area_global.free_list);
+	zone->free_area_global.nr_free=0;
 }
 
 #ifndef __HAVE_ARCH_MEMMAP_INIT
@@ -1539,6 +1695,7 @@ static void __init free_area_init_core(s
 	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
 	int cpu, nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
+	unsigned long usemapsize;
 
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
@@ -1637,6 +1794,22 @@ static void __init free_area_init_core(s
 		zone_start_pfn += size;
 
 		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+		/* Calculate size of required bitmap */
+		/*  - Number of MAX_ORDER blocks in the zone */
+		usemapsize = (size + (1 << MAX_ORDER)) >> MAX_ORDER;
+
+		/*  - Two bits to record what type of block it is */
+		usemapsize = (usemapsize * 2 + 8) / 8;
+
+		zone->free_area_usemap =
+			(unsigned long *)alloc_bootmem_node(pgdat, usemapsize);
+
+		memset((unsigned long *)zone->free_area_usemap,
+			ALLOC_KERNNORCLM, usemapsize);
+
+		printk(KERN_DEBUG "  %s zone: %lu pages, %lu real pages, usemap size:%lu\n",
+			zone_names[j], size, realsize, usemapsize);
 	}
 }
 
@@ -1714,19 +1887,90 @@ static int frag_show(struct seq_file *m,
 	struct zone *zone;
 	struct zone *node_zones = pgdat->node_zones;
 	unsigned long flags;
-	int order;
+	int order, type;
+	struct list_head *elem;
 
-	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!zone->present_pages)
-			continue;
-		spin_lock_irqsave(&zone->lock, flags);
-		seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
-		for (order = 0; order < MAX_ORDER; ++order)
-			seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
-		spin_unlock_irqrestore(&zone->lock, flags);
-		seq_putc(m, '\n');
-	}
+	/* Show global fragmentation statistics */
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!zone->present_pages)
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+		unsigned long nr_bufs = 0;
+		for (order = 0; order < MAX_ORDER-1; ++order) {
+			nr_bufs = 0;
+
+			for (type=0; type < ALLOC_TYPES; type++) {
+				list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+					++nr_bufs;
+			}
+			seq_printf(m, "%6lu ", nr_bufs);
+		}
+
+		/* Scan global list */
+		nr_bufs = 0;
+		list_for_each(elem, &(zone->free_area_global.free_list))
+			++nr_bufs;
+		seq_printf(m, "%6lu ", nr_bufs);
+
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+
+	/* Show statistics for each allocation type */
+	seq_printf(m, "\nPer-allocation-type statistics");
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!zone->present_pages)
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		unsigned long nr_bufs = 0;
+		for (type=0; type < ALLOC_TYPES; type++) {
+			seq_printf(m, "\nNode %d, zone %8s, type %10s",
+					pgdat->node_id, zone->name,
+					type_names[type]);
+			struct list_head *elem;
+			for (order = 0; order < MAX_ORDER; ++order) {
+				nr_bufs = 0;
+
+				list_for_each(elem, &(zone->free_area_lists[type][order].free_list))
+					++nr_bufs;
+				seq_printf(m, "%6lu ", nr_bufs);
+			}
+		}
+
+		/* Scan global list */
+		seq_printf(m, "\n");
+		seq_printf(m, "Node %d, zone %8s, type %10s",
+				pgdat->node_id, zone->name,
+				"MAX_ORDER");
+		nr_bufs = 0;
+		list_for_each(elem, &(zone->free_area_global.free_list))
+			++nr_bufs;
+		seq_printf(m, "%6lu ", nr_bufs);
+
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+
+	/* Show bean counters */
+	seq_printf(m, "\nGlobal beancounters\n");
+	seq_printf(m, "Global steals: %d\n", global_steal);
+	seq_printf(m, "Global refills: %d\n", global_refill);
+	seq_printf(m, "KernNoRclm allocs: %d\n", kernnorclm_count);
+	seq_printf(m, "KernRclm allocs: %d\n", kernrclm_count);
+	seq_printf(m, "UserRclm allocs: %d\n", userrclm_count);
+	seq_printf(m, "%-10s Fallback count: %d\n", type_names[0],
+			fallback_count[0]);
+	seq_printf(m, "%-10s Fallback count: %d\n", type_names[1],
+			fallback_count[1]);
+	seq_printf(m, "%-10s Fallback count: %d\n", type_names[2],
+			fallback_count[2]);
+
+
 	return 0;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: aart@kvack.org