From: Mel Gorman <mel@csn.ul.ie>
To: Paul Jackson <pj@sgi.com>
Cc: jschopp@austin.ibm.com, haveblue@us.ibm.com,
Linux Memory Management List <linux-mm@kvack.org>
Subject: Re: [Fwd: [PATCH 2/4] cpusets new __GFP_HARDWALL flag]
Date: Mon, 18 Jul 2005 13:32:57 +0100 (IST) [thread overview]
Message-ID: <Pine.LNX.4.58.0507181328480.2899@skynet> (raw)
In-Reply-To: <20050714040613.10b244ee.pj@sgi.com>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 1425 bytes --]
On Thu, 14 Jul 2005, Paul Jackson wrote:
> Mel wrote:
> > Well, what would people feel is obvious?
>
> The lines that you (Mel) add that I am puzzling over ways to clarify are
> these added lines in gfp.h:
>
> +#define __GFP_KERNRCLM 0x20000u /* Kernel page that is easily reclaimable */
> +#define __GFP_USERRCLM 0x40000u /* User is a userspace user */
>
> +#define __GFP_TYPE_SHIFT 17 /* Translate RCLM flags to array index */
>
> and perhaps these added lines in mmzone.h:
>
> +/* Page allocations are divided into these types */
> +#define ALLOC_TYPES 4
> +#define ALLOC_KERNNORCLM 0
> +#define ALLOC_KERNRCLM 1
> +#define ALLOC_USERRCLM 2
> +#define ALLOC_FALLBACK 3
> +
> +/* Number of bits required to encode the type */
> +#define BITS_PER_ALLOC_TYPE 2
>
> It didn't jump out at me, first pass, that these two GFP bits
> were a 2 bit field, not 2 separate and independent bits. The name
> GFP_TYPE_SHIFT is vague. There are some redundant (interdependent)
> defines here.
>
> How about (just brainstorming here) something like the following:
>
That makes sense to me. Taking into account other threads, attached are
patches 01 and 02 from Joel's patchset with the different namings and
comments. The main changes are the renaming of __GFP_USERRCLM to
__GFP_USER to be neutral and comments explaining how the RCLM flags are
tied together.
Is this any better?
[-- Attachment #2: 01_gfp_flags --]
[-- Type: TEXT/PLAIN, Size: 5989 bytes --]
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.13-rc3-mhp1/fs/buffer.c linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/buffer.c
--- linux-2.6.13-rc3-mhp1/fs/buffer.c 2005-07-13 05:46:46.000000000 +0100
+++ linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/buffer.c 2005-07-18 11:43:11.000000000 +0100
@@ -1119,7 +1119,8 @@ grow_dev_page(struct block_device *bdev,
struct page *page;
struct buffer_head *bh;
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, index,
+ GFP_NOFS | __GFP_USER);
if (!page)
return NULL;
@@ -3044,7 +3045,8 @@ static void recalc_bh_state(void)
struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
{
- struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
+ struct buffer_head *ret = kmem_cache_alloc(bh_cachep,
+ gfp_flags|__GFP_KERNRCLM);
if (ret) {
preempt_disable();
__get_cpu_var(bh_accounting).nr++;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.13-rc3-mhp1/fs/dcache.c linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/dcache.c
--- linux-2.6.13-rc3-mhp1/fs/dcache.c 2005-07-13 05:46:46.000000000 +0100
+++ linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/dcache.c 2005-07-18 11:43:11.000000000 +0100
@@ -719,7 +719,7 @@ struct dentry *d_alloc(struct dentry * p
struct dentry *dentry;
char *dname;
- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL|__GFP_KERNRCLM);
if (!dentry)
return NULL;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.13-rc3-mhp1/fs/ext2/super.c linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/ext2/super.c
--- linux-2.6.13-rc3-mhp1/fs/ext2/super.c 2005-07-13 05:46:46.000000000 +0100
+++ linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/ext2/super.c 2005-07-18 11:43:11.000000000 +0100
@@ -138,7 +138,8 @@ static kmem_cache_t * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, SLAB_KERNEL);
+ ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep,
+ SLAB_KERNEL|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT2_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.13-rc3-mhp1/fs/ext3/super.c linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/ext3/super.c
--- linux-2.6.13-rc3-mhp1/fs/ext3/super.c 2005-07-13 05:46:46.000000000 +0100
+++ linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/ext3/super.c 2005-07-18 11:43:11.000000000 +0100
@@ -440,7 +440,7 @@ static struct inode *ext3_alloc_inode(st
{
struct ext3_inode_info *ei;
- ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS);
+ ei = kmem_cache_alloc(ext3_inode_cachep, SLAB_NOFS|__GFP_KERNRCLM);
if (!ei)
return NULL;
#ifdef CONFIG_EXT3_FS_POSIX_ACL
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.13-rc3-mhp1/fs/ntfs/inode.c linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/ntfs/inode.c
--- linux-2.6.13-rc3-mhp1/fs/ntfs/inode.c 2005-07-13 05:46:46.000000000 +0100
+++ linux-2.6.13-rc3-mhp1-01_gfp_flags/fs/ntfs/inode.c 2005-07-18 11:43:11.000000000 +0100
@@ -318,7 +318,7 @@ struct inode *ntfs_alloc_big_inode(struc
ntfs_debug("Entering.");
ni = (ntfs_inode *)kmem_cache_alloc(ntfs_big_inode_cache,
- SLAB_NOFS);
+ SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -343,7 +343,8 @@ static inline ntfs_inode *ntfs_alloc_ext
ntfs_inode *ni;
ntfs_debug("Entering.");
- ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS);
+ ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache,
+ SLAB_NOFS|__GFP_KERNRCLM);
if (likely(ni != NULL)) {
ni->state = 0;
return ni;
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.13-rc3-mhp1/include/linux/gfp.h linux-2.6.13-rc3-mhp1-01_gfp_flags/include/linux/gfp.h
--- linux-2.6.13-rc3-mhp1/include/linux/gfp.h 2005-07-13 05:46:46.000000000 +0100
+++ linux-2.6.13-rc3-mhp1-01_gfp_flags/include/linux/gfp.h 2005-07-18 11:43:11.000000000 +0100
@@ -41,21 +41,37 @@ struct vm_area_struct;
#define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */
#define __GFP_NORECLAIM 0x20000u /* No realy zone reclaim during allocation */
-#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
+/*
+ * Allocation type modifiers - Group the allocation types together if possible
+ *
+ * __GFP_USER: Allocation for a user page or a buffer page.
+ *
+ * __GFP_KERNRCLM: Kernel allocation that is either very short-lived or
+ * reclaimable like inode caches
+ *
+ * __GFP_RCLM_BITS: Sum of all the reclaimable bits.
+ */
+
+#define __GFP_USER 0x40000u /* Easily reclaimable userspace page */
+#define __GFP_KERNRCLM 0x80000u /* Kernel page that is easily reclaimable */
+#define __GFP_RCLM_BITS (__GFP_USER|__GFP_KERNRCLM)
+
+#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */
#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
/* if you forget to add the bitmask here kernel will crash, period */
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
- __GFP_NOMEMALLOC|__GFP_NORECLAIM)
+ __GFP_NOMEMALLOC|__GFP_KERNRCLM|__GFP_USER)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS)
-#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_USER)
+#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM | \
+ __GFP_USER)
/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
[-- Attachment #3: 02_more_defines --]
[-- Type: TEXT/PLAIN, Size: 5859 bytes --]
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.13-rc3-mhp1-01_gfp_flags/include/linux/mmzone.h linux-2.6.13-rc3-mhp1-02_more_defines/include/linux/mmzone.h
--- linux-2.6.13-rc3-mhp1-01_gfp_flags/include/linux/mmzone.h 2005-07-18 12:05:27.000000000 +0100
+++ linux-2.6.13-rc3-mhp1-02_more_defines/include/linux/mmzone.h 2005-07-18 12:24:33.000000000 +0100
@@ -21,6 +21,20 @@
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#endif
+/*
+ * Reduce buddy heap fragmentation by keeping pages with similar
+ * reclaimability behavior together. The two bit field __GFP_RCLM_BITS
+ * enumerates the following 4 kinds of page reclaimability:
+ */
+#define RCLM_TYPES 4
+#define RCLM_NORCLM 0
+#define RCLM_USER 1
+#define RCLM_KERN 2
+#define RCLM_FALLBACK 3
+
+#define RCLM_SHIFT 17 /* Shift __GFP_RCLM_BITS to RCLM_* values */
+#define BITS_PER_RCLM_TYPE 2
+
struct free_area {
struct list_head free_list;
unsigned long nr_free;
@@ -137,8 +151,47 @@ struct zone {
* free areas of different sizes
*/
spinlock_t lock;
- struct free_area free_area[MAX_ORDER];
+ /*
+ * free_area to be removed in later patch as it is replaced by
+ * free_area_list
+ */
+ struct free_area free_area[MAX_ORDER];
+
+#ifndef CONFIG_SPARSEMEM
+ /*
+ * The map tracks what each 2^MAX_ORDER-1 sized block is being used for.
+ * Each 2^MAX_ORDER block of pages has BITS_PER_RCLM_TYPE bits in
+ * this map to remember what the block is for. When a page is freed,
+ * its index within this bitmap is calculated in get_pageblock_type()
+ * This means that pages will always be freed into the correct list in
+ * free_area_lists
+ *
+ * The bits are set when a 2^MAX_ORDER block of pages is split
+ */
+ unsigned long *free_area_usemap;
+#endif
+
+ /*
+ * free_area_lists contains buddies of split MAX_ORDER blocks indexed
+ * by their intended allocation type, while free_area_global contains
+ * whole MAX_ORDER blocks that can be used for any allocation type.
+ */
+ struct free_area free_area_lists[RCLM_TYPES][MAX_ORDER];
+
+ /*
+ * A percentage of a zone is reserved for falling back to. Without
+ * a fallback, memory will slowly fragment over time meaning the
+ * placement policy only delays the fragmentation problem, not
+ * fixes it
+ */
+ unsigned long fallback_reserve;
+
+ /*
+ * When negative, 2^MAX_ORDER-1 sized blocks of pages will be reserved
+ * for fallbacks
+ */
+ long fallback_balance;
ZONE_PADDING(_pad1_)
@@ -230,6 +283,18 @@ struct zone {
} ____cacheline_maxaligned_in_smp;
+static inline void inc_reserve_count(struct zone* zone, int type)
+{
+ if(type == RCLM_FALLBACK)
+ zone->fallback_reserve++;
+}
+static inline void dec_reserve_count(struct zone* zone, int type)
+{
+ if(type == RCLM_FALLBACK && zone->fallback_reserve)
+ zone->fallback_reserve--;
+
+}
+
/*
* The "priority" of VM scanning is how much of the queues we will scan in one
* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -473,6 +538,9 @@ extern struct pglist_data contig_page_da
#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif
+#if ((SECTION_SIZE_BITS - MAX_ORDER) * BITS_PER_ALLOC) > 64
+#error free_area_usemap is not big enough
+#endif
struct page;
struct mem_section {
@@ -485,6 +553,7 @@ struct mem_section {
* before using it wrong.
*/
unsigned long section_mem_map;
+ u64 free_area_usemap;
};
extern struct mem_section mem_section[NR_MEM_SECTIONS];
@@ -536,6 +605,17 @@ static inline struct mem_section *__pfn_
return __nr_to_section(pfn_to_section_nr(pfn));
}
+static inline unsigned long *pfn_to_usemap(struct zone *zone, unsigned long pfn)
+{
+ return &__pfn_to_section(pfn)->free_area_usemap;
+}
+
+static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+{
+ pfn &= (PAGES_PER_SECTION-1);
+ return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
+}
+
#define pfn_to_page(pfn) \
({ \
unsigned long __pfn = (pfn); \
@@ -572,6 +652,15 @@ static inline int pfn_valid(unsigned lon
void sparse_init(void);
#else
#define sparse_init() do {} while (0)
+static inline unsigned long *pfn_to_usemap(struct zone *zone, unsigned long pfn)
+{
+ return (zone->free_area_usemap);
+}
+static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+{
+ pfn = pfn - zone->zone_start_pfn;
+ return (int)((pfn >> (MAX_ORDER-1)) * BITS_PER_RCLM_TYPE);
+}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
diff -rup -X /usr/src/patchset-0.5/bin//dontdiff linux-2.6.13-rc3-mhp1-01_gfp_flags/mm/page_alloc.c linux-2.6.13-rc3-mhp1-02_more_defines/mm/page_alloc.c
--- linux-2.6.13-rc3-mhp1-01_gfp_flags/mm/page_alloc.c 2005-07-13 05:46:46.000000000 +0100
+++ linux-2.6.13-rc3-mhp1-02_more_defines/mm/page_alloc.c 2005-07-18 12:27:09.000000000 +0100
@@ -65,6 +65,20 @@ EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
/*
+ * fallback_allocs contains the fallback types for low memory conditions
+ * where the preferred allocation type is not available.
+ */
+int fallback_allocs[RCLM_TYPES][RCLM_TYPES+1] = {
+ {RCLM_NORCLM, RCLM_FALLBACK, RCLM_KERN, RCLM_USER, -1},
+ {RCLM_KERN, RCLM_FALLBACK, RCLM_NORCLM, RCLM_USER, -1},
+ {RCLM_USER, RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, -1},
+ {RCLM_FALLBACK, RCLM_NORCLM, RCLM_KERN, RCLM_USER, -1}
+};
+static char *type_names[RCLM_TYPES] = { "Kernnel Unreclaimable",
+ "Kernel Reclaimable",
+ "User Reclaimable", "Fallback"};
+
+/*
* Used by page_zone() to look up the address of the struct zone whose
* id is encoded in the upper bits of page->flags
*/
next prev parent reply other threads:[~2005-07-18 12:32 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <1121101013.15095.19.camel@localhost>
2005-07-11 17:36 ` Joel Schopp
2005-07-11 17:49 ` Dave Hansen
2005-07-12 2:55 ` Paul Jackson
2005-07-12 5:24 ` Dave Hansen
2005-07-12 6:11 ` Paul Jackson
2005-07-12 13:05 ` Mel Gorman
2005-07-12 20:29 ` Paul Jackson
2005-07-13 11:15 ` Mel Gorman
2005-07-14 11:06 ` Paul Jackson
2005-07-18 12:32 ` Mel Gorman [this message]
2005-07-18 20:08 ` Joel Schopp
2005-07-27 8:29 ` Paul Jackson
2005-07-27 11:10 ` Mel Gorman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Pine.LNX.4.58.0507181328480.2899@skynet \
--to=mel@csn.ul.ie \
--cc=haveblue@us.ibm.com \
--cc=jschopp@austin.ibm.com \
--cc=linux-mm@kvack.org \
--cc=pj@sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox