[RFC][PATCH 1/8] Create Critical Page Pool

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Matthew Dobson <colpatch@us.ibm.com>
To: linux-kernel@vger.kernel.org
Cc: Linux Memory Management <linux-mm@kvack.org>
Subject: [RFC][PATCH 1/8] Create Critical Page Pool
Date: Fri, 18 Nov 2005 11:36:03 -0800	[thread overview]
Message-ID: <437E2D23.10109@us.ibm.com> (raw)
In-Reply-To: <437E2C69.4000708@us.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 77 bytes --]

Create the critical page pool and it's associated /proc/sys/vm/ file.

-Matt

[-- Attachment #2: critical_pool.patch --]
[-- Type: text/x-patch, Size: 10222 bytes --]

Implement a Critical Page Pool:

* Write a number of pages into /proc/sys/vm/critical_pages
* These pages will be reserved for 'critical' allocations, signified
     by the __GFP_CRITICAL flag passed to an allocation request.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>

Index: linux-2.6.15-rc1+critical_pool/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.15-rc1+critical_pool.orig/Documentation/sysctl/vm.txt	2005-11-15 13:47:14.000000000 -0800
+++ linux-2.6.15-rc1+critical_pool/Documentation/sysctl/vm.txt	2005-11-17 16:27:33.108545112 -0800
@@ -26,6 +26,7 @@ Currently, these files are in /proc/sys/
 - min_free_kbytes
 - laptop_mode
 - block_dump
+- critical_pages
 
 ==============================================================
 
@@ -102,3 +103,12 @@ This is used to force the Linux VM to ke
 of kilobytes free.  The VM uses this number to compute a pages_min
 value for each lowmem zone in the system.  Each lowmem zone gets 
 a number of reserved free pages based proportionally on its size.
+
+==============================================================
+
+critical_pages:
+
+This is used to force the Linux VM to reserve a pool of pages for
+emergency (__GFP_CRITICAL) allocations.  Allocations with this flag
+MUST succeed.
+The number written into this file is the number of pages to reserve.
Index: linux-2.6.15-rc1+critical_pool/include/linux/gfp.h
===================================================================
--- linux-2.6.15-rc1+critical_pool.orig/include/linux/gfp.h	2005-11-15 13:46:37.000000000 -0800
+++ linux-2.6.15-rc1+critical_pool/include/linux/gfp.h	2005-11-17 16:28:28.375143312 -0800
@@ -41,6 +41,7 @@ struct vm_area_struct;
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
 #define __GFP_NORECLAIM  ((__force gfp_t)0x20000u) /* No realy zone reclaim during allocation */
 #define __GFP_HARDWALL   ((__force gfp_t)0x40000u) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_CRITICAL	((__force gfp_t)0x80000u) /* Critical allocation. MUST succeed! */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -49,7 +50,8 @@ struct vm_area_struct;
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
-			__GFP_NOMEMALLOC|__GFP_NORECLAIM|__GFP_HARDWALL)
+			__GFP_NOMEMALLOC|__GFP_NORECLAIM|__GFP_HARDWALL| \
+			__GFP_CRITICAL)
 
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
@@ -58,6 +60,8 @@ struct vm_area_struct;
 #define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
 #define GFP_HIGHUSER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
 			 __GFP_HIGHMEM)
+#define GFP_ATOMIC_CRIT	(GFP_ATOMIC | __GFP_CRITICAL)
+#define GFP_KERNEL_CRIT	(GFP_KERNEL | __GFP_CRITICAL)
 
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
Index: linux-2.6.15-rc1+critical_pool/include/linux/sysctl.h
===================================================================
--- linux-2.6.15-rc1+critical_pool.orig/include/linux/sysctl.h	2005-11-15 13:46:38.000000000 -0800
+++ linux-2.6.15-rc1+critical_pool/include/linux/sysctl.h	2005-11-17 16:27:33.111544656 -0800
@@ -181,6 +181,7 @@ enum
 	VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
+	VM_CRITICAL_PAGES=30,	/* # of pages to reserve for __GFP_CRITICAL allocs */
 };
 
 
Index: linux-2.6.15-rc1+critical_pool/kernel/sysctl.c
===================================================================
--- linux-2.6.15-rc1+critical_pool.orig/kernel/sysctl.c	2005-11-15 13:47:25.000000000 -0800
+++ linux-2.6.15-rc1+critical_pool/kernel/sysctl.c	2005-11-17 16:27:33.116543896 -0800
@@ -849,6 +849,16 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
+	{
+		.ctl_name	= VM_CRITICAL_PAGES,
+		.procname	= "critical_pages",
+		.data		= &critical_pages,
+		.maxlen		= sizeof(critical_pages),
+		.mode		= 0644,
+		.proc_handler	= &critical_pages_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6.15-rc1+critical_pool/mm/page_alloc.c
===================================================================
--- linux-2.6.15-rc1+critical_pool.orig/mm/page_alloc.c	2005-11-15 13:47:26.000000000 -0800
+++ linux-2.6.15-rc1+critical_pool/mm/page_alloc.c	2005-11-17 16:27:33.120543288 -0800
@@ -53,6 +53,65 @@ unsigned long totalram_pages __read_most
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 
+/* The number of pages to maintain in the critical page pool */
+int critical_pages = 0;
+
+/* Critical Page Pool control structure */
+static struct critical_pool {
+	unsigned int free;
+	spinlock_t lock;
+	struct list_head pages;
+} critical_pool = {
+	.free	= 0,
+	.lock	= SPIN_LOCK_UNLOCKED,
+	.pages	= LIST_HEAD_INIT(critical_pool.pages),
+};
+
+/* MCD - This needs to be arch specific */
+#define CRITICAL_POOL_GFPMASK	(GFP_KERNEL)
+
+static inline int is_critical_pool_full(void)
+{
+	return critical_pool.free >= critical_pages;
+}
+
+static inline struct page *get_critical_page(gfp_t gfpmask)
+{
+	struct page *page = NULL;
+
+	spin_lock(&critical_pool.lock);
+	if (!list_empty(&critical_pool.pages)) {
+		/* Grab the next free critical pool page */
+		page = list_entry(critical_pool.pages.next, struct page, lru);
+		list_del(&page->lru);
+		critical_pool.free--;
+	}
+	spin_unlock(&critical_pool.lock);
+
+	return page;
+}
+
+static inline int put_critical_page(struct page *page)
+{
+	int ret = 0;
+
+	spin_lock(&critical_pool.lock);
+	if (!is_critical_pool_full()) {
+		/*
+		 * We snached this page away in the process of being freed, so
+		 * we must re-increment it's count so we don't cause problems
+		 * when we hand it back out to a future __GFP_CRITICAL alloc.
+		 */
+		BUG_ON(!get_page_testone(page));
+		list_add(&page->lru, &critical_pool.pages);
+		critical_pool.free++;
+		ret = 1;
+	}
+	spin_unlock(&critical_pool.lock);
+
+	return ret;
+}
+
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
  *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -305,6 +364,10 @@ static inline void __free_pages_bulk (st
 
 	if (unlikely(order))
 		destroy_compound_page(page, order);
+	else if (!is_critical_pool_full())
+		/* If the critical pool isn't full, add this page to it */
+		if (put_critical_page(page))
+			return;
 
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
@@ -984,6 +1047,20 @@ rebalance:
 	}
 
 nopage:
+	/*
+	 * This is the LAST DITCH effort.  We maintain a pool of 'critical'
+	 * pages specifically for allocation requests marked __GFP_CRITICAL.
+	 * Rather than fail one of these allocations, take a page (if any)
+	 * from the critical pool.
+	 */
+	if (gfp_mask & __GFP_CRITICAL) {
+		page = get_critical_page(gfp_mask);
+		if (page) {
+			z = page_zone(page);
+			goto got_pg;
+		}
+	}
+
 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 		printk(KERN_WARNING "%s: page allocation failure."
 			" order:%d, mode:0x%x\n",
@@ -2522,6 +2599,68 @@ int lowmem_reserve_ratio_sysctl_handler(
 	return 0;
 }
 
+static inline void fill_critical_pool(int num)
+{
+	struct page *page;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		page = alloc_page(CRITICAL_POOL_GFPMASK);
+		if (!page)
+			return;
+		spin_lock(&critical_pool.lock);
+		list_add(&page->lru, &critical_pool.pages);
+		critical_pool.free++;
+		spin_unlock(&critical_pool.lock);
+	}
+}
+
+static inline void drain_critical_pool(int num)
+{
+	struct page *page;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		spin_lock(&critical_pool.lock);
+		BUG_ON(critical_pool.free < 0);
+		if (list_empty(&critical_pool.pages) || !critical_pool.free) {
+			spin_unlock(&critical_pool.lock);
+			break;
+		}
+			
+		page = list_entry(critical_pool.pages.next, struct page, lru);
+		list_del(&page->lru);
+		critical_pool.free--;
+		spin_unlock(&critical_pool.lock);
+			
+		__free_pages(page, 0);
+	}
+}
+
+/*
+ * critical_pages_sysctl_handler - handle writes to /proc/sys/vm/critical_pages
+ *     Whenever this file changes, add/remove pages to/from the critical page
+ *     pool.
+ */
+int critical_pages_sysctl_handler(ctl_table *table, int write,
+				   struct file *file, void __user *buffer,
+				   size_t *length, loff_t *ppos)
+{
+	int num;
+
+	proc_dointvec(table, write, file, buffer, length, ppos);
+	if (!write)
+		return 0;
+
+	num = critical_pages - critical_pool.free;
+	if (num > 0)
+		fill_critical_pool(num);
+	else if (num < 0)
+		drain_critical_pool(-num);
+
+	return 0;
+}
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
Index: linux-2.6.15-rc1+critical_pool/include/linux/mmzone.h
===================================================================
--- linux-2.6.15-rc1+critical_pool.orig/include/linux/mmzone.h	2005-11-15 13:46:38.000000000 -0800
+++ linux-2.6.15-rc1+critical_pool/include/linux/mmzone.h	2005-11-17 16:27:33.121543136 -0800
@@ -430,6 +430,8 @@ int min_free_kbytes_sysctl_handler(struc
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
+int critical_pages_sysctl_handler(struct ctl_table *, int, struct file *,
+				  void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
Index: linux-2.6.15-rc1+critical_pool/include/linux/mm.h
===================================================================
--- linux-2.6.15-rc1+critical_pool.orig/include/linux/mm.h	2005-11-15 13:46:36.000000000 -0800
+++ linux-2.6.15-rc1+critical_pool/include/linux/mm.h	2005-11-17 16:27:33.125542528 -0800
@@ -32,6 +32,8 @@ extern int sysctl_legacy_va_layout;
 #define sysctl_legacy_va_layout 0
 #endif
 
+extern int critical_pages;
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>

next prev parent reply	other threads:[~2005-11-18 19:36 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-11-18 19:32 [RFC][PATCH 0/8] " Matthew Dobson
2005-11-18 19:36 ` Matthew Dobson [this message]
2005-11-19  0:08   ` [RFC][PATCH 1/8] Create " Paul Jackson
2005-11-21  5:50     ` Matthew Dobson
2005-11-21  5:54       ` Paul Jackson
2005-11-18 19:36 ` [RFC][PATCH 2/8] Create emergency trigger Matthew Dobson
2005-11-19  0:21   ` Paul Jackson
2005-11-21  5:51     ` Matthew Dobson
2005-11-18 19:40 ` [RFC][PATCH 3/8] Slab cleanup Matthew Dobson
2005-11-18 19:41 ` [RFC][PATCH 4/8] Fix a bug in scsi_get_command Matthew Dobson
2005-11-18 19:43 ` [RFC][PATCH 5/8] get_object/return_object Matthew Dobson
2005-11-18 19:44 ` [RFC][PATCH 6/8] slab_destruct Matthew Dobson
2005-11-18 19:44 ` [RFC][PATCH 0/8] Critical Page Pool Avi Kivity
2005-11-18 19:51   ` Matthew Dobson
2005-11-18 20:42     ` Avi Kivity
2005-11-19  0:10       ` Paul Jackson
2005-11-21  5:36       ` Matthew Dobson
2005-11-18 19:45 ` [RFC][PATCH 7/8] __cache_grow() Matthew Dobson
2005-11-18 19:47 ` [RFC][PATCH 8/8] Add support critical pool support to the slab allocator Matthew Dobson
2005-11-18 19:56 ` [RFC][PATCH 0/8] Critical Page Pool Chris Wright
2005-11-21  5:47   ` Matthew Dobson
2005-11-21 13:29     ` Pavel Machek
2005-12-06 22:54       ` Matthew Dobson
2005-12-10  8:39         ` Pavel Machek
2005-11-20  7:45 ` Keith Owens
2005-11-21  5:53   ` Matthew Dobson
2005-11-20 23:04 ` Pavel Machek
2005-11-21  5:58   ` Matthew Dobson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=437E2D23.10109@us.ibm.com \
    --to=colpatch@us.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox