From: KUROSAWA Takahiro <kurosawa@valinux.co.jp>
To: ckrm-tech@lists.sourceforge.net
Cc: linux-mm@kvack.org, KUROSAWA Takahiro <kurosawa@valinux.co.jp>
Subject: [PATCH 1/2] Add the pzone
Date: Thu, 19 Jan 2006 17:04:38 +0900 (JST)
Message-ID: <20060119080413.24736.27946.sendpatchset@debian>
In-Reply-To: <20060119080408.24736.13148.sendpatchset@debian>
This patch implements the pzone (pseudo zone). A pzone can be used
to reserve pages within a zone. Pzones are implemented by extending
the zone structure and behave almost the same as conventional zones:
a pzone can be specified in a zonelist passed to __alloc_pages(), and
the vmscan code works on pzones with only a few modifications.
Signed-off-by: KUROSAWA Takahiro <kurosawa@valinux.co.jp>
---
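For reviewers, here is a minimal usage sketch (not part of this patch)
of how a caller, such as the class support code in patch 2/2, might
drive the new interface. It relies only on the prototypes added to
mmzone.h and on the single-entry zonelist trick that pzone_create()
itself uses internally; the function name, the parent-zone choice and
the error handling are illustrative assumptions, not part of the patch.

/* Illustrative sketch only -- names below are hypothetical. */
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/gfp.h>
#include <linux/string.h>

static struct zone *example_pzone;	/* hypothetical module-local pzone */

static int example_reserve_pages(struct zone *parent, int npages)
{
	struct zonelist zl;
	struct page *page;

	/* Carve npages pages out of the parent zone into a new pzone. */
	example_pzone = pzone_create(parent, "example", npages);
	if (!example_pzone)
		return -ENOMEM;

	/* Allocate from the reservation by naming the pzone in a zonelist. */
	memset(&zl, 0, sizeof(zl));
	zl.zones[0] = example_pzone;
	page = __alloc_pages(GFP_KERNEL, 0, &zl);
	if (page)
		__free_pages(page, 0);

	/* The reservation can later be resized or returned to the parent. */
	pzone_set_numpages(example_pzone, npages / 2);
	pzone_destroy(example_pzone);
	return 0;
}

The same zonelist pattern is what pzone_create() uses to pull its
initial pages from the parent zone, so an external controller only has
to manage the zonelist it hands to __alloc_pages().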
include/linux/gfp.h | 3
include/linux/mm.h | 49 ++
include/linux/mmzone.h | 118 ++++++
include/linux/swap.h | 2
mm/Kconfig | 6
mm/page_alloc.c | 845 +++++++++++++++++++++++++++++++++++++++++++++----
mm/shmem.c | 2
mm/vmscan.c | 75 +++-
8 files changed, 1020 insertions(+), 80 deletions(-)
diff -urNp a/include/linux/gfp.h b/include/linux/gfp.h
--- a/include/linux/gfp.h 2006-01-03 12:21:10.000000000 +0900
+++ b/include/linux/gfp.h 2006-01-19 15:23:42.000000000 +0900
@@ -47,6 +47,7 @@ struct vm_area_struct;
#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_NOLRU ((__force gfp_t)0x40000u) /* GFP_USER but will not be in LRU lists */
#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -55,7 +56,7 @@ struct vm_area_struct;
#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
- __GFP_NOMEMALLOC|__GFP_HARDWALL)
+ __GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_NOLRU)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_NOIO (__GFP_WAIT)
diff -urNp a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h 2006-01-03 12:21:10.000000000 +0900
+++ b/include/linux/mm.h 2006-01-19 15:23:00.000000000 +0900
@@ -397,6 +397,12 @@ void put_page(struct page *page);
* with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
* no space for node: | SECTION | ZONE | ... | FLAGS |
*/
+
+#ifdef CONFIG_PSEUDO_ZONE
+#define PZONE_BIT_WIDTH 1
+#else
+#define PZONE_BIT_WIDTH 0
+#endif
#ifdef CONFIG_SPARSEMEM
#define SECTIONS_WIDTH SECTIONS_SHIFT
#else
@@ -405,14 +411,15 @@ void put_page(struct page *page);
#define ZONES_WIDTH ZONES_SHIFT
-#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED
+#if PZONE_BIT_WIDTH+SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED
#define NODES_WIDTH NODES_SHIFT
#else
#define NODES_WIDTH 0
#endif
-/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
-#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
+/* Page flags: | [PZONE] | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
+#define PZONE_BIT_PGOFF ((sizeof(unsigned long)*8) - PZONE_BIT_WIDTH)
+#define SECTIONS_PGOFF (PZONE_BIT_PGOFF - SECTIONS_WIDTH)
#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
@@ -431,6 +438,7 @@ void put_page(struct page *page);
* sections we define the shift as 0; that plus a 0 mask ensures
* the compiler will optimise away reference to them.
*/
+#define PZONE_BIT_PGSHIFT (PZONE_BIT_PGOFF * (PZONE_BIT_WIDTH != 0))
#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
@@ -443,10 +451,11 @@ void put_page(struct page *page);
#endif
#define ZONETABLE_PGSHIFT ZONES_PGSHIFT
-#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
-#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
+#if PZONE_BIT_WIDTH+SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
+#error PZONE_BIT_WIDTH+SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
#endif
+#define PZONE_BIT_MASK ((1UL << PZONE_BIT_WIDTH) - 1)
#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
@@ -454,12 +463,38 @@ void put_page(struct page *page);
static inline unsigned long page_zonenum(struct page *page)
{
- return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
+ return (page->flags >> ZONES_PGSHIFT) & (ZONES_MASK | PZONE_BIT_MASK);
}
struct zone;
extern struct zone *zone_table[];
+#ifdef CONFIG_PSEUDO_ZONE
+static inline int page_in_pzone(struct page *page)
+{
+ return (page->flags >> PZONE_BIT_PGSHIFT) & PZONE_BIT_MASK;
+}
+
+static inline struct zone *page_zone(struct page *page)
+{
+ int idx;
+
+ idx = (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
+ if (page_in_pzone(page))
+ return pzone_table[idx].zone;
+ return zone_table[idx];
+}
+
+static inline unsigned long page_to_nid(struct page *page)
+{
+ return page_zone(page)->zone_pgdat->node_id;
+}
+#else
+static inline int page_in_pzone(struct page *page)
+{
+ return 0;
+}
+
static inline struct zone *page_zone(struct page *page)
{
return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
@@ -473,6 +508,8 @@ static inline unsigned long page_to_nid(
else
return page_zone(page)->zone_pgdat->node_id;
}
+#endif
+
static inline unsigned long page_to_section(struct page *page)
{
return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
diff -urNp a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h 2006-01-03 12:21:10.000000000 +0900
+++ b/include/linux/mmzone.h 2006-01-19 15:23:00.000000000 +0900
@@ -111,6 +111,15 @@ struct zone {
/* Fields commonly accessed by the page allocator */
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
+
+#ifdef CONFIG_PSEUDO_ZONE
+ /* Pseudo zone members: children list is protected by nr_zones_lock */
+ struct zone *parent;
+ struct list_head children;
+ struct list_head sibling;
+ int pzone_idx;
+#endif
+
/*
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
@@ -336,7 +345,71 @@ unsigned long __init node_memmap_size_by
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
-#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
+#define zone_idx(zone) (real_zone(zone) - (zone)->zone_pgdat->node_zones)
+
+#ifdef CONFIG_PSEUDO_ZONE
+#define MAX_NR_PZONES 1024
+
+struct pzone_table {
+ struct zone *zone;
+ struct list_head list;
+};
+
+extern struct pzone_table pzone_table[];
+
+void read_lock_nr_zones(void);
+void read_unlock_nr_zones(void);
+struct zone *pzone_create(struct zone *z, char *name, int npages);
+void pzone_destroy(struct zone *z);
+int pzone_set_numpages(struct zone *z, int npages);
+
+static inline void zone_init_pzone_link(struct zone *z)
+{
+ z->parent = NULL;
+ INIT_LIST_HEAD(&z->children);
+ INIT_LIST_HEAD(&z->sibling);
+ z->pzone_idx = -1;
+}
+
+static inline int zone_is_pseudo(struct zone *z)
+{
+ return (z->parent != NULL);
+}
+
+static inline struct zone *real_zone(struct zone *z)
+{
+ if (z->parent)
+ return z->parent;
+ return z;
+}
+
+static inline struct zone *pzone_next_in_zone(struct zone *z)
+{
+ if (zone_is_pseudo(z)) {
+ if (z->sibling.next == &z->parent->children)
+ z = NULL;
+ else
+ z = list_entry(z->sibling.next, struct zone, sibling);
+ } else {
+ if (list_empty(&z->children))
+ z = NULL;
+ else
+ z = list_entry(z->children.next, struct zone, sibling);
+ }
+
+ return z;
+}
+
+#else
+#define MAX_NR_PZONES 0
+
+static inline void read_lock_nr_zones(void) {}
+static inline void read_unlock_nr_zones(void) {}
+static inline void zone_init_pzone_link(struct zone *z) {}
+
+static inline int zone_is_pseudo(struct zone *z) { return 0; }
+static inline struct zone *real_zone(struct zone *z) { return z; }
+#endif
/**
* for_each_pgdat - helper macro to iterate over all nodes
@@ -360,6 +433,19 @@ static inline struct zone *next_zone(str
{
pg_data_t *pgdat = zone->zone_pgdat;
+#ifdef CONFIG_PSEUDO_ZONE
+ if (zone_is_pseudo(zone)) {
+ if (zone->sibling.next != &zone->parent->children)
+ return list_entry(zone->sibling.next, struct zone,
+ sibling);
+ else
+ zone = zone->parent;
+ } else {
+ if (!list_empty(&zone->children))
+ return list_entry(zone->children.next, struct zone,
+ sibling);
+ }
+#endif
if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
zone++;
else if (pgdat->pgdat_next) {
@@ -371,6 +457,31 @@ static inline struct zone *next_zone(str
return zone;
}
+static inline struct zone *next_zone_in_node(struct zone *zone, int len)
+{
+ pg_data_t *pgdat = zone->zone_pgdat;
+
+#ifdef CONFIG_PSEUDO_ZONE
+ if (zone_is_pseudo(zone)) {
+ if (zone->sibling.next != &zone->parent->children)
+ return list_entry(zone->sibling.next, struct zone,
+ sibling);
+ else
+ zone = zone->parent;
+ } else {
+ if (!list_empty(&zone->children))
+ return list_entry(zone->children.next, struct zone,
+ sibling);
+ }
+#endif
+ if (zone < pgdat->node_zones + len - 1)
+ zone++;
+ else
+ zone = NULL;
+
+ return zone;
+}
+
/**
* for_each_zone - helper macro to iterate over all memory zones
* @zone - pointer to struct zone variable
@@ -389,6 +500,9 @@ static inline struct zone *next_zone(str
#define for_each_zone(zone) \
for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
+#define for_each_zone_in_node(zone, pgdat, len) \
+ for (zone = pgdat->node_zones; zone; zone = next_zone_in_node(zone, len))
+
static inline int is_highmem_idx(int idx)
{
return (idx == ZONE_HIGHMEM);
@@ -406,11 +520,13 @@ static inline int is_normal_idx(int idx)
*/
static inline int is_highmem(struct zone *zone)
{
+ zone = real_zone(zone);
return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
}
static inline int is_normal(struct zone *zone)
{
+ zone = real_zone(zone);
return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
}
diff -urNp a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h 2006-01-03 12:21:10.000000000 +0900
+++ b/include/linux/swap.h 2006-01-19 15:23:00.000000000 +0900
@@ -171,6 +171,8 @@ extern int rotate_reclaimable_page(struc
extern void swap_setup(void);
/* linux/mm/vmscan.c */
+extern int isolate_lru_pages(int, struct list_head *, struct list_head *,
+ int *);
extern int try_to_free_pages(struct zone **, gfp_t);
extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
extern int shrink_all_memory(int);
diff -urNp a/mm/Kconfig b/mm/Kconfig
--- a/mm/Kconfig 2006-01-03 12:21:10.000000000 +0900
+++ b/mm/Kconfig 2006-01-19 15:24:13.000000000 +0900
@@ -132,3 +132,9 @@ config SPLIT_PTLOCK_CPUS
default "4096" if ARM && !CPU_CACHE_VIPT
default "4096" if PARISC && !PA20
default "4"
+
+config PSEUDO_ZONE
+ bool "Pseudo zone support"
+ help
+ This option allows pseudo zones to be created from a non-pseudo zone.
+ Pseudo zones can be used for memory resource management.
diff -urNp a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c 2006-01-03 12:21:10.000000000 +0900
+++ b/mm/page_alloc.c 2006-01-19 15:23:00.000000000 +0900
@@ -309,6 +309,14 @@ static inline void __free_pages_bulk (st
BUG_ON(bad_range(zone, page));
zone->free_pages += order_size;
+
+ /*
+ * Do not concatenate a page in the pzone.
+ * Order>0 pages are never allocated from pzones (so far?).
+ */
+ if (unlikely(page_in_pzone(page)))
+ goto skip_buddy;
+
while (order < MAX_ORDER-1) {
unsigned long combined_idx;
struct free_area *area;
@@ -321,6 +329,7 @@ static inline void __free_pages_bulk (st
break;
if (!page_is_buddy(buddy, order))
break; /* Move the buddy up one level. */
+ BUG_ON(page_zone(page) != page_zone(buddy));
list_del(&buddy->lru);
area = zone->free_area + order;
area->nr_free--;
@@ -330,6 +339,8 @@ static inline void __free_pages_bulk (st
order++;
}
set_page_order(page, order);
+
+skip_buddy: /* Keep order and PagePrivate unset for pzone pages. */
list_add(&page->lru, &zone->free_area[order].free_list);
zone->free_area[order].nr_free++;
}
@@ -565,6 +576,7 @@ void drain_remote_pages(void)
unsigned long flags;
local_irq_save(flags);
+ read_lock_nr_zones();
for_each_zone(zone) {
struct per_cpu_pageset *pset;
@@ -582,30 +594,37 @@ void drain_remote_pages(void)
&pcp->list, 0);
}
}
+ read_unlock_nr_zones();
local_irq_restore(flags);
}
#endif
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
-static void __drain_pages(unsigned int cpu)
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PSEUDO_ZONE)
+static void __drain_zone_pages(struct zone *zone, int cpu)
{
- struct zone *zone;
+ struct per_cpu_pageset *pset;
int i;
- for_each_zone(zone) {
- struct per_cpu_pageset *pset;
-
- pset = zone_pcp(zone, cpu);
- for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
- struct per_cpu_pages *pcp;
+ pset = zone_pcp(zone, cpu);
+ for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+ struct per_cpu_pages *pcp;
- pcp = &pset->pcp[i];
- pcp->count -= free_pages_bulk(zone, pcp->count,
- &pcp->list, 0);
- }
+ pcp = &pset->pcp[i];
+ pcp->count -= free_pages_bulk(zone, pcp->count,
+ &pcp->list, 0);
}
}
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
+
+static void __drain_pages(unsigned int cpu)
+{
+ struct zone *zone;
+
+ read_lock_nr_zones();
+ for_each_zone(zone)
+ __drain_zone_pages(zone, cpu);
+ read_unlock_nr_zones();
+}
+#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU || CONFIG_PSEUDO_ZONE */
#ifdef CONFIG_PM
@@ -1080,8 +1099,10 @@ unsigned int nr_free_pages(void)
unsigned int sum = 0;
struct zone *zone;
+ read_lock_nr_zones();
for_each_zone(zone)
sum += zone->free_pages;
+ read_unlock_nr_zones();
return sum;
}
@@ -1331,6 +1352,7 @@ void show_free_areas(void)
unsigned long free;
struct zone *zone;
+ read_lock_nr_zones();
for_each_zone(zone) {
show_node(zone);
printk("%s per-cpu:", zone->name);
@@ -1427,6 +1449,7 @@ void show_free_areas(void)
spin_unlock_irqrestore(&zone->lock, flags);
printk("= %lukB\n", K(total));
}
+ read_unlock_nr_zones();
show_swap_cache_info();
}
@@ -1836,6 +1859,7 @@ static int __devinit process_zones(int c
{
struct zone *zone, *dzone;
+ read_lock_nr_zones();
for_each_zone(zone) {
zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
@@ -1845,6 +1869,7 @@ static int __devinit process_zones(int c
setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
}
+ read_unlock_nr_zones();
return 0;
bad:
@@ -1854,6 +1879,7 @@ bad:
kfree(dzone->pageset[cpu]);
dzone->pageset[cpu] = NULL;
}
+ read_unlock_nr_zones();
return -ENOMEM;
}
@@ -1862,12 +1888,14 @@ static inline void free_zone_pagesets(in
#ifdef CONFIG_NUMA
struct zone *zone;
+ read_lock_nr_zones();
for_each_zone(zone) {
struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
zone_pcp(zone, cpu) = NULL;
kfree(pset);
}
+ read_unlock_nr_zones();
#endif
}
@@ -2006,6 +2034,7 @@ static void __init free_area_init_core(s
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+ zone_init_pzone_link(zone);
zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
@@ -2111,11 +2140,11 @@ static int frag_show(struct seq_file *m,
{
pg_data_t *pgdat = (pg_data_t *)arg;
struct zone *zone;
- struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
int order;
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ read_lock_nr_zones();
+ for_each_zone_in_node(zone, pgdat, MAX_NR_ZONES) {
if (!zone->present_pages)
continue;
@@ -2126,6 +2155,7 @@ static int frag_show(struct seq_file *m,
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
+ read_unlock_nr_zones();
return 0;
}
@@ -2143,10 +2173,10 @@ static int zoneinfo_show(struct seq_file
{
pg_data_t *pgdat = arg;
struct zone *zone;
- struct zone *node_zones = pgdat->node_zones;
unsigned long flags;
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+ read_lock_nr_zones();
+ for_each_zone_in_node(zone, pgdat, MAX_NR_ZONES) {
int i;
if (!zone->present_pages)
@@ -2234,6 +2264,7 @@ static int zoneinfo_show(struct seq_file
spin_unlock_irqrestore(&zone->lock, flags);
seq_putc(m, '\n');
}
+ read_unlock_nr_zones();
return 0;
}
@@ -2414,6 +2445,45 @@ static void setup_per_zone_lowmem_reserv
}
}
+static void setup_zone_pages_min(struct zone *zone, unsigned long lowmem_pages)
+{
+ unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+ unsigned long flags;
+ unsigned long tmp;
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ tmp = (pages_min * zone->present_pages) / lowmem_pages;
+ if (is_highmem(zone)) {
+ /*
+ * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+ * need highmem pages, so cap pages_min to a small
+ * value here.
+ *
+ * The (pages_high-pages_low) and (pages_low-pages_min)
+ * deltas controls asynch page reclaim, and so should
+ * not be capped for highmem.
+ */
+ int min_pages;
+
+ min_pages = zone->present_pages / 1024;
+ if (min_pages < SWAP_CLUSTER_MAX)
+ min_pages = SWAP_CLUSTER_MAX;
+ if (min_pages > 128)
+ min_pages = 128;
+ zone->pages_min = min_pages;
+ } else {
+ /*
+ * If it's a lowmem zone, reserve a number of pages
+ * proportionate to the zone's size.
+ */
+ zone->pages_min = tmp;
+ }
+
+ zone->pages_low = zone->pages_min + tmp / 4;
+ zone->pages_high = zone->pages_min + tmp / 2;
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+
/*
* setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
* that the pages_{min,low,high} values for each zone are set correctly
@@ -2421,51 +2491,19 @@ static void setup_per_zone_lowmem_reserv
*/
void setup_per_zone_pages_min(void)
{
- unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
- unsigned long flags;
+ read_lock_nr_zones();
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone->present_pages;
}
- for_each_zone(zone) {
- unsigned long tmp;
- spin_lock_irqsave(&zone->lru_lock, flags);
- tmp = (pages_min * zone->present_pages) / lowmem_pages;
- if (is_highmem(zone)) {
- /*
- * __GFP_HIGH and PF_MEMALLOC allocations usually don't
- * need highmem pages, so cap pages_min to a small
- * value here.
- *
- * The (pages_high-pages_low) and (pages_low-pages_min)
- * deltas controls asynch page reclaim, and so should
- * not be capped for highmem.
- */
- int min_pages;
-
- min_pages = zone->present_pages / 1024;
- if (min_pages < SWAP_CLUSTER_MAX)
- min_pages = SWAP_CLUSTER_MAX;
- if (min_pages > 128)
- min_pages = 128;
- zone->pages_min = min_pages;
- } else {
- /*
- * If it's a lowmem zone, reserve a number of pages
- * proportionate to the zone's size.
- */
- zone->pages_min = tmp;
- }
-
- zone->pages_low = zone->pages_min + tmp / 4;
- zone->pages_high = zone->pages_min + tmp / 2;
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- }
+ for_each_zone(zone)
+ setup_zone_pages_min(zone, lowmem_pages);
+ read_unlock_nr_zones();
}
/*
@@ -2629,3 +2667,702 @@ void *__init alloc_large_system_hash(con
return table;
}
+
+#ifdef CONFIG_PSEUDO_ZONE
+
+#include <linux/mm_inline.h>
+
+struct pzone_table pzone_table[MAX_NR_PZONES];
+EXPORT_SYMBOL(pzone_table);
+
+static struct list_head pzone_freelist = LIST_HEAD_INIT(pzone_freelist);
+
+/*
+ * Protection between pzone_destroy() and pzone list lookups.
+ * These routines don't guard references from zonelists used in the page
+ * allocator.
+ * pzone maintainer (i.e. the class support routine) should remove the pzone
+ * from a zonelist (and probably make sure that there are no tasks in
+ * that class), then destroy the pzone.
+ */
+static spinlock_t nr_zones_lock = SPIN_LOCK_UNLOCKED;
+static int zones_readers = 0;
+static DECLARE_WAIT_QUEUE_HEAD(zones_waitqueue);
+
+static struct workqueue_struct *pzone_drain_wq;
+static DEFINE_PER_CPU(struct work_struct, pzone_drain_work);
+
+void read_lock_nr_zones(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&nr_zones_lock, flags);
+ zones_readers++;
+ spin_unlock_irqrestore(&nr_zones_lock, flags);
+}
+
+void read_unlock_nr_zones(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&nr_zones_lock, flags);
+ zones_readers--;
+ if ((zones_readers == 0) && waitqueue_active(&zones_waitqueue))
+ wake_up(&zones_waitqueue);
+ spin_unlock_irqrestore(&nr_zones_lock, flags);
+}
+
+static void write_lock_nr_zones(unsigned long *flagsp)
+{
+ DEFINE_WAIT(wait);
+
+ spin_lock_irqsave(&nr_zones_lock, *flagsp);
+ while (zones_readers) {
+ spin_unlock_irqrestore(&nr_zones_lock, *flagsp);
+ prepare_to_wait(&zones_waitqueue, &wait,
+ TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&zones_waitqueue, &wait);
+ spin_lock_irqsave(&nr_zones_lock, *flagsp);
+ }
+}
+
+static void write_unlock_nr_zones(unsigned long *flagsp)
+{
+ spin_unlock_irqrestore(&nr_zones_lock, *flagsp);
+}
+
+static int pzone_table_register(struct zone *z)
+{
+ struct pzone_table *t;
+ unsigned long flags;
+
+ write_lock_nr_zones(&flags);
+ if (list_empty(&pzone_freelist)) {
+ write_unlock_nr_zones(&flags);
+ return -ENOMEM;
+ }
+
+ t = list_entry(pzone_freelist.next, struct pzone_table, list);
+ list_del(&t->list);
+ z->pzone_idx = t - pzone_table;
+ t->zone = z;
+ write_unlock_nr_zones(&flags);
+
+ return 0;
+}
+
+static void pzone_table_unregister(struct zone *z)
+{
+ struct pzone_table *t;
+ unsigned long flags;
+
+ write_lock_nr_zones(&flags);
+ t = &pzone_table[z->pzone_idx];
+ t->zone = NULL;
+ list_add(&t->list, &pzone_freelist);
+ write_unlock_nr_zones(&flags);
+}
+
+static void pzone_parent_register(struct zone *z, struct zone *parent)
+{
+ unsigned long flags;
+
+ write_lock_nr_zones(&flags);
+ list_add(&z->sibling, &parent->children);
+ write_unlock_nr_zones(&flags);
+}
+
+static void pzone_parent_unregister(struct zone *z)
+{
+ unsigned long flags;
+
+ write_lock_nr_zones(&flags);
+ list_del(&z->sibling);
+ write_unlock_nr_zones(&flags);
+}
+
+/*
+ * pzone alloc/free routines
+ */
+#ifdef CONFIG_NUMA
+static int pzone_setup_pagesets(struct zone *z)
+{
+ struct per_cpu_pageset *pageset;
+ int batch;
+ int nid;
+ int i;
+
+ zone_pcp_init(z);
+
+ nid = z->zone_pgdat->node_id;
+ batch = zone_batchsize(z);
+
+ lock_cpu_hotplug();
+ for_each_online_cpu(i) {
+ pageset = kmalloc_node(sizeof(*pageset), GFP_KERNEL, nid);
+ if (!pageset)
+ goto bad;
+ z->pageset[i] = pageset;
+ setup_pageset(pageset, batch);
+ }
+ unlock_cpu_hotplug();
+
+ return 0;
+bad:
+ for (i = 0; i < NR_CPUS; i++) {
+ if (z->pageset[i] != &boot_pageset[i])
+ kfree(z->pageset[i]);
+ z->pageset[i] = NULL;
+ }
+ unlock_cpu_hotplug();
+
+ return -ENOMEM;
+}
+
+static void pzone_free_pagesets(struct zone *z)
+{
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (z->pageset[i] && (zone_pcp(z, i) != &boot_pageset[i])) {
+ BUG_ON(zone_pcp(z, i)->pcp[0].count != 0);
+ BUG_ON(zone_pcp(z, i)->pcp[1].count != 0);
+ kfree(zone_pcp(z, i));
+ }
+ zone_pcp(z, i) = NULL;
+ }
+}
+#else /* !CONFIG_NUMA */
+static inline int pzone_setup_pagesets(struct zone *z)
+{
+ int batch;
+ int i;
+
+ batch = zone_batchsize(z);
+ for (i = 0; i < NR_CPUS; i++)
+ setup_pageset(zone_pcp(z, i), batch);
+
+ return 0;
+}
+
+static inline void pzone_free_pagesets(struct zone *z)
+{
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ BUG_ON(zone_pcp(z, i)->pcp[0].count != 0);
+ BUG_ON(zone_pcp(z, i)->pcp[1].count != 0);
+ }
+}
+#endif /* CONFIG_NUMA */
+
+static inline void pzone_setup_page_flags(struct zone *z,
+ struct page *page)
+{
+ page->flags &= ~(ZONETABLE_MASK << ZONETABLE_PGSHIFT);
+ page->flags |= ((unsigned long)z->pzone_idx << ZONETABLE_PGSHIFT);
+ page->flags |= 1UL << PZONE_BIT_PGSHIFT;
+}
+
+static inline void pzone_restore_page_flags(struct zone *parent,
+ struct page *page)
+{
+ set_page_links(page, zone_idx(parent), parent->zone_pgdat->node_id,
+ page_to_pfn(page));
+ page->flags &= ~(1UL << PZONE_BIT_PGSHIFT);
+}
+
+/*
+ * pzone_bad_range(): implemented for debugging instead of bad_range()
+ * in order to distinguish what causes the crash.
+ */
+static int pzone_bad_range(struct zone *zone, struct page *page)
+{
+ if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+ BUG();
+ if (page_to_pfn(page) < zone->zone_start_pfn)
+ BUG();
+#ifdef CONFIG_HOLES_IN_ZONE
+ if (!pfn_valid(page_to_pfn(page)))
+ BUG();
+#endif
+ if (zone != page_zone(page))
+ BUG();
+ return 0;
+}
+
+static void pzone_drain(void *arg)
+{
+ lru_add_drain();
+}
+
+static void pzone_punt_drain(void *arg)
+{
+ struct work_struct *wp;
+
+ wp = &get_cpu_var(pzone_drain_work);
+ PREPARE_WORK(wp, pzone_drain, arg);
+ /* queue_work() checks whether the work is used or not. */
+ queue_work(pzone_drain_wq, wp);
+ put_cpu_var(pzone_drain_work);
+}
+
+static void pzone_flush_percpu(void *arg)
+{
+ struct zone *z = arg;
+ unsigned long flags;
+ int cpu;
+
+ /*
+ * lru_add_drain() must not be called from interrupt context
+ * (LRU pagevecs are interrupt unsafe).
+ */
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ pzone_punt_drain(arg);
+ __drain_zone_pages(z, cpu);
+ local_irq_restore(flags);
+}
+
+static int pzone_flush_lru(struct zone *z, struct zone *parent,
+ struct list_head *clist, unsigned long *cnr,
+ int block)
+{
+ unsigned long flags;
+ struct page *page;
+ struct list_head list;
+ int n, moved, scan;
+
+ INIT_LIST_HEAD(&list);
+
+ spin_lock_irqsave(&z->lru_lock, flags);
+ n = isolate_lru_pages(*cnr, clist, &list, &scan);
+ *cnr -= n;
+ spin_unlock_irqrestore(&z->lru_lock, flags);
+
+ moved = 0;
+ while (!list_empty(&list) && n-- > 0) {
+ page = list_entry(list.prev, struct page, lru);
+ list_del(&page->lru);
+
+ if (block) {
+ lock_page(page);
+ wait_on_page_writeback(page);
+ } else {
+ if (TestSetPageLocked(page))
+ goto goaround;
+
+ /* Make sure the writeback bit stays clear. */
+ if (PageWriteback(page))
+ goto goaround_pagelocked;
+ }
+
+ /* Now we can safely modify the flags field. */
+ pzone_restore_page_flags(parent, page);
+ unlock_page(page);
+
+ spin_lock_irqsave(&parent->lru_lock, flags);
+ if (TestSetPageLRU(page))
+ BUG();
+
+ __put_page(page);
+ if (PageActive(page))
+ add_page_to_active_list(parent, page);
+ else
+ add_page_to_inactive_list(parent, page);
+ spin_unlock_irqrestore(&parent->lru_lock, flags);
+
+ moved++;
+ continue;
+
+goaround_pagelocked:
+ unlock_page(page);
+goaround:
+ spin_lock_irqsave(&z->lru_lock, flags);
+ __put_page(page);
+ if (TestSetPageLRU(page))
+ BUG();
+ list_add(&page->lru, clist);
+ ++*cnr;
+ spin_unlock_irqrestore(&z->lru_lock, flags);
+ }
+
+ return moved;
+}
+
+static void pzone_flush_free_area(struct zone *z)
+{
+ struct free_area *area;
+ struct page *page;
+ struct list_head list;
+ unsigned long flags;
+ int order;
+
+ INIT_LIST_HEAD(&list);
+
+ spin_lock_irqsave(&z->lock, flags);
+ area = &z->free_area[0];
+ while (!list_empty(&area->free_list)) {
+ page = list_entry(area->free_list.next, struct page, lru);
+ list_del(&page->lru);
+ area->nr_free--;
+ z->free_pages--;
+ z->present_pages--;
+ spin_unlock_irqrestore(&z->lock, flags);
+ pzone_restore_page_flags(z->parent, page);
+ pzone_bad_range(z->parent, page);
+ list_add(&page->lru, &list);
+ free_pages_bulk(z->parent, 1, &list, 0);
+
+ spin_lock_irqsave(&z->lock, flags);
+ }
+
+ BUG_ON(area->nr_free != 0);
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ /* Currently the pzone supports order-0 only; do a sanity check. */
+ spin_lock_irqsave(&z->lock, flags);
+ for (order = 1; order < MAX_ORDER; order++) {
+ area = &z->free_area[order];
+ BUG_ON(area->nr_free != 0);
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+}
+
+static int pzone_is_empty(struct zone *z)
+{
+ unsigned long flags;
+ int ret = 0;
+ int i;
+
+ spin_lock_irqsave(&z->lock, flags);
+ ret += z->present_pages;
+ ret += z->free_pages;
+ ret += z->free_area[0].nr_free;
+
+ /* would better use smp_call_function for scanning pcp. */
+ for (i = 0; i < NR_CPUS; i++) {
+#ifdef CONFIG_NUMA
+ if (!zone_pcp(z, i) || (zone_pcp(z, i) == &boot_pageset[i]))
+ continue;
+#endif
+ ret += zone_pcp(z, i)->pcp[0].count;
+ ret += zone_pcp(z, i)->pcp[1].count;
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ spin_lock_irqsave(&z->lru_lock, flags);
+ ret += z->nr_active;
+ ret += z->nr_inactive;
+ spin_unlock_irqrestore(&z->lru_lock, flags);
+
+ return ret == 0;
+}
+
+struct zone *pzone_create(struct zone *parent, char *name, int npages)
+{
+ struct zonelist zonelist;
+ struct zone *z;
+ struct page *page;
+ struct list_head *l;
+ unsigned long flags;
+ int len;
+ int i;
+
+ if (npages > parent->present_pages)
+ return NULL;
+
+ z = kmalloc_node(sizeof(*z), GFP_KERNEL, parent->zone_pgdat->node_id);
+ if (!z)
+ goto bad1;
+ memset(z, 0, sizeof(*z));
+
+ z->present_pages = z->free_pages = npages;
+ z->parent = parent;
+
+ spin_lock_init(&z->lock);
+ spin_lock_init(&z->lru_lock);
+ INIT_LIST_HEAD(&z->active_list);
+ INIT_LIST_HEAD(&z->inactive_list);
+
+ INIT_LIST_HEAD(&z->children);
+ INIT_LIST_HEAD(&z->sibling);
+
+ z->zone_pgdat = parent->zone_pgdat;
+ z->zone_mem_map = parent->zone_mem_map;
+ z->zone_start_pfn = parent->zone_start_pfn;
+ z->spanned_pages = parent->spanned_pages;
+ z->temp_priority = z->prev_priority = DEF_PRIORITY;
+
+ /* use wait_table of parents. */
+ z->wait_table = parent->wait_table;
+ z->wait_table_size = parent->wait_table_size;
+ z->wait_table_bits = parent->wait_table_bits;
+
+ len = strlen(name);
+ z->name = kmalloc_node(len + 1, GFP_KERNEL,
+ parent->zone_pgdat->node_id);
+ if (!z->name)
+ goto bad2;
+ strcpy(z->name, name);
+
+ if (pzone_setup_pagesets(z) < 0)
+ goto bad3;
+
+ /* no lowmem for the pseudo zone. leave lowmem_reserve all-0. */
+
+ zone_init_free_lists(z->zone_pgdat, z, z->spanned_pages);
+
+ /* setup a fake zonelist for allocating pages only from the parent. */
+ memset(&zonelist, 0, sizeof(zonelist));
+ zonelist.zones[0] = parent;
+ for (i = 0; i < npages; i++) {
+ page = __alloc_pages(GFP_KERNEL, 0, &zonelist);
+ if (!page)
+ goto bad4;
+ set_page_count(page, 0);
+ list_add(&page->lru, &z->free_area[0].free_list);
+ z->free_area[0].nr_free++;
+ }
+
+ if (pzone_table_register(z))
+ goto bad4;
+
+ list_for_each(l, &z->free_area[0].free_list) {
+ page = list_entry(l, struct page, lru);
+ pzone_setup_page_flags(z, page);
+ }
+
+ spin_lock_irqsave(&parent->lock, flags);
+ parent->present_pages -= npages;
+ spin_unlock_irqrestore(&parent->lock, flags);
+
+ setup_per_zone_pages_min();
+ setup_per_zone_lowmem_reserve();
+ pzone_parent_register(z, parent);
+
+ return z;
+bad4:
+ while (!list_empty(&z->free_area[0].free_list)) {
+ page = list_entry(z->free_area[0].free_list.next,
+ struct page, lru);
+ list_del(&page->lru);
+ pzone_restore_page_flags(parent, page);
+ set_page_count(page, 1);
+ __free_pages(page, 0);
+ }
+
+ pzone_free_pagesets(z);
+bad3:
+ if (z->name)
+ kfree(z->name);
+bad2:
+ kfree(z);
+bad1:
+ setup_per_zone_pages_min();
+ setup_per_zone_lowmem_reserve();
+
+ return NULL;
+}
+
+#define PZONE_FLUSH_LOOP_COUNT 8
+
+/*
+ * destroying pseudo zone. the caller should make sure that no one references
+ * this pseudo zone.
+ */
+void pzone_destroy(struct zone *z)
+{
+ struct zone *parent;
+ unsigned long flags;
+ unsigned long present;
+ int freed;
+ int retrycnt = 0;
+
+ parent = z->parent;
+ present = z->present_pages;
+ pzone_parent_unregister(z);
+retry:
+ /* drain pages in per-cpu pageset to free_area */
+ smp_call_function(pzone_flush_percpu, z, 0, 1);
+ pzone_flush_percpu(z);
+
+ /* drain pages in the LRU list. */
+ freed = pzone_flush_lru(z, parent, &z->active_list, &z->nr_active,
+ retrycnt > 0);
+ spin_lock_irqsave(&z->lock, flags);
+ z->present_pages -= freed;
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ freed = pzone_flush_lru(z, parent, &z->inactive_list, &z->nr_inactive,
+ retrycnt > 0);
+ spin_lock_irqsave(&z->lock, flags);
+ z->present_pages -= freed;
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ pzone_flush_free_area(z);
+
+ if (!pzone_is_empty(z)) {
+ retrycnt++;
+ if (retrycnt > PZONE_FLUSH_LOOP_COUNT) {
+ BUG();
+ } else {
+ flush_workqueue(pzone_drain_wq);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ);
+ goto retry;
+ }
+ }
+
+ spin_lock_irqsave(&parent->lock, flags);
+ parent->present_pages += present;
+ spin_unlock_irqrestore(&parent->lock, flags);
+
+ flush_workqueue(pzone_drain_wq);
+ pzone_table_unregister(z);
+ pzone_free_pagesets(z);
+ kfree(z->name);
+ kfree(z);
+
+ setup_per_zone_pages_min();
+ setup_per_zone_lowmem_reserve();
+}
+
+extern int shrink_zone_memory(struct zone *zone, int nr_pages);
+
+static int pzone_move_free_pages(struct zone *dst, struct zone *src,
+ int npages)
+{
+ struct zonelist zonelist;
+ struct list_head pagelist;
+ struct page *page;
+ unsigned long flags;
+ int err;
+ int i;
+
+ err = 0;
+ spin_lock_irqsave(&src->lock, flags);
+ if (npages > src->present_pages)
+ err = -ENOMEM;
+ spin_unlock_irqrestore(&src->lock, flags);
+ if (err)
+ return err;
+
+ smp_call_function(pzone_flush_percpu, src, 0, 1);
+ pzone_flush_percpu(src);
+
+ INIT_LIST_HEAD(&pagelist);
+ memset(&zonelist, 0, sizeof(zonelist));
+ zonelist.zones[0] = src;
+ for (i = 0; i < npages; i++) {
+ /*
+ * XXX to prevent myself from being arrested by oom-killer...
+ * should be replaced with cleaner code.
+ */
+ if (src->free_pages < npages - i) {
+ shrink_zone_memory(src, npages - i);
+ smp_call_function(pzone_flush_percpu, src, 0, 1);
+ pzone_flush_percpu(src);
+ blk_congestion_wait(WRITE, HZ/50);
+ }
+
+ page = __alloc_pages(GFP_KERNEL, 0, &zonelist);
+ if (!page) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ list_add(&page->lru, &pagelist);
+ }
+
+ while (!list_empty(&pagelist)) {
+ page = list_entry(pagelist.next, struct page, lru);
+ list_del(&page->lru);
+ if (zone_is_pseudo(dst))
+ pzone_setup_page_flags(dst, page);
+ else
+ pzone_restore_page_flags(dst, page);
+
+ set_page_count(page, 1);
+ spin_lock_irqsave(&dst->lock, flags);
+ dst->present_pages++;
+ spin_unlock_irqrestore(&dst->lock, flags);
+ __free_pages(page, 0);
+ }
+
+ spin_lock_irqsave(&src->lock, flags);
+ src->present_pages -= npages;
+ spin_unlock_irqrestore(&src->lock, flags);
+
+ return 0;
+bad:
+ while (!list_empty(&pagelist)) {
+ page = list_entry(pagelist.next, struct page, lru);
+ list_del(&page->lru);
+ __free_pages(page, 0);
+ }
+
+ return err;
+}
+
+int pzone_set_numpages(struct zone *z, int npages)
+{
+ struct zone *src, *dst;
+ unsigned long flags;
+ int err;
+ int n;
+
+ /*
+ * For now, this function must not be called concurrently;
+ * the caller is responsible for ensuring that.
+ */
+ if (z->present_pages == npages) {
+ return 0;
+ } else if (z->present_pages > npages) {
+ n = z->present_pages - npages;
+ src = z;
+ dst = z->parent;
+ } else {
+ n = npages - z->present_pages;
+ src = z->parent;
+ dst = z;
+ }
+
+ /* XXX Preventing oom-killer from complaining */
+ spin_lock_irqsave(&z->lock, flags);
+ z->pages_min = z->pages_low = z->pages_high = 0;
+ spin_unlock_irqrestore(&z->lock, flags);
+
+ err = pzone_move_free_pages(dst, src, n);
+ setup_per_zone_pages_min();
+ setup_per_zone_lowmem_reserve();
+
+ return err;
+}
+
+static int pzone_init(void)
+{
+ struct work_struct *wp;
+ int i;
+
+ pzone_drain_wq = create_workqueue("pzone");
+ if (!pzone_drain_wq) {
+ printk(KERN_ERR "pzone: create_workqueue failed.\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < NR_CPUS; i++) {
+ wp = &per_cpu(pzone_drain_work, i);
+ INIT_WORK(wp, pzone_drain, NULL);
+ }
+
+ for (i = 0; i < MAX_NR_PZONES; i++)
+ list_add_tail(&pzone_table[i].list, &pzone_freelist);
+
+ return 0;
+}
+
+__initcall(pzone_init);
+
+#endif /* CONFIG_PSEUDO_ZONE */
diff -urNp a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c 2006-01-03 12:21:10.000000000 +0900
+++ b/mm/shmem.c 2006-01-19 15:23:00.000000000 +0900
@@ -366,7 +366,7 @@ static swp_entry_t *shmem_swp_alloc(stru
}
spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO | __GFP_NOLRU);
if (page)
set_page_private(page, 0);
spin_lock(&info->lock);
diff -urNp a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c 2006-01-03 12:21:10.000000000 +0900
+++ b/mm/vmscan.c 2006-01-19 15:23:00.000000000 +0900
@@ -591,8 +591,8 @@ keep:
*
* returns how many pages were moved onto *@dst.
*/
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
+int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+ struct list_head *dst, int *scanned)
{
int nr_taken = 0;
struct page *page;
@@ -1047,6 +1047,7 @@ static int balance_pgdat(pg_data_t *pgda
int priority;
int i;
int total_scanned, total_reclaimed;
+ struct zone *zone;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc;
@@ -1060,11 +1061,8 @@ loop_again:
inc_page_state(pageoutrun);
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
+ for_each_zone_in_node(zone, pgdat, pgdat->nr_zones)
zone->temp_priority = DEF_PRIORITY;
- }
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -1082,7 +1080,24 @@ loop_again:
* zone which needs scanning
*/
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
- struct zone *zone = pgdat->node_zones + i;
+#ifdef CONFIG_PSEUDO_ZONE
+ for (zone = pgdat->node_zones + i; zone;
+ zone = pzone_next_in_zone(zone)) {
+ if (zone->present_pages == 0)
+ continue;
+
+ if (zone->all_unreclaimable &&
+ priority != DEF_PRIORITY)
+ continue;
+
+ if (!zone_watermark_ok(zone, order,
+ zone->pages_high, 0, 0)) {
+ end_zone = i;
+ goto scan;
+ }
+ }
+#else /* !CONFIG_PSEUDO_ZONE */
+ zone = pgdat->node_zones + i;
if (zone->present_pages == 0)
continue;
@@ -1096,17 +1111,15 @@ loop_again:
end_zone = i;
goto scan;
}
+#endif /* !CONFIG_PSEUDO_ZONE */
}
goto out;
} else {
end_zone = pgdat->nr_zones - 1;
}
scan:
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
+ for_each_zone_in_node(zone, pgdat, end_zone)
lru_pages += zone->nr_active + zone->nr_inactive;
- }
/*
* Now scan the zone in the dma->highmem direction, stopping
@@ -1117,8 +1130,7 @@ scan:
* pages behind kswapd's direction of progress, which would
* cause too much scanning of the lower zones.
*/
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
+ for_each_zone_in_node(zone, pgdat, end_zone) {
int nr_slab;
if (zone->present_pages == 0)
@@ -1183,11 +1195,9 @@ scan:
break;
}
out:
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
+ for_each_zone_in_node(zone, pgdat, pgdat->nr_zones)
zone->prev_priority = zone->temp_priority;
- }
+
if (!all_zones_ok) {
cond_resched();
goto loop_again;
@@ -1261,7 +1271,9 @@ static int kswapd(void *p)
}
finish_wait(&pgdat->kswapd_wait, &wait);
+ read_lock_nr_zones();
balance_pgdat(pgdat, 0, order);
+ read_unlock_nr_zones();
}
return 0;
}
@@ -1316,6 +1328,35 @@ int shrink_all_memory(int nr_pages)
}
#endif
+#ifdef CONFIG_PSEUDO_ZONE
+int shrink_zone_memory(struct zone *zone, int nr_pages)
+{
+ struct scan_control sc;
+
+ sc.gfp_mask = GFP_KERNEL;
+ sc.may_writepage = 1;
+ sc.may_swap = 1;
+ sc.nr_mapped = read_page_state(nr_mapped);
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
+ sc.priority = 0;
+
+ if (nr_pages < SWAP_CLUSTER_MAX)
+ sc.swap_cluster_max = nr_pages;
+ else
+ sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+ sc.nr_to_reclaim = sc.swap_cluster_max;
+ sc.nr_to_scan = sc.swap_cluster_max;
+ sc.nr_mapped = total_memory; /* XXX to make vmscan aggressive */
+ refill_inactive_zone(zone, &sc);
+ sc.nr_to_scan = sc.swap_cluster_max;
+ shrink_cache(zone, &sc);
+
+ return sc.nr_reclaimed;
+}
+#endif
+
#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
--