linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: KUROSAWA Takahiro <kurosawa@valinux.co.jp>
To: ckrm-tech@lists.sourceforge.net
Cc: linux-mm@kvack.org, KUROSAWA Takahiro <kurosawa@valinux.co.jp>
Subject: [PATCH 6/8] Add the pzone_destroy() function
Date: Tue, 31 Jan 2006 11:30:30 +0900 (JST)	[thread overview]
Message-ID: <20060131023030.7915.57560.sendpatchset@debian> (raw)
In-Reply-To: <20060131023000.7915.71955.sendpatchset@debian>

This patch implements destruction of pzones.  Pages in a destroyed
pzone are returned to its parent zone (the zone from which the pzone
was created).

Signed-off-by: KUROSAWA Takahiro <kurosawa@valinux.co.jp>

---
 include/linux/mmzone.h |    1 
 include/linux/swap.h   |    2 
 mm/page_alloc.c        |  287 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c            |    4 
 4 files changed, 292 insertions(+), 2 deletions(-)

diff -urNp a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h	2006-01-30 14:33:44.000000000 +0900
+++ b/include/linux/mmzone.h	2006-01-30 14:34:39.000000000 +0900
@@ -362,6 +362,7 @@ struct pzone_table {
 extern struct pzone_table pzone_table[];
 
 struct zone *pzone_create(struct zone *z, char *name, int npages);
+void pzone_destroy(struct zone *z);
 
 static inline void zone_init_pzone_link(struct zone *z)
 {
diff -urNp a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h	2006-01-03 12:21:10.000000000 +0900
+++ b/include/linux/swap.h	2006-01-30 11:23:03.000000000 +0900
@@ -171,6 +171,8 @@ extern int rotate_reclaimable_page(struc
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
+extern int isolate_lru_pages(int, struct list_head *, struct list_head *,
+		int *);
 extern int try_to_free_pages(struct zone **, gfp_t);
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 extern int shrink_all_memory(int);
diff -urNp a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	2006-01-30 14:33:44.000000000 +0900
+++ b/mm/page_alloc.c	2006-01-30 14:34:39.000000000 +0900
@@ -2727,6 +2727,9 @@ EXPORT_SYMBOL(pzone_table);
 
 static struct list_head pzone_freelist = LIST_HEAD_INIT(pzone_freelist);
 
+static struct workqueue_struct *pzone_drain_wq;
+static DEFINE_PER_CPU(struct work_struct, pzone_drain_work);
+
 static int pzone_table_register(struct zone *z)
 {
 	struct pzone_table *t;
@@ -2747,6 +2750,18 @@ static int pzone_table_register(struct z
 	return 0;
 }
 
+static void pzone_table_unregister(struct zone *z)
+{
+	struct pzone_table *t;
+	unsigned long flags;
+
+	write_lock_nr_zones(&flags);
+	t = &pzone_table[z->pzone_idx];
+	t->zone = NULL;
+	list_add(&t->list, &pzone_freelist);
+	write_unlock_nr_zones(&flags);
+}
+
 static void pzone_parent_register(struct zone *z, struct zone *parent)
 {
 	unsigned long flags;
@@ -2756,6 +2771,15 @@ static void pzone_parent_register(struct
 	write_unlock_nr_zones(&flags);
 }
 
+static void pzone_parent_unregister(struct zone *z)
+{
+	unsigned long flags;
+
+	write_lock_nr_zones(&flags);
+	list_del(&z->sibling);
+	write_unlock_nr_zones(&flags);
+}
+
 /*
  * pzone alloc/free routines
  */
@@ -2847,6 +2871,194 @@ static inline void pzone_restore_page_fl
 	page->flags &= ~(1UL << PZONE_BIT_PGSHIFT);
 }
 
+/*
+ * pzone_bad_range(): implemented for debugging instead of bad_range()
+ * in order to distinguish what causes the crash.
+ */
+static int pzone_bad_range(struct zone *zone, struct page *page)
+{
+	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+		BUG();
+	if (page_to_pfn(page) < zone->zone_start_pfn)
+		BUG();
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(page_to_pfn(page)))
+		BUG();
+#endif
+	if (zone != page_zone(page))
+		BUG();
+	return 0;
+}
+
+static void pzone_drain(void *arg)
+{
+	lru_add_drain();
+}
+
+static void pzone_punt_drain(void *arg)
+{
+	struct work_struct *wp;
+
+	wp = &get_cpu_var(pzone_drain_work);
+	PREPARE_WORK(wp, pzone_drain, arg);
+	/* queue_work() checks whether the work is used or not. */
+	queue_work(pzone_drain_wq, wp);
+	put_cpu_var(pzone_drain_work);
+}
+
+static void pzone_flush_percpu(void *arg)
+{
+	struct zone *z = arg;
+	unsigned long flags;
+	int cpu;
+
+	/*
+	 * lru_add_drain() must not be called from interrupt context
+	 * (LRU pagevecs are interrupt unsafe).
+	 */
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	pzone_punt_drain(arg);
+	__drain_zone_pages(z, cpu);
+	local_irq_restore(flags);
+}
+
+static int pzone_flush_lru(struct zone *z, struct zone *parent,
+			   struct list_head *clist, unsigned long *cnr,
+			   int block)
+{
+	unsigned long flags;
+	struct page *page;
+	struct list_head list;
+	int n, moved, scan;
+
+	INIT_LIST_HEAD(&list);
+
+	spin_lock_irqsave(&z->lru_lock, flags);
+	n = isolate_lru_pages(*cnr, clist, &list, &scan);
+	*cnr -= n;
+	spin_unlock_irqrestore(&z->lru_lock, flags);
+
+	moved = 0;
+	while (!list_empty(&list) && n-- > 0) {
+		page = list_entry(list.prev, struct page, lru);
+		list_del(&page->lru);
+
+		if (block) {
+			lock_page(page);
+			wait_on_page_writeback(page);
+		} else {
+			if (TestSetPageLocked(page))
+				goto goaround;
+
+			/* Make sure the writeback bit is still zero. */
+			if (PageWriteback(page))
+				goto goaround_pagelocked;
+		}
+
+		/* Now we can safely modify the flags field. */
+		pzone_restore_page_flags(parent, page);
+		unlock_page(page);
+
+		spin_lock_irqsave(&parent->lru_lock, flags);
+		if (TestSetPageLRU(page))
+			BUG();
+
+		__put_page(page);
+		if (PageActive(page))
+			add_page_to_active_list(parent, page);
+		else
+			add_page_to_inactive_list(parent, page);
+		spin_unlock_irqrestore(&parent->lru_lock, flags);
+
+		moved++;
+		continue;
+
+goaround_pagelocked:
+		unlock_page(page);
+goaround:
+		spin_lock_irqsave(&z->lru_lock, flags);
+		__put_page(page);
+		if (TestSetPageLRU(page))
+			BUG();
+		list_add(&page->lru, clist);
+		++*cnr;
+		spin_unlock_irqrestore(&z->lru_lock, flags);
+	}
+
+	return moved;
+}
+
+static void pzone_flush_free_area(struct zone *z)
+{
+	struct free_area *area;
+	struct page *page;
+	struct list_head list;
+	unsigned long flags;
+	int order;
+
+	INIT_LIST_HEAD(&list);
+
+	spin_lock_irqsave(&z->lock, flags);
+	area = &z->free_area[0];
+	while (!list_empty(&area->free_list)) {
+		page = list_entry(area->free_list.next, struct page, lru);
+		list_del(&page->lru);
+		area->nr_free--;
+		z->free_pages--;
+		z->present_pages--;
+		spin_unlock_irqrestore(&z->lock, flags);
+		pzone_restore_page_flags(z->parent, page);
+		pzone_bad_range(z->parent, page);
+		list_add(&page->lru, &list);
+		free_pages_bulk(z->parent, 1, &list, 0);
+
+		spin_lock_irqsave(&z->lock, flags);
+	}
+
+	BUG_ON(area->nr_free != 0);
+	spin_unlock_irqrestore(&z->lock, flags);
+
+	/* pzones currently support order-0 only; sanity-check higher orders. */
+	spin_lock_irqsave(&z->lock, flags);
+	for (order = 1; order < MAX_ORDER; order++) {
+		area = &z->free_area[order];
+		BUG_ON(area->nr_free != 0);
+	}
+	spin_unlock_irqrestore(&z->lock, flags);
+}
+
+static int pzone_is_empty(struct zone *z)
+{
+	unsigned long flags;
+	int ret = 0;
+	int i;
+
+	spin_lock_irqsave(&z->lock, flags);
+	ret += z->present_pages;
+	ret += z->free_pages;
+	ret += z->free_area[0].nr_free;
+
+	/* would better use smp_call_function for scanning pcp. */
+	for (i = 0; i < NR_CPUS; i++) {
+#ifdef CONFIG_NUMA
+		if (!zone_pcp(z, i) || (zone_pcp(z, i) == &boot_pageset[i]))
+			continue;
+#endif
+		ret += zone_pcp(z, i)->pcp[0].count;
+		ret += zone_pcp(z, i)->pcp[1].count;
+	}
+	spin_unlock_irqrestore(&z->lock, flags);
+
+	spin_lock_irqsave(&z->lru_lock, flags);
+	ret += z->nr_active;
+	ret += z->nr_inactive;
+	spin_unlock_irqrestore(&z->lru_lock, flags);
+
+	return ret == 0;
+}
+
 struct zone *pzone_create(struct zone *parent, char *name, int npages)
 {
 	struct zonelist zonelist;
@@ -2953,10 +3165,85 @@ bad1:
 	return NULL;
 }
 
+#define PZONE_FLUSH_LOOP_COUNT		8
+
+/*
+ * Destroy a pseudo zone.  The caller must make sure that nothing references
+ * this pseudo zone any longer.
+ */
+void pzone_destroy(struct zone *z)
+{
+	struct zone *parent;
+	unsigned long flags;
+	unsigned long present;
+	int freed;
+	int retrycnt = 0;
+
+	parent = z->parent;
+	present = z->present_pages;
+	pzone_parent_unregister(z);
+retry:
+	/* drain pages in per-cpu pageset to free_area */
+	smp_call_function(pzone_flush_percpu, z, 0, 1);
+	pzone_flush_percpu(z);
+	
+	/* drain pages in the LRU list. */
+	freed = pzone_flush_lru(z, parent, &z->active_list, &z->nr_active,
+				retrycnt > 0);
+	spin_lock_irqsave(&z->lock, flags);
+	z->present_pages -= freed;
+	spin_unlock_irqrestore(&z->lock, flags);
+
+	freed = pzone_flush_lru(z, parent, &z->inactive_list, &z->nr_inactive,
+				retrycnt > 0);
+	spin_lock_irqsave(&z->lock, flags);
+	z->present_pages -= freed;
+	spin_unlock_irqrestore(&z->lock, flags);
+
+	pzone_flush_free_area(z);
+
+	if (!pzone_is_empty(z)) {
+		retrycnt++;
+		if (retrycnt > PZONE_FLUSH_LOOP_COUNT) {
+			BUG();
+		} else {
+			flush_workqueue(pzone_drain_wq);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(HZ);
+			goto retry;
+		}
+	}
+
+	spin_lock_irqsave(&parent->lock, flags);
+	parent->present_pages += present;
+	spin_unlock_irqrestore(&parent->lock, flags);
+
+	flush_workqueue(pzone_drain_wq);
+	pzone_table_unregister(z);
+	pzone_free_pagesets(z);
+	kfree(z->name);
+	kfree(z);
+
+	setup_per_zone_pages_min();
+	setup_per_zone_lowmem_reserve();
+}
+
 static int pzone_init(void)
 {
+	struct work_struct *wp;
 	int i;
 
+	pzone_drain_wq = create_workqueue("pzone");
+	if (!pzone_drain_wq) {
+		printk(KERN_ERR "pzone: create_workqueue failed.\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < NR_CPUS; i++) {
+		wp = &per_cpu(pzone_drain_work, i);
+		INIT_WORK(wp, pzone_drain, NULL);
+	}
+
 	for (i = 0; i < MAX_NR_PZONES; i++)
 		list_add_tail(&pzone_table[i].list, &pzone_freelist);
 
diff -urNp a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c	2006-01-30 14:33:44.000000000 +0900
+++ b/mm/vmscan.c	2006-01-30 14:34:39.000000000 +0900
@@ -591,8 +591,8 @@ keep:
  *
  * returns how many pages were moved onto *@dst.
  */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
-			     struct list_head *dst, int *scanned)
+int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+		      struct list_head *dst, int *scanned)
 {
 	int nr_taken = 0;
 	struct page *page;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2006-01-31  2:30 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-01-19  8:04 [PATCH 0/2] Pzone based CKRM memory resource controller KUROSAWA Takahiro
2006-01-19  8:04 ` [PATCH 1/2] Add the pzone KUROSAWA Takahiro
2006-01-19 18:04   ` Andy Whitcroft
2006-01-19 23:42     ` KUROSAWA Takahiro
2006-01-20  9:17       ` Andy Whitcroft
2006-01-20  7:08   ` KAMEZAWA Hiroyuki
2006-01-20  8:22     ` KUROSAWA Takahiro
2006-01-20  8:30       ` KAMEZAWA Hiroyuki
2006-01-19  8:04 ` [PATCH 2/2] Add CKRM memory resource controller using pzones KUROSAWA Takahiro
2006-01-31  2:30 ` [PATCH 0/8] Pzone based CKRM memory resource controller KUROSAWA Takahiro
2006-01-31  2:30   ` [PATCH 1/8] Add the __GFP_NOLRU flag KUROSAWA Takahiro
2006-01-31 18:18     ` [ckrm-tech] " Dave Hansen
2006-02-01  5:06       ` KUROSAWA Takahiro
2006-01-31  2:30   ` [PATCH 2/8] Keep the number of zones while zone iterator loop KUROSAWA Takahiro
2006-01-31  2:30   ` [PATCH 3/8] Add for_each_zone_in_node macro KUROSAWA Takahiro
2006-01-31  2:30   ` [PATCH 4/8] Extract zone specific routines as functions KUROSAWA Takahiro
2006-01-31  2:30   ` [PATCH 5/8] Add the pzone_create() function KUROSAWA Takahiro
2006-01-31  2:30   ` KUROSAWA Takahiro [this message]
2006-01-31  2:30   ` [PATCH 7/8] Make the number of pages in pzones resizable KUROSAWA Takahiro
2006-01-31  2:30   ` [PATCH 8/8] Add a CKRM memory resource controller using pzones KUROSAWA Takahiro
2006-02-01  2:58   ` [ckrm-tech] [PATCH 0/8] Pzone based CKRM memory resource controller chandra seetharaman
2006-02-01  5:39     ` KUROSAWA Takahiro
2006-02-01  6:16       ` Hirokazu Takahashi
2006-02-02  1:26       ` chandra seetharaman
2006-02-02  3:54         ` KUROSAWA Takahiro
2006-02-03  0:37           ` chandra seetharaman
2006-02-03  0:51             ` KUROSAWA Takahiro
2006-02-03  1:01               ` chandra seetharaman
2006-02-01  3:07   ` chandra seetharaman
2006-02-01  5:54     ` KUROSAWA Takahiro
2006-02-03  1:33     ` KUROSAWA Takahiro
2006-02-03  9:37       ` KUROSAWA Takahiro

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060131023030.7915.57560.sendpatchset@debian \
    --to=kurosawa@valinux.co.jp \
    --cc=ckrm-tech@lists.sourceforge.net \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox