[RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch)

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch)
@ 2008-10-17 10:48 Daisuke Nishimura
  2008-10-17 10:56 ` [PATCH -mm 1/5] memcg: replace res_counter Daisuke Nishimura
                   ` (5 more replies)
  0 siblings, 6 replies; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-17 10:48 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, nishimura

Hi.

I think Kamezawa-san is working on this now, but I also made
a trial patch based on Kamezawa-san's v2.

Unfortunately this patch doesn't work (I'll investigate),
but I'm posting it to promote discussion on this topic.

Major changes from v2:
- rebased on memcg-update-v7.
- add a counter to count real swap usage(# of swap entries).
- add arg "use_swap" to try_to_free_mem_cgroup_pages() and use it for sc->may_swap.


Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-17 10:48 [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) Daisuke Nishimura
@ 2008-10-17 10:56 ` Daisuke Nishimura
  2008-10-20 19:53   ` Paul Menage
  2008-10-17 10:59 ` [PATCH -mm 2/5] memcg: mem_cgroup private ID Daisuke Nishimura
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-17 10:56 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, nishimura

For the mem+swap controller, we'll use a special counter which has 2 values and
2 limits. Before doing that, replace the current res_counter with a new mem_counter.

This patch doesn't have much meaning other than as cleanup before the mem+swap
controller. The new mem_counter's counter is "unsigned long" and accounts resources
by # of pages. (I think "unsigned long" is safe on 32bit machines when we count
resources by # of pages rather than bytes.) There are no changes to the user
interface; the user interface is still in "bytes".

With "unsigned long long", we would have to be careful about reading an
intermediate value without the lock.

Changelog: v2 -> v3
 - fix trivial bugs
 - rebased on memcg-update-v7

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5b492f..e1c20d2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -17,10 +17,9 @@
  * GNU General Public License for more details.
  */
 
-#include <linux/res_counter.h>
+#include <linux/mm.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
-#include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
@@ -116,12 +115,21 @@ struct mem_cgroup_lru_info {
  * no reclaim occurs from a cgroup at it's low water mark, this is
  * a feature that will be implemented much later in the future.
  */
+struct mem_counter {
+	unsigned long	pages;
+	unsigned long	pages_limit;
+	unsigned long	max_pages;
+	unsigned long	failcnt;
+	spinlock_t	lock;
+};
+
+
 struct mem_cgroup {
 	struct cgroup_subsys_state css;
 	/*
 	 * the counter to account for memory usage
 	 */
-	struct res_counter res;
+	struct mem_counter res;
 	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
@@ -158,6 +166,14 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 	0, /* FORCE */
 };
 
+/* Private File ID for memory resource controller's interface */
+enum {
+	MEMCG_FILE_PAGE_LIMIT,
+	MEMCG_FILE_PAGE_USAGE,
+	MEMCG_FILE_PAGE_MAX_USAGE,
+	MEMCG_FILE_FAILCNT,
+};
+
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
@@ -237,6 +253,81 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
+/*
+ * counter for memory resource accounting.
+ */
+static void mem_counter_init(struct mem_cgroup *mem)
+{
+	memset(&mem->res, 0, sizeof(mem->res));
+	mem->res.pages_limit = ~0UL;
+	spin_lock_init(&mem->res.lock);
+}
+
+static int mem_counter_charge(struct mem_cgroup *mem, long num)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mem->res.lock, flags);
+	if (mem->res.pages + num > mem->res.pages_limit)
+		goto busy_out;
+	mem->res.pages += num;
+	if (mem->res.pages > mem->res.max_pages)
+		mem->res.max_pages = mem->res.pages;
+	spin_unlock_irqrestore(&mem->res.lock, flags);
+	return 0;
+busy_out:
+	mem->res.failcnt++;
+	spin_unlock_irqrestore(&mem->res.lock, flags);
+	return -EBUSY;
+}
+
+static void mem_counter_uncharge_page(struct mem_cgroup *mem, long num)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&mem->res.lock, flags);
+	mem->res.pages -= num;
+	spin_unlock_irqrestore(&mem->res.lock, flags);
+}
+
+static int mem_counter_set_pages_limit(struct mem_cgroup *mem,
+					unsigned long num)
+{
+	unsigned long flags;
+	int ret = -EBUSY;
+
+	spin_lock_irqsave(&mem->res.lock, flags);
+	if (mem->res.pages < num) {
+		mem->res.pages_limit = num;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&mem->res.lock, flags);
+	return ret;
+}
+
+static int mem_counter_check_under_pages_limit(struct mem_cgroup *mem)
+{
+	if (mem->res.pages < mem->res.pages_limit)
+		return 1;
+	return 0;
+}
+
+static void mem_counter_reset(struct mem_cgroup *mem, int member)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mem->res.lock, flags);
+	switch (member) {
+	case MEMCG_FILE_PAGE_MAX_USAGE:
+		mem->res.max_pages = 0;
+		break;
+	case MEMCG_FILE_FAILCNT:
+		mem->res.failcnt = 0;
+		break;
+	}
+	spin_unlock_irqrestore(&mem->res.lock, flags);
+}
+
+
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
@@ -368,7 +459,7 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 	 * usage is recorded in bytes. But, here, we assume the number of
 	 * physical pages can be represented by "long" on any arch.
 	 */
-	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
+	total = (long) (mem->res.pages) + 1L;
 	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 	return (int)((rss * 100L) / total);
 }
@@ -692,7 +783,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	}
 
 
-	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
+	while (unlikely(mem_counter_charge(mem, 1))) {
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
@@ -706,7 +797,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		 * Check the limit again to see if the reclaim reduced the
 		 * current usage of the cgroup before giving up
 		 */
-		if (res_counter_check_under_limit(&mem->res))
+		if (mem_counter_check_under_pages_limit(mem))
 			continue;
 
 		if (!nr_retries--) {
@@ -760,7 +851,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	 */
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		mem_counter_uncharge_page(mem, 1);
 		css_put(&mem->css);
 		return;
 	}
@@ -841,7 +932,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 
 	if (spin_trylock(&to_mz->lru_lock)) {
 		__mem_cgroup_remove_list(from_mz, pc);
-		res_counter_uncharge(&from->res, PAGE_SIZE);
+		mem_counter_uncharge_page(from, PAGE_SIZE);
 		pc->mem_cgroup = to;
 		__mem_cgroup_add_list(to_mz, pc, false);
 		ret = 0;
@@ -888,7 +979,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	css_put(&parent->css);
 	/* uncharge if move fails */
 	if (ret)
-		res_counter_uncharge(&parent->res, PAGE_SIZE);
+		mem_counter_uncharge_page(parent, 1);
 
 	return ret;
 }
@@ -1005,7 +1096,7 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	mem_counter_uncharge_page(mem, 1);
 	css_put(&mem->css);
 }
 
@@ -1042,7 +1133,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 * We must uncharge here because "reuse" can occur just after we
 	 * unlock this.
 	 */
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	mem_counter_uncharge_page(mem, 1);
 	unlock_page_cgroup(pc);
 	release_page_cgroup(pc);
 	return;
@@ -1174,7 +1265,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
 	do {
 		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
-		progress += res_counter_check_under_limit(&mem->res);
+		progress += mem_counter_check_under_pages_limit(mem);
 	} while (!progress && --retry);
 
 	css_put(&mem->css);
@@ -1189,8 +1280,12 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
 	int progress;
 	int ret = 0;
+	unsigned long new_lim = (unsigned long)(val >> PAGE_SHIFT);
 
-	while (res_counter_set_limit(&memcg->res, val)) {
+	if (val & (PAGE_SIZE-1))
+		new_lim += 1;
+
+	while (mem_counter_set_pages_limit(memcg, new_lim)) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
@@ -1273,7 +1368,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 
 	shrink = 0;
 move_account:
-	while (mem->res.usage > 0) {
+	while (mem->res.pages > 0) {
 		ret = -EBUSY;
 		if (atomic_read(&mem->css.cgroup->count) > 0)
 			goto out;
@@ -1316,7 +1411,7 @@ try_to_free:
 	}
 	/* try to free all pages in this cgroup */
 	shrink = 1;
-	while (nr_retries && mem->res.usage > 0) {
+	while (nr_retries && mem->res.pages > 0) {
 		int progress;
 		progress = try_to_free_mem_cgroup_pages(mem,
 						  GFP_HIGHUSER_MOVABLE);
@@ -1325,7 +1420,7 @@ try_to_free:
 
 	}
 	/* try move_account...there may be some *locked* pages. */
-	if (mem->res.usage)
+	if (mem->res.pages)
 		goto move_account;
 	ret = 0;
 	goto out;
@@ -1333,13 +1428,43 @@ try_to_free:
 
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
-				    cft->private);
+	unsigned long long ret;
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	switch (cft->private) {
+	case MEMCG_FILE_PAGE_LIMIT:
+		ret = (unsigned long long)mem->res.pages_limit << PAGE_SHIFT;
+		break;
+	case MEMCG_FILE_PAGE_USAGE:
+		ret = (unsigned long long)mem->res.pages << PAGE_SHIFT;
+		break;
+	case MEMCG_FILE_PAGE_MAX_USAGE:
+		ret = (unsigned long long)mem->res.max_pages << PAGE_SHIFT;
+		break;
+	case MEMCG_FILE_FAILCNT:
+		ret = (unsigned long long)mem->res.failcnt;
+		break;
+	default:
+		BUG();
+	}
+	return ret;
 }
 /*
  * The user of this function is...
  * RES_LIMIT.
  */
+static int call_memparse(const char *buf, unsigned long long *val)
+{
+	char *end;
+
+	*val = memparse((char *)buf, &end);
+	if (*end != '\0')
+		return -EINVAL;
+	*val = PAGE_ALIGN(*val);
+	return 0;
+}
+
+
 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 			    const char *buffer)
 {
@@ -1348,9 +1473,9 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 	int ret;
 
 	switch (cft->private) {
-	case RES_LIMIT:
+	case MEMCG_FILE_PAGE_LIMIT:
 		/* This function does all necessary parse...reuse it */
-		ret = res_counter_memparse_write_strategy(buffer, &val);
+		ret = call_memparse(buffer, &val);
 		if (!ret)
 			ret = mem_cgroup_resize_limit(memcg, val);
 		break;
@@ -1367,12 +1492,12 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 
 	mem = mem_cgroup_from_cont(cont);
 	switch (event) {
-	case RES_MAX_USAGE:
-		res_counter_reset_max(&mem->res);
-		break;
-	case RES_FAILCNT:
-		res_counter_reset_failcnt(&mem->res);
+	case MEMCG_FILE_PAGE_MAX_USAGE:
+	case MEMCG_FILE_FAILCNT:
+		mem_counter_reset(mem, event);
 		break;
+	default:
+		BUG();
 	}
 	return 0;
 }
@@ -1436,24 +1561,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
-		.private = RES_USAGE,
+		.private = MEMCG_FILE_PAGE_USAGE,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "max_usage_in_bytes",
-		.private = RES_MAX_USAGE,
+		.private = MEMCG_FILE_PAGE_MAX_USAGE,
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "limit_in_bytes",
-		.private = RES_LIMIT,
+		.private = MEMCG_FILE_PAGE_LIMIT,
 		.write_string = mem_cgroup_write,
 		.read_u64 = mem_cgroup_read,
 	},
 	{
 		.name = "failcnt",
-		.private = RES_FAILCNT,
+		.private = MEMCG_FILE_FAILCNT,
 		.trigger = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read,
 	},
@@ -1578,7 +1703,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 			return ERR_PTR(-ENOMEM);
 	}
 
-	res_counter_init(&mem->res);
+	mem_counter_init(mem);
 
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* [PATCH -mm 2/5] memcg: mem_cgroup private ID
  2008-10-17 10:48 [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) Daisuke Nishimura
  2008-10-17 10:56 ` [PATCH -mm 1/5] memcg: replace res_counter Daisuke Nishimura
@ 2008-10-17 10:59 ` Daisuke Nishimura
  2008-10-17 11:01 ` [PATCH -mm 3/5] memcg: mem+swap controller Kconfig Daisuke Nishimura, KAMEZAWA Hiroyuki
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-17 10:59 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, nishimura

This patch adds a private ID to each memory resource controller.
This is for mem+swap controller.

When we record memcgrp information for each swap entry, remembering a pointer
can consume 8(4) bytes per entry. This is large.

This patch limits the number of memory resource controllers to 32768 and
gives an ID to each controller. (1 bit will be used for a flag.)
This can help to save space in the future.

ID "0" is used for indicating "invalid" or "not used" ID.
ID "1" is used for root.

(*) 32768 is too small ?

Changelog: v2 -> v3
  - rebased on memcg-update-v7

Changelog:
  - new patch in v2.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e1c20d2..5ef5a5c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
+#define NR_MEMCGRP_ID			(32767)
 
 /*
  * Statistics for memory cgroup.
@@ -141,6 +142,10 @@ struct mem_cgroup {
 	 * statistics.
 	 */
 	struct mem_cgroup_stat stat;
+	/*
+	 * private ID
+	 */
+	unsigned short memcgrp_id;
 };
 static struct mem_cgroup init_mem_cgroup;
 
@@ -327,6 +332,69 @@ static void mem_counter_reset(struct mem_cgroup *mem, int member)
 	spin_unlock_irqrestore(&mem->res.lock, flags);
 }
 
+/*
+ * private ID management for memcg.
+ * set/clear bitmap is called by create/destroy and done under cgroup_mutex.
+ */
+static unsigned long *memcgrp_id_bitmap;
+static struct mem_cgroup **memcgrp_array;
+int nr_memcgrp;
+
+static int memcgrp_id_init(void)
+{
+	void *addr;
+	unsigned long bitmap_size = NR_MEMCGRP_ID/8;
+	unsigned long array_size = NR_MEMCGRP_ID * sizeof(void *);
+
+	addr = kmalloc(bitmap_size, GFP_KERNEL | __GFP_ZERO);
+	if (!addr)
+		return -ENOMEM;
+	memcgrp_array = vmalloc(array_size);
+	if (!memcgrp_array) {
+		kfree(memcgrp_array);
+		return -ENOMEM;
+	}
+	memcgrp_id_bitmap = addr;
+	/* 0 for "invalid id" */
+	set_bit(0, memcgrp_id_bitmap);
+	set_bit(1, memcgrp_id_bitmap);
+	memcgrp_array[0] = NULL;
+	memcgrp_array[1] = &init_mem_cgroup;
+	init_mem_cgroup.memcgrp_id = 1;
+	nr_memcgrp = 1;
+	return 0;
+}
+
+static unsigned int get_new_memcgrp_id(struct mem_cgroup *mem)
+{
+	int id;
+	id = find_first_zero_bit(memcgrp_id_bitmap, NR_MEMCGRP_ID);
+
+	if (id == NR_MEMCGRP_ID - 1)
+		return -ENOSPC;
+	set_bit(id, memcgrp_id_bitmap);
+	memcgrp_array[id] = mem;
+	mem->memcgrp_id = id;
+
+	return 0;
+}
+
+static void free_memcgrp_id(struct mem_cgroup *mem)
+{
+	memcgrp_array[mem->memcgrp_id] = NULL;
+	clear_bit(mem->memcgrp_id , memcgrp_id_bitmap);
+}
+
+/*
+ * please access this only while you can be sure the memcgroup exists.
+ */
+
+static struct mem_cgroup *mem_cgroup_id_lookup(unsigned short id)
+{
+	return memcgrp_array[id];
+}
+
+
 
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
@@ -1691,6 +1759,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	int node;
 
 	if (unlikely((cont->parent) == NULL)) {
+		if (memcgrp_id_init())
+			return ERR_PTR(-ENOMEM);
 		page_cgroup_init();
 		mem = &init_mem_cgroup;
 		cpu_memcgroup_callback(&memcgroup_nb,
@@ -1701,6 +1771,11 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		mem = mem_cgroup_alloc();
 		if (!mem)
 			return ERR_PTR(-ENOMEM);
+
+		if (get_new_memcgrp_id(mem)) {
+			kfree(mem);
+			return ERR_PTR(-ENOSPC);
+		}
 	}
 
 	mem_counter_init(mem);
@@ -1713,8 +1788,10 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 free_out:
 	for_each_node_state(node, N_POSSIBLE)
 		free_mem_cgroup_per_zone_info(mem, node);
-	if (cont->parent != NULL)
+	if (cont->parent != NULL) {
+		free_memcgrp_id(mem);
 		mem_cgroup_free(mem);
+	}
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -1731,6 +1808,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 	int node;
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
 
+	free_memcgrp_id(mem);
 	for_each_node_state(node, N_POSSIBLE)
 		free_mem_cgroup_per_zone_info(mem, node);
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* [PATCH -mm 3/5] memcg: mem+swap controller Kconfig
  2008-10-17 10:48 [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) Daisuke Nishimura
  2008-10-17 10:56 ` [PATCH -mm 1/5] memcg: replace res_counter Daisuke Nishimura
  2008-10-17 10:59 ` [PATCH -mm 2/5] memcg: mem_cgroup private ID Daisuke Nishimura
@ 2008-10-17 11:01 ` Daisuke Nishimura, KAMEZAWA Hiroyuki
  2008-10-17 11:04 ` [PATCH -mm 4/5] memcg: mem+swap counter Daisuke Nishimura
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 60+ messages in thread
From: Daisuke Nishimura, KAMEZAWA Hiroyuki @ 2008-10-17 11:01 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, nishimura

Add a config option for the mem+swap controller and define a helper macro.

To keep the series as a stack of readable-sized patches, this marks the config
as Broken; a later patch will remove this word.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

diff --git a/init/Kconfig b/init/Kconfig
index a404869..14c8205 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -415,6 +415,16 @@ config CGROUP_MEM_RES_CTLR
 	  This config option also selects MM_OWNER config option, which
 	  could in turn add some fork/exit overhead.
 
+config CGROUP_MEM_RES_CTLR_SWAP
+	bool "Memory Resource Controller Swap Extension (Broken)"
+	depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL
+	help
+	 Add swap management feature to memory resource controller. By this,
+	 you can control swap consumption per cgroup by limiting the total
+	 amount of memory+swap. Because this records additional information
+	 at swap-out, this consumes extra memory. If you use 32bit system or
+	 small memory system, please be careful to enable this.
+
 config CGROUP_MEMRLIMIT_CTLR
 	bool "Memory resource limit controls for cgroups"
 	depends on CGROUPS && RESOURCE_COUNTERS && MMU
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5ef5a5c..023c7bc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -41,6 +41,13 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 #define NR_MEMCGRP_ID			(32767)
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#define do_swap_account	(1)
+#else
+#define do_swap_account	(0)
+#endif
+
+
 /*
  * Statistics for memory cgroup.
  */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* [PATCH -mm 4/5] memcg: mem+swap counter
  2008-10-17 10:48 [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) Daisuke Nishimura
                   ` (2 preceding siblings ...)
  2008-10-17 11:01 ` [PATCH -mm 3/5] memcg: mem+swap controller Kconfig Daisuke Nishimura, KAMEZAWA Hiroyuki
@ 2008-10-17 11:04 ` Daisuke Nishimura
  2008-10-17 11:06 ` [PATCH -mm 5/5] memcg: mem+swap accounting Daisuke Nishimura
  2008-10-20  0:24 ` [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) KAMEZAWA Hiroyuki
  5 siblings, 0 replies; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-17 11:04 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, nishimura

Add counter for swap accounting to memory resource controller.

This adds 2 counters and 1 limit:
res.swaps, res.disk_swaps and res.memsw_limit.
res.swaps is a counter for the # of swap entries in use, and res.disk_swaps
is for the # of swap entries on disk.

These counters work as

  res.pages + res.disk_swaps < res.memsw_limit.

This means the sum of on_memory_resource and on_swap_resource is limited.
So, a swap is accounted when an anonymous page is charged. By this, the
user can avoid unexpected massive use of swap, and kswapd (the global LRU)
is not affected by the swap resource control feature when it tries add_to_swap.
...swap is considered to be already accounted as a page.

To avoid too many #ifdefs, this patch uses a "do_swap_account" macro.
If config=n, the compiler does a good job and ignores some pieces of code.

This patch doesn't include the swap_accounting infrastructure, so
CONFIG_CGROUP_MEM_RES_CTLR_SWAP is still broken.

Changelog: v2 -> v3
  - trivial fix
  - rebase on memcg-update-v7
  - removed MEM_CGROUP_CHARGE_TYPE_SWAPOUT, mem_counter_recharge_swap,
    and mem_counter_uncharge_swap. They will be defined in later patch.
  - changed counter "swaps" to "disk_swaps" and removed the I/F to read it.
  - added new counter "swaps", which means actual usage of swap,
    and an I/F to read it.
  - add I/F to read memswap_usage_in_bytes.
  - allow memsw_limit < pages_limit when pages_limit == ~0UL.
  - add arg "use_swap" to try_to_free_mem_cgroup_pages() and use it for sc->may_swap.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

diff --git a/include/linux/swap.h b/include/linux/swap.h
index e958419..be0b575 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -213,7 +213,8 @@ static inline void lru_cache_add_active_file(struct page *page)
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
-							gfp_t gfp_mask);
+							gfp_t gfp_mask,
+							int use_swap);
 extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 023c7bc..d712547 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -127,6 +127,9 @@ struct mem_counter {
 	unsigned long	pages;
 	unsigned long	pages_limit;
 	unsigned long	max_pages;
+	unsigned long	swaps;
+	unsigned long	disk_swaps;
+	unsigned long	memsw_limit;
 	unsigned long	failcnt;
 	spinlock_t	lock;
 };
@@ -183,6 +186,9 @@ enum {
 	MEMCG_FILE_PAGE_LIMIT,
 	MEMCG_FILE_PAGE_USAGE,
 	MEMCG_FILE_PAGE_MAX_USAGE,
+	MEMCG_FILE_MEMSW_LIMIT,
+	MEMCG_FILE_MEMSW_USAGE,
+	MEMCG_FILE_SWAP_USAGE,
 	MEMCG_FILE_FAILCNT,
 };
 
@@ -272,6 +278,7 @@ static void mem_counter_init(struct mem_cgroup *mem)
 {
 	memset(&mem->res, 0, sizeof(mem->res));
 	mem->res.pages_limit = ~0UL;
+	mem->res.memsw_limit = ~0UL;
 	spin_lock_init(&mem->res.lock);
 }
 
@@ -282,6 +289,10 @@ static int mem_counter_charge(struct mem_cgroup *mem, long num)
 	spin_lock_irqsave(&mem->res.lock, flags);
 	if (mem->res.pages + num > mem->res.pages_limit)
 		goto busy_out;
+	if (do_swap_account &&
+	    (mem->res.pages + mem->res.disk_swaps + num > mem->res.memsw_limit))
+		goto busy_out;
+
 	mem->res.pages += num;
 	if (mem->res.pages > mem->res.max_pages)
 		mem->res.max_pages = mem->res.pages;
@@ -297,6 +308,8 @@ static void mem_counter_uncharge_page(struct mem_cgroup *mem, long num)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&mem->res.lock, flags);
+	if (WARN_ON(mem->res.pages < num))
+		num = mem->res.pages;
 	mem->res.pages -= num;
 	spin_unlock_irqrestore(&mem->res.lock, flags);
 }
@@ -308,7 +321,9 @@ static int mem_counter_set_pages_limit(struct mem_cgroup *mem,
 	int ret = -EBUSY;
 
 	spin_lock_irqsave(&mem->res.lock, flags);
-	if (mem->res.pages < num) {
+	if (mem->res.memsw_limit < num) {
+		ret = -EINVAL;
+	} else if (mem->res.pages < num) {
 		mem->res.pages_limit = num;
 		ret = 0;
 	}
@@ -316,6 +331,23 @@ static int mem_counter_set_pages_limit(struct mem_cgroup *mem,
 	return ret;
 }
 
+static int
+mem_counter_set_memsw_limit(struct mem_cgroup *mem, unsigned long num)
+{
+	unsigned long flags;
+	int ret = -EBUSY;
+
+	spin_lock_irqsave(&mem->res.lock, flags);
+	if (mem->res.pages_limit != ~0UL && mem->res.pages_limit > num) {
+		ret = -EINVAL;
+	} else if (mem->res.disk_swaps + mem->res.pages < num) {
+		mem->res.memsw_limit = num;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&mem->res.lock, flags);
+	return ret;
+}
+
 static int mem_counter_check_under_pages_limit(struct mem_cgroup *mem)
 {
 	if (mem->res.pages < mem->res.pages_limit)
@@ -323,6 +355,15 @@ static int mem_counter_check_under_pages_limit(struct mem_cgroup *mem)
 	return 0;
 }
 
+static int mem_counter_check_under_memsw_limit(struct mem_cgroup *mem)
+{
+	if (!do_swap_account)
+		return 1;
+	if (mem->res.pages + mem->res.disk_swaps < mem->res.memsw_limit)
+		return 1;
+	return 0;
+}
+
 static void mem_counter_reset(struct mem_cgroup *mem, int member)
 {
 	unsigned long flags;
@@ -339,6 +380,16 @@ static void mem_counter_reset(struct mem_cgroup *mem, int member)
 	spin_unlock_irqrestore(&mem->res.lock, flags);
 }
 
+static int should_use_swap(struct mem_cgroup *mem)
+{
+	if (!do_swap_account)
+		return 1;
+	if (!mem_counter_check_under_pages_limit(mem) &&
+	    mem->res.pages_limit != mem->res.memsw_limit)
+		return 1;
+	return 0;
+}
+
 /*
  * private ID management for memcg.
  * set/clear bitmap is called by create/destroy and done under cgroup_mutex.
@@ -859,10 +910,18 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
 
 	while (unlikely(mem_counter_charge(mem, 1))) {
+		int progress;
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask,
+							should_use_swap(mem));
+
+		/*
+		 * When we hit memsw limit, return value of "progress"
+		 * has no meaning. (some pages may just be changed to swap)
+		 */
+		if (mem_counter_check_under_memsw_limit(mem) && progress)
 			continue;
 
 		/*
@@ -872,7 +931,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		 * Check the limit again to see if the reclaim reduced the
 		 * current usage of the cgroup before giving up
 		 */
-		if (mem_counter_check_under_pages_limit(mem))
+		if (!do_swap_account
+		   && mem_counter_check_under_pages_limit(mem))
 			continue;
 
 		if (!nr_retries--) {
@@ -1339,8 +1399,10 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	rcu_read_unlock();
 
 	do {
-		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
-		progress += mem_counter_check_under_pages_limit(mem);
+		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask,
+							should_use_swap(mem));
+		progress += mem_counter_check_under_pages_limit(mem) &&
+			mem_counter_check_under_memsw_limit(mem);
 	} while (!progress && --retry);
 
 	css_put(&mem->css);
@@ -1349,7 +1411,9 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	return 0;
 }
 
-int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
+int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
+			    unsigned long long val,
+			    bool memswap)
 {
 
 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
@@ -1360,7 +1424,14 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 	if (val & (PAGE_SIZE-1))
 		new_lim += 1;
 
-	while (mem_counter_set_pages_limit(memcg, new_lim)) {
+	do {
+		if (memswap)
+			ret = mem_counter_set_memsw_limit(memcg, new_lim);
+		else
+			ret = mem_counter_set_pages_limit(memcg, new_lim);
+
+		if (!ret || ret == -EINVAL)
+			break;
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
@@ -1369,10 +1440,12 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 			ret = -EBUSY;
 			break;
 		}
-		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
+		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
+							!memswap);
 		if (!progress)
 			retry_count--;
-	}
+	} while (1);
+
 	return ret;
 }
 
@@ -1489,7 +1562,7 @@ try_to_free:
 	while (nr_retries && mem->res.pages > 0) {
 		int progress;
 		progress = try_to_free_mem_cgroup_pages(mem,
-						  GFP_HIGHUSER_MOVABLE);
+						  GFP_HIGHUSER_MOVABLE, 1);
 		if (!progress)
 			nr_retries--;
 
@@ -1519,6 +1592,16 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 	case MEMCG_FILE_FAILCNT:
 		ret = (unsigned long long)mem->res.failcnt;
 		break;
+	case MEMCG_FILE_SWAP_USAGE:
+		ret = (unsigned long long)mem->res.swaps << PAGE_SHIFT;
+		break;
+	case MEMCG_FILE_MEMSW_LIMIT:
+		ret = (unsigned long long)mem->res.memsw_limit << PAGE_SHIFT;
+		break;
+	case MEMCG_FILE_MEMSW_USAGE:
+		ret = (unsigned long long)(mem->res.pages + mem->res.disk_swaps)
+					  << PAGE_SHIFT;
+		break;
 	default:
 		BUG();
 	}
@@ -1545,14 +1628,18 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	unsigned long long val;
+	bool memswap = false;
 	int ret;
 
 	switch (cft->private) {
+	case MEMCG_FILE_MEMSW_LIMIT:
+		memswap = true;
+		/* Fall through */
 	case MEMCG_FILE_PAGE_LIMIT:
 		/* This function does all necessary parse...reuse it */
 		ret = call_memparse(buffer, &val);
 		if (!ret)
-			ret = mem_cgroup_resize_limit(memcg, val);
+			ret = mem_cgroup_resize_limit(memcg, val, memswap);
 		break;
 	default:
 		ret = -EINVAL; /* should be BUG() ? */
@@ -1665,6 +1752,24 @@ static struct cftype mem_cgroup_files[] = {
 		.name = "stat",
 		.read_map = mem_control_stat_show,
 	},
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+	{
+		.name = "swap_in_bytes",
+		.private = MEMCG_FILE_SWAP_USAGE,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memswap_usage_in_bytes",
+		.private = MEMCG_FILE_MEMSW_USAGE,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "memswap_limit_in_bytes",
+		.private = MEMCG_FILE_MEMSW_LIMIT,
+		.write_string = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read,
+	}
+#endif
 };
 
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33e4319..4007c48 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1756,11 +1756,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
-						gfp_t gfp_mask)
+						gfp_t gfp_mask,
+						int use_swap)
 {
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
-		.may_swap = 1,
+		.may_swap = use_swap,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 		.order = 0,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* [PATCH -mm 5/5] memcg: mem+swap accounting
  2008-10-17 10:48 [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) Daisuke Nishimura
                   ` (3 preceding siblings ...)
  2008-10-17 11:04 ` [PATCH -mm 4/5] memcg: mem+swap counter Daisuke Nishimura
@ 2008-10-17 11:06 ` Daisuke Nishimura
  2008-10-20  0:24 ` [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) KAMEZAWA Hiroyuki
  5 siblings, 0 replies; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-17 11:06 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, nishimura

Add Swap accounting feature to memory resource controller.

Accounting is done in following logic.

Swap-Getting:
  - When get_swap_page() is called, swp_entry is marked as to be under
    page->page_cgroup->mem_cgroup, and increment res.swaps.
    on_disk flag of the entry is set off.

Swap-out:
  - When swap-cache is uncharged (fully unmapped), we don't uncharge it.
  - When swap-cache is deleted, we uncharge it from memory, increment
    res.disk_swaps, and turn on on_disk flag.

Swap-in:
  - When add_to_swapcache() is called, we do nothing.
  - When swap is mapped, we charge to memory, decrement res.disk_swaps,
    and turn off on_disk flag.

SwapCache-Deleting:
  - If the page doesn't have page_cgroup, nothing to do.
  - If the page is still mapped or on radix-tree, nothing to do.
    (This can happen at swapin.)
  - Decrement res.pages, increment res.disk_swaps, and turn on on_disk flag.

Swap-Freeing:
  - Decrement res.swaps, and if on_disk flag is set, decrement res.disk_swaps.

Almost all operations are done against SwapCache, which is Locked.

This patch uses an array to remember the owner of swp_entry. Considering
x86-32, we should avoid using too much NORMAL memory and vmalloc() area.
This patch uses HIGHMEM to record information under kmap_atomic(KM_USER0).
And information is recorded in 2 bytes per 1 swap page.
(memory controller's id is defined as smaller than unsigned short)

Changelog: (v2) -> (v3)
 - count real usage of swaps.
 - uncharge all swaps bound to the group on rmdir.
 - rename member of swap_cgroup.
 - rename swap_cgroup_record_info to __swap_cgroup_info, and define helper
   functions to call it.
 - rename swap_cgroup_account to __swap_cgroup_disk_swap, and define helper
   functions to call it.

Changelog: (preview) -> (v2)
 - removed radix-tree. just use array.
 - removed linked-list.
 - use memcgroup_id rather than pointer.
 - added force_empty (temporary) support.
   This should be reworked in future. (But for now, this works well for us.)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

diff --git a/include/linux/swap.h b/include/linux/swap.h
index be0b575..8205044 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -299,7 +299,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 /* linux/mm/swapfile.c */
 extern long total_swap_pages;
 extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t get_swap_page(void);
+extern swp_entry_t get_swap_page(struct page *);
 extern swp_entry_t get_swap_page_of_type(int);
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
@@ -336,6 +336,44 @@ static inline void disable_swap_token(void)
 	put_swap_token(swap_token_mm);
 }
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+extern int swap_cgroup_swapon(int type, unsigned long max_pages);
+extern void swap_cgroup_swapoff(int type);
+extern void swap_cgroup_delete_swap(swp_entry_t entry);
+extern int swap_cgroup_prepare(swp_entry_t ent);
+extern void swap_cgroup_record_info(struct page *, swp_entry_t ent);
+extern void swap_cgroup_delete_swapcache(struct page *page, swp_entry_t entry);
+
+#else
+static inline int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+	return 0;
+}
+static inline void swap_cgroup_swapoff(int type)
+{
+	return;
+}
+static inline void swap_cgroup_delete_swap(swp_entry_t entry)
+{
+	return;
+}
+static inline int swap_cgroup_prepare(swp_entry_t ent)
+{
+	return 0;
+}
+static inline
+void swap_cgroup_record_info(struct page *page, swp_entry_t ent)
+{
+	return;
+}
+static inline
+void swap_cgroup_delete_swapcache(struct page *page, swp_entry_t entry)
+{
+	return;
+}
+#endif
+
+
 #else /* CONFIG_SWAP */
 
 #define total_swap_pages			0
@@ -406,7 +444,7 @@ static inline int remove_exclusive_swap_page_ref(struct page *page)
 	return 0;
 }
 
-static inline swp_entry_t get_swap_page(void)
+static inline swp_entry_t get_swap_page(struct page *page)
 {
 	swp_entry_t entry;
 	entry.val = 0;
diff --git a/init/Kconfig b/init/Kconfig
index 14c8205..4460f46 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -416,7 +416,7 @@ config CGROUP_MEM_RES_CTLR
 	  could in turn add some fork/exit overhead.
 
 config CGROUP_MEM_RES_CTLR_SWAP
-	bool "Memory Resource Controller Swap Extension (Broken)"
+	bool "Memory Resource Controller Swap Extension (EXPERIMENTAL)"
 	depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL
 	help
 	 Add swap management feature to memory resource controller. By this,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d712547..e49364c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -34,6 +34,11 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/swap.h>
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#endif
 
 #include <asm/uaccess.h>
 
@@ -42,9 +47,29 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define NR_MEMCGRP_ID			(32767)
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+
 #define do_swap_account	(1)
+
+static void swap_cgroup_uncharge_disk_swap(struct page *page);
+static void swap_cgroup_charge_disk_swap(swp_entry_t entry);
+static void swap_cgroup_clean_account(struct mem_cgroup *mem);
+
 #else
+
 #define do_swap_account	(0)
+
+static void swap_cgroup_charge_disk_swap(swp_entry_t entry)
+{
+}
+
+static void
+swap_cgroup_uncharge_disk_swap(struct page *page)
+{
+}
+
+static void swap_cgroup_clean_account(struct mem_cgroup *mem)
+{
+}
 #endif
 
 
@@ -163,6 +188,7 @@ enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
 	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
+	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
 	NR_CHARGE_TYPE,
 };
@@ -178,6 +204,7 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
 	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
 	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+	0, /* MEM_CGROUP_CHARGE_TYPE_SWAPOUT */
 	0, /* FORCE */
 };
 
@@ -314,6 +341,50 @@ static void mem_counter_uncharge_page(struct mem_cgroup *mem, long num)
 	spin_unlock_irqrestore(&mem->res.lock, flags);
 }
 
+static void mem_counter_charge_swap(struct mem_cgroup *mem)
+{
+	unsigned long flags;
+	if (do_swap_account) {
+		spin_lock_irqsave(&mem->res.lock, flags);
+		mem->res.swaps += 1;
+		spin_unlock_irqrestore(&mem->res.lock, flags);
+	}
+}
+
+static void mem_counter_uncharge_swap(struct mem_cgroup *mem)
+{
+	unsigned long flags;
+	if (do_swap_account) {
+		spin_lock_irqsave(&mem->res.lock, flags);
+		if (!WARN_ON(mem->res.swaps < 1)) /* warn on underflow */
+			mem->res.swaps -= 1;
+		spin_unlock_irqrestore(&mem->res.lock, flags);
+	}
+}
+
+static void mem_counter_charge_disk_swap(struct mem_cgroup *mem)
+{
+	unsigned long flags;
+	if (do_swap_account) {
+		spin_lock_irqsave(&mem->res.lock, flags);
+		/* res.pages will be decremented later if needed */
+		mem->res.disk_swaps += 1;
+		spin_unlock_irqrestore(&mem->res.lock, flags);
+	}
+}
+
+static void mem_counter_uncharge_disk_swap(struct mem_cgroup *mem)
+{
+	unsigned long flags;
+	if (do_swap_account) {
+		spin_lock_irqsave(&mem->res.lock, flags);
+		/* res.pages has already been incremented if needed */
+		if (!WARN_ON(mem->res.disk_swaps < 1)) /* warn on underflow */
+			mem->res.disk_swaps -= 1;
+		spin_unlock_irqrestore(&mem->res.lock, flags);
+	}
+}
+
 static int mem_counter_set_pages_limit(struct mem_cgroup *mem,
 					unsigned long num)
 {
@@ -1019,6 +1090,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	pc->flags = pcg_default_flags[ctype];
 	unlock_page_cgroup(pc);
 
+	/* We did swap-in, uncharge disk_swap. */
+	if (do_swap_account && PageSwapCache(pc->page))
+		swap_cgroup_uncharge_disk_swap(pc->page);
 	set_page_cgroup_lru(pc);
 	css_put(&mem->css);
 }
@@ -1240,7 +1314,8 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
  * uncharge if !page_mapped(page)
  */
 static void
-__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
+__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
+				swp_entry_t entry)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem;
@@ -1256,14 +1331,20 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		return;
 
 	lock_page_cgroup(pc);
-	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
-	     || !PageCgroupUsed(pc)) {
+	if (!PageCgroupUsed(pc)
+	    || PageSwapCache(page)
+	    || ((ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) &&
+		(page_mapped(page) || (page->mapping && !PageAnon(page))))
+		/* This happens at swapin */
+	    || (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))) {
 		/* This happens at race in zap_pte_range() and do_swap_page()*/
 		unlock_page_cgroup(pc);
 		return;
 	}
 	ClearPageCgroupUsed(pc);
 	mem = pc->mem_cgroup;
+	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		swap_cgroup_charge_disk_swap(entry);
 	/*
 	 * We must uncharge here because "reuse" can occur just after we
 	 * unlock this.
@@ -1281,14 +1362,16 @@ void mem_cgroup_uncharge_page(struct page *page)
 		return;
 	if (page->mapping && !PageAnon(page))
 		return;
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED,
+					(swp_entry_t){0});
 }
 
 void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 	VM_BUG_ON(page_mapped(page));
 	VM_BUG_ON(page->mapping);
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
+	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE,
+					(swp_entry_t){0});
 }
 
 /*
@@ -1347,9 +1430,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	else
 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 
-	/* unused page is not on radix-tree now. */
-	if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED)
-		__mem_cgroup_uncharge_common(unused, ctype);
+	if (unused)
+		__mem_cgroup_uncharge_common(unused, ctype, (swp_entry_t){0});
 
 	pc = lookup_page_cgroup(target);
 	/*
@@ -1912,6 +1994,7 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
 	mem_cgroup_force_empty(mem);
+	swap_cgroup_clean_account(mem);
 }
 
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
@@ -1970,3 +2053,304 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.attach = mem_cgroup_move_task,
 	.early_init = 0,
 };
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/*
+ * swap accounting infrastructure.
+ */
+DEFINE_MUTEX(swap_cgroup_mutex);
+spinlock_t swap_cgroup_lock[MAX_SWAPFILES];
+struct page **swap_cgroup_map[MAX_SWAPFILES];
+unsigned long swap_cgroup_pages[MAX_SWAPFILES];
+
+
+/* This definition is based on NR_MEMCGRP_ID (32767): an id fits in 15 bits */
+struct swap_cgroup {
+	unsigned short memcgrp_id:15;
+	unsigned short on_disk:1;
+};
+#define ENTS_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))
+
+/*
+ * Called from get_swap_page(); allocates the map page covering @ent.
+ */
+int swap_cgroup_prepare(swp_entry_t ent)
+{
+	struct page *page;
+	unsigned long array_index = swp_offset(ent) / ENTS_PER_PAGE;
+	int type = swp_type(ent);
+	unsigned long flags;
+
+	/* nothing to do if this swap is unaccounted or the page exists */
+	if (!swap_cgroup_map[type] || swap_cgroup_map[type][array_index])
+		return 0;
+	page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+	spin_lock_irqsave(&swap_cgroup_lock[type], flags);
+	if (swap_cgroup_map[type][array_index] == NULL) {
+		swap_cgroup_map[type][array_index] = page;
+		page = NULL;
+	}
+	spin_unlock_irqrestore(&swap_cgroup_lock[type], flags);
+	if (page)	/* lost the race: another caller installed a page */
+		__free_page(page);
+	return 0;
+}
+
+/**
+ * __swap_cgroup_info - bind a swap entry to, or unbind it from, a mem_cgroup
+ * @page ..... a page which is in some mem_cgroup (unused when @del is true).
+ * @entry .... swp_entry of the page. (or old swp_entry of the page)
+ * @del ...... if false add entry, if true delete entry.
+ */
+static void
+__swap_cgroup_info(struct page *page, swp_entry_t entry, bool del)
+{
+	unsigned long flags;
+	int type = swp_type(entry);
+	unsigned long offset = swp_offset(entry);
+	unsigned long array_index = offset/ENTS_PER_PAGE;
+	unsigned long index = offset & (ENTS_PER_PAGE - 1);
+	struct page *mappage;
+	struct swap_cgroup *map;
+	struct page_cgroup *pc;
+	struct mem_cgroup *mem = NULL;
+
+	if (!del) {
+		VM_BUG_ON(!page);
+
+		pc = lookup_page_cgroup(page);
+		lock_page_cgroup(pc);
+		if (PageCgroupUsed(pc)) {
+			mem = pc->mem_cgroup;
+			css_get(&mem->css);
+		}
+		unlock_page_cgroup(pc);
+
+		if (!mem)
+			goto out;
+	}
+
+	if (!swap_cgroup_map[type])
+		goto put;	/* unaccounted swap: still drop the css ref */
+
+	mappage = swap_cgroup_map[type][array_index];
+	VM_BUG_ON(!mappage);
+
+	local_irq_save(flags);
+	map = kmap_atomic(mappage, KM_USER0);
+	if (!del) {
+		/* leave the entry alone if already bound to some group */
+		if (!map[index].memcgrp_id) {
+			map[index].memcgrp_id = mem->memcgrp_id;
+			map[index].on_disk = 0;
+			mem_counter_charge_swap(mem);
+		}
+	} else {
+		mem = mem_cgroup_id_lookup(map[index].memcgrp_id);
+		if (mem) {
+			mem_counter_uncharge_swap(mem);
+			if (map[index].on_disk)
+				mem_counter_uncharge_disk_swap(mem);
+			map[index].memcgrp_id = 0;
+			map[index].on_disk = 0;
+		}
+	}
+	/* always unmap, and pass the mapped address rather than the page */
+	kunmap_atomic(map, KM_USER0);
+	local_irq_restore(flags);
+put:
+	if (!del)
+		css_put(&mem->css);
+out:
+	return;
+}
+
+/*
+ * Called from get_swap_page().
+ */
+void swap_cgroup_record_info(struct page *page, swp_entry_t entry)
+{
+	__swap_cgroup_info(page, entry, false);
+	return;
+}
+
+static void swap_cgroup_delete_info(swp_entry_t entry)
+{
+	__swap_cgroup_info(NULL, entry, true);
+	return;
+}
+
+/*
+ * called from swap_entry_free().
+ */
+void swap_cgroup_delete_swap(swp_entry_t entry)
+{
+	swap_cgroup_delete_info(entry);
+	return;
+}
+
+
+/*
+ * Set (@set true) or clear (@set false) the on_disk flag of @entry's
+ * swap_cgroup record and increment/decrement its group's disk_swaps.
+ */
+static void __swap_cgroup_disk_swap(swp_entry_t entry, bool set)
+{
+	unsigned long flags;
+	int type = swp_type(entry);
+	unsigned long offset = swp_offset(entry);
+	unsigned long array_index = offset/ENTS_PER_PAGE;
+	unsigned long index = offset & (ENTS_PER_PAGE - 1);
+	struct page *mappage;
+	struct swap_cgroup *map;
+	struct mem_cgroup *mem;
+
+	if (!swap_cgroup_map[type])
+		return;
+
+	mappage = swap_cgroup_map[type][array_index];
+	VM_BUG_ON(!mappage);
+
+	local_irq_save(flags);
+	map = kmap_atomic(mappage, KM_USER0);
+	mem = mem_cgroup_id_lookup(map[index].memcgrp_id);
+	if (mem) {	/* only entries whose group was found are accounted */
+		if (set && map[index].on_disk == 0) {
+			map[index].on_disk = 1;
+			mem_counter_charge_disk_swap(mem);
+		} else if (!set && map[index].on_disk == 1) {
+			mem_counter_uncharge_disk_swap(mem);
+			map[index].on_disk = 0;
+		}
+	}
+	kunmap_atomic(map, KM_USER0);	/* takes the mapped address */
+	local_irq_restore(flags);
+
+	return;
+}
+
+static void swap_cgroup_uncharge_disk_swap(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page) };
+
+	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(!PageSwapCache(page));
+
+	__swap_cgroup_disk_swap(entry, false);
+}
+
+static void
+swap_cgroup_charge_disk_swap(swp_entry_t entry)
+{
+	__swap_cgroup_disk_swap(entry, true);
+}
+
+/*
+ * Called from delete_from_swap_cache() then, page is Locked! and
+ * swp_entry is still in use.
+ */
+void swap_cgroup_delete_swapcache(struct page *page, swp_entry_t entry)
+{
+	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT,
+					entry);
+	return;
+}
+
+
+/*
+ * Forget all accounts under swap_cgroup of memcg: unbind every swap entry
+ * recorded for @memcg. Called from destroying context.
+ */
+static void swap_cgroup_clean_account(struct mem_cgroup *memcg)
+{
+	int type, index;
+	unsigned long array_index, flags;
+	struct page *mappage;
+	struct swap_cgroup *map;
+	bool found = true;
+
+	/* Rescan until no swaps remain, or a pass finds no entry to drop. */
+	while (memcg->res.swaps && found) {
+		found = false;
+		mutex_lock(&swap_cgroup_mutex);
+		for (type = 0; type < MAX_SWAPFILES; type++) {
+			if (swap_cgroup_pages[type] == 0)
+				continue;
+			for (array_index = 0;
+			     array_index < swap_cgroup_pages[type];
+			     array_index++) {
+				mappage = swap_cgroup_map[type][array_index];
+				if (!mappage)
+					continue;
+				local_irq_save(flags);
+				map = kmap_atomic(mappage, KM_USER0);
+				for (index = 0; index < ENTS_PER_PAGE;
+				     index++) {
+					if (map[index].memcgrp_id
+					    == memcg->memcgrp_id) {
+						mem_counter_uncharge_swap(memcg);
+						map[index].memcgrp_id = 0;
+						found = true;
+					}
+				}
+				kunmap_atomic(map, KM_USER0);
+				local_irq_restore(flags);
+			}
+			mutex_unlock(&swap_cgroup_mutex);
+			cond_resched();
+			mutex_lock(&swap_cgroup_mutex);
+			if (!memcg->res.swaps)
+				break;
+		}
+		mutex_unlock(&swap_cgroup_mutex);
+	}
+}
+
+
+/*
+ * called from swapon().
+ */
+int swap_cgroup_swapon(int type, unsigned long max_pages)
+{
+	void *array;
+	int array_size;
+
+	VM_BUG_ON(swap_cgroup_map[type]);
+
+	array_size = ((max_pages/ENTS_PER_PAGE) + 1) * sizeof(void *);
+
+	array = vmalloc(array_size);
+	if (!array) {
+		printk("swap %d will not be accounted\n", type);
+		return -ENOMEM;
+	}
+	memset(array, 0, array_size);
+	mutex_lock(&swap_cgroup_mutex);
+	swap_cgroup_pages[type] = (max_pages/ENTS_PER_PAGE + 1);
+	swap_cgroup_map[type] = array;
+	mutex_unlock(&swap_cgroup_mutex);
+	spin_lock_init(&swap_cgroup_lock[type]);
+	return 0;
+}
+
+/*
+ * Called from swapoff(): free the per-type map under swap_cgroup_mutex.
+ */
+void swap_cgroup_swapoff(int type)
+{
+	unsigned long i;
+
+	mutex_lock(&swap_cgroup_mutex);
+	for (i = 0; i < swap_cgroup_pages[type]; i++) {
+		if (swap_cgroup_map[type][i])
+			__free_page(swap_cgroup_map[type][i]);
+	}
+	swap_cgroup_pages[type] = 0;
+	vfree(swap_cgroup_map[type]);
+	swap_cgroup_map[type] = NULL;
+	mutex_unlock(&swap_cgroup_mutex);
+}
+
+#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 72b5f03..686a2b4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1025,7 +1025,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	 * want to check if there's a redundant swappage to be discarded.
 	 */
 	if (wbc->for_reclaim)
-		swap = get_swap_page();
+		swap = get_swap_page(page);
 	else
 		swap.val = 0;
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3353c90..5515400 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -108,6 +108,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
  */
 void __delete_from_swap_cache(struct page *page)
 {
+	swp_entry_t entry = { .val = page_private(page) };
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!PageSwapCache(page));
 	BUG_ON(PageWriteback(page));
@@ -117,6 +119,7 @@ void __delete_from_swap_cache(struct page *page)
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	total_swapcache_pages--;
+	swap_cgroup_delete_swapcache(page, entry);
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	INC_CACHE_INFO(del_total);
 }
@@ -138,7 +141,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
 	BUG_ON(!PageUptodate(page));
 
 	for (;;) {
-		entry = get_swap_page();
+		entry = get_swap_page(page);
 		if (!entry.val)
 			return 0;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index aa68bac..7e1be45 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -184,7 +184,9 @@ no_page:
 	return 0;
 }
 
-swp_entry_t get_swap_page(void)
+/* get_swap_page() calls this */
+static int swap_entry_free(struct swap_info_struct *, swp_entry_t);
+swp_entry_t get_swap_page(struct page *page)
 {
 	struct swap_info_struct *si;
 	pgoff_t offset;
@@ -213,7 +215,19 @@ swp_entry_t get_swap_page(void)
 		swap_list.next = next;
 		offset = scan_swap_map(si);
 		if (offset) {
+			swp_entry_t entry = swp_entry(type, offset);
+
 			spin_unlock(&swap_lock);
+			/*
+			 * swap_cgroup_prepare tries to allocate memory,
+			 * so should be called without holding swap_lock.
+			 */
+			if (swap_cgroup_prepare(entry)) {
+				spin_lock(&swap_lock);
+				swap_entry_free(si, entry);
+				goto noswap;
+			}
+			swap_cgroup_record_info(page, entry);
 			return swp_entry(type, offset);
 		}
 		next = swap_list.next;
@@ -281,8 +295,9 @@ out:
 	return NULL;
 }	
 
-static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
+static int swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
 {
+	unsigned long offset = swp_offset(entry);
 	int count = p->swap_map[offset];
 
 	if (count < SWAP_MAP_MAX) {
@@ -297,6 +312,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			swap_cgroup_delete_swap(entry);
 		}
 	}
 	return count;
@@ -312,7 +328,7 @@ void swap_free(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		swap_entry_free(p, swp_offset(entry));
+		swap_entry_free(p, entry);
 		spin_unlock(&swap_lock);
 	}
 }
@@ -431,7 +447,7 @@ void free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+		if (swap_entry_free(p, entry) == 1) {
 			page = find_get_page(&swapper_space, entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
@@ -1356,6 +1372,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+	swap_cgroup_swapoff(type);
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1682,6 +1699,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 				1 /* header page */;
 		if (error)
 			goto bad_swap;
+
+		if (swap_cgroup_swapon(type, maxpages)) {
+			printk("We don't enable swap accounting because of"
+				"memory shortage\n");
+		}
 	}
 
 	if (nr_good_pages) {

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch)
  2008-10-17 10:48 [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) Daisuke Nishimura
                   ` (4 preceding siblings ...)
  2008-10-17 11:06 ` [PATCH -mm 5/5] memcg: mem+swap accounting Daisuke Nishimura
@ 2008-10-20  0:24 ` KAMEZAWA Hiroyuki
  2008-10-20  2:53   ` Daisuke Nishimura
  5 siblings, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-20  0:24 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: linux-mm, balbir

On Fri, 17 Oct 2008 19:48:04 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> Hi.
> 
> I think Kamezawa-san is working on this now, I also made
> a trial patch based on Kamezawa-san's v2.
> 
yes, I'm now rewriting. I'm now considering whether we can implement easier
protocol or not. But your patch's direction is not far from mine.

> Unfortunately this patch doesn't work(I'll investigate),
> but I post it to promote discussion on this topic.
> 
What kind of problems ? accounting is not correct ?


> Major changes from v2:
> - rebased on memcg-update-v7.
> - add a counter to count real swap usage(# of swap entries).
> - add arg "use_swap" to try_to_mem_cgroup_pages() and use it sc->may_swap.
> 
> 
> Thanks,
> Daisuke Nishimura.
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch)
  2008-10-20  0:24 ` [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) KAMEZAWA Hiroyuki
@ 2008-10-20  2:53   ` Daisuke Nishimura
  0 siblings, 0 replies; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-20  2:53 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-mm, balbir, d-nishimura, Daisuke Nishimura

On Mon, 20 Oct 2008 09:24:09 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Fri, 17 Oct 2008 19:48:04 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > Hi.
> > 
> > I think Kamezawa-san is working on this now, I also made
> > a trial patch based on Kamezawa-san's v2.
> > 
> yes, I'm now rewriting. I'm now considering whether we can implement easier
> protocol or not. But your patch's direction is not far from mine.
> 
> > Unfortunately this patch doesn't work(I'll investigate),
> > but I post it to promote discussion on this topic.
> > 
> What kind of problems ? accounting is not correct ?
> 
I see "scheduling while atomic" bug(or system hangs) when
trying to swap out some pages.

I'm afraid I take a day off today, I don't have enough
log or information.


Sorry,
Daisuke Nishimura.

> 
> > Major changes from v2:
> > - rebased on memcg-update-v7.
> > - add a counter to count real swap usage(# of swap entries).
> > - add arg "use_swap" to try_to_mem_cgroup_pages() and use it sc->may_swap.
> > 
> > 
> > Thanks,
> > Daisuke Nishimura.
> > 
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-17 10:56 ` [PATCH -mm 1/5] memcg: replace res_counter Daisuke Nishimura
@ 2008-10-20 19:53   ` Paul Menage
  2008-10-21  1:14     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 60+ messages in thread
From: Paul Menage @ 2008-10-20 19:53 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: KAMEZAWA Hiroyuki, linux-mm, balbir

Can't we do this in a more generic way, rather than duplicating a lot
of functionality from res_counter?

You're trying to track:

- mem usage
- mem limit
- swap usage
- swap+mem usage
- swap+mem limit

And ensuring that:

- mem usage < mem limit
- swap+mem usage < swap+mem limit

Could we somehow represent this as a pair of resource counters, one
for mem and one for swap+mem that are linked together?

Maybe have an "aggregate" pointer in a res_counter that points to
another res_counter that sums some number of counters; both the mem
and the swap res_counter objects for a cgroup would point to the
mem+swap res_counter for their aggregate. Adjusting the usage of a
counter would also adjust its aggregate (or fail if adjusting the
aggregate failed).

You could potentially use the same mechanism for aggregation across a
parent/child tree as for aggregation across different resources (mem +
swap).

The upside would be that we wouldn't need special res_counter code for
the memory controller, and any other resource controller that wanted
to do aggregation would get it for free.

The downside would be that we'd have to take two locks rather than one
(one for the main counter and one for the aggregate counter) but I
don't think that would have to be a performance hit - since these
locks would tend to be taken together anyway, we can do a
spin_lock_prefetch() on the aggregate lock before we spin on the main
lock, and the aggregate lock should be in cache by the time we get the
main lock (it's most likely that either both were already in cache, or
neither were).

Paul

On Fri, Oct 17, 2008 at 3:56 AM, Daisuke Nishimura
<nishimura@mxp.nes.nec.co.jp> wrote:
> For mem+swap controller, we'll use special counter which has 2 values and
> 2 limit. Before doing that, replace current res_counter with new mem_counter.
>
> This patch doen't have much meaning other than for clean up before mem+swap
> controller. New mem_counter's counter is "unsigned long" and account resource by
> # of pages. (I think "unsigned long" is safe under 32bit machines when we count
> resource by # of pages rather than bytes.) No changes in user interface.
> User interface is in "bytes".
>
> Using "unsigned long long", we have to be nervous to read to temporal value
> without lock.
>
> Changelog: v2 -> v3
>  - fix trivial bugs
>  - rebased on memcg-update-v7
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d5b492f..e1c20d2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -17,10 +17,9 @@
>  * GNU General Public License for more details.
>  */
>
> -#include <linux/res_counter.h>
> +#include <linux/mm.h>
>  #include <linux/memcontrol.h>
>  #include <linux/cgroup.h>
> -#include <linux/mm.h>
>  #include <linux/smp.h>
>  #include <linux/page-flags.h>
>  #include <linux/backing-dev.h>
> @@ -116,12 +115,21 @@ struct mem_cgroup_lru_info {
>  * no reclaim occurs from a cgroup at it's low water mark, this is
>  * a feature that will be implemented much later in the future.
>  */
> +struct mem_counter {
> +       unsigned long   pages;
> +       unsigned long   pages_limit;
> +       unsigned long   max_pages;
> +       unsigned long   failcnt;
> +       spinlock_t      lock;
> +};
> +
> +
>  struct mem_cgroup {
>        struct cgroup_subsys_state css;
>        /*
>         * the counter to account for memory usage
>         */
> -       struct res_counter res;
> +       struct mem_counter res;
>        /*
>         * Per cgroup active and inactive list, similar to the
>         * per zone LRU lists.
> @@ -158,6 +166,14 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
>        0, /* FORCE */
>  };
>
> +/* Private File ID for memory resource controller's interface */
> +enum {
> +       MEMCG_FILE_PAGE_LIMIT,
> +       MEMCG_FILE_PAGE_USAGE,
> +       MEMCG_FILE_PAGE_MAX_USAGE,
> +       MEMCG_FILE_FAILCNT,
> +};
> +
>  /*
>  * Always modified under lru lock. Then, not necessary to preempt_disable()
>  */
> @@ -237,6 +253,81 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
>                                struct mem_cgroup, css);
>  }
>
> +/*
> + * counter for memory resource accounting.
> + */
> +static void mem_counter_init(struct mem_cgroup *mem)
> +{
> +       memset(&mem->res, 0, sizeof(mem->res));
> +       mem->res.pages_limit = ~0UL;
> +       spin_lock_init(&mem->res.lock);
> +}
> +
> +static int mem_counter_charge(struct mem_cgroup *mem, long num)
> +{
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&mem->res.lock, flags);
> +       if (mem->res.pages + num > mem->res.pages_limit)
> +               goto busy_out;
> +       mem->res.pages += num;
> +       if (mem->res.pages > mem->res.max_pages)
> +               mem->res.max_pages = mem->res.pages;
> +       spin_unlock_irqrestore(&mem->res.lock, flags);
> +       return 0;
> +busy_out:
> +       mem->res.failcnt++;
> +       spin_unlock_irqrestore(&mem->res.lock, flags);
> +       return -EBUSY;
> +}
> +
> +static void mem_counter_uncharge_page(struct mem_cgroup *mem, long num)
> +{
> +       unsigned long flags;
> +       spin_lock_irqsave(&mem->res.lock, flags);
> +       mem->res.pages -= num;
> +       spin_unlock_irqrestore(&mem->res.lock, flags);
> +}
> +
> +static int mem_counter_set_pages_limit(struct mem_cgroup *mem,
> +                                       unsigned long num)
> +{
> +       unsigned long flags;
> +       int ret = -EBUSY;
> +
> +       spin_lock_irqsave(&mem->res.lock, flags);
> +       if (mem->res.pages < num) {
> +               mem->res.pages_limit = num;
> +               ret = 0;
> +       }
> +       spin_unlock_irqrestore(&mem->res.lock, flags);
> +       return ret;
> +}
> +
> +static int mem_counter_check_under_pages_limit(struct mem_cgroup *mem)
> +{
> +       if (mem->res.pages < mem->res.pages_limit)
> +               return 1;
> +       return 0;
> +}
> +
> +static void mem_counter_reset(struct mem_cgroup *mem, int member)
> +{
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&mem->res.lock, flags);
> +       switch (member) {
> +       case MEMCG_FILE_PAGE_MAX_USAGE:
> +               mem->res.max_pages = 0;
> +               break;
> +       case MEMCG_FILE_FAILCNT:
> +               mem->res.failcnt = 0;
> +               break;
> +       }
> +       spin_unlock_irqrestore(&mem->res.lock, flags);
> +}
> +
> +
>  static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
>                        struct page_cgroup *pc)
>  {
> @@ -368,7 +459,7 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
>         * usage is recorded in bytes. But, here, we assume the number of
>         * physical pages can be represented by "long" on any arch.
>         */
> -       total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
> +       total = (long) (mem->res.pages) + 1L;
>        rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
>        return (int)((rss * 100L) / total);
>  }
> @@ -692,7 +783,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>        }
>
>
> -       while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
> +       while (unlikely(mem_counter_charge(mem, 1))) {
>                if (!(gfp_mask & __GFP_WAIT))
>                        goto nomem;
>
> @@ -706,7 +797,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
>                 * Check the limit again to see if the reclaim reduced the
>                 * current usage of the cgroup before giving up
>                 */
> -               if (res_counter_check_under_limit(&mem->res))
> +               if (mem_counter_check_under_pages_limit(mem))
>                        continue;
>
>                if (!nr_retries--) {
> @@ -760,7 +851,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
>         */
>        if (unlikely(PageCgroupUsed(pc))) {
>                unlock_page_cgroup(pc);
> -               res_counter_uncharge(&mem->res, PAGE_SIZE);
> +               mem_counter_uncharge_page(mem, 1);
>                css_put(&mem->css);
>                return;
>        }
> @@ -841,7 +932,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
>
>        if (spin_trylock(&to_mz->lru_lock)) {
>                __mem_cgroup_remove_list(from_mz, pc);
> -               res_counter_uncharge(&from->res, PAGE_SIZE);
> +               mem_counter_uncharge_page(from, PAGE_SIZE);
>                pc->mem_cgroup = to;
>                __mem_cgroup_add_list(to_mz, pc, false);
>                ret = 0;
> @@ -888,7 +979,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
>        css_put(&parent->css);
>        /* uncharge if move fails */
>        if (ret)
> -               res_counter_uncharge(&parent->res, PAGE_SIZE);
> +               mem_counter_uncharge_page(parent, 1);
>
>        return ret;
>  }
> @@ -1005,7 +1096,7 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
>                return;
>        if (!mem)
>                return;
> -       res_counter_uncharge(&mem->res, PAGE_SIZE);
> +       mem_counter_uncharge_page(mem, 1);
>        css_put(&mem->css);
>  }
>
> @@ -1042,7 +1133,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>         * We must uncharge here because "reuse" can occur just after we
>         * unlock this.
>         */
> -       res_counter_uncharge(&mem->res, PAGE_SIZE);
> +       mem_counter_uncharge_page(mem, 1);
>        unlock_page_cgroup(pc);
>        release_page_cgroup(pc);
>        return;
> @@ -1174,7 +1265,7 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
>
>        do {
>                progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
> -               progress += res_counter_check_under_limit(&mem->res);
> +               progress += mem_counter_check_under_pages_limit(mem);
>        } while (!progress && --retry);
>
>        css_put(&mem->css);
> @@ -1189,8 +1280,12 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
>        int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
>        int progress;
>        int ret = 0;
> +       unsigned long new_lim = (unsigned long)(val >> PAGE_SHIFT);
>
> -       while (res_counter_set_limit(&memcg->res, val)) {
> +       if (val & (PAGE_SIZE-1))
> +               new_lim += 1;
> +
> +       while (mem_counter_set_pages_limit(memcg, new_lim)) {
>                if (signal_pending(current)) {
>                        ret = -EINTR;
>                        break;
> @@ -1273,7 +1368,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
>
>        shrink = 0;
>  move_account:
> -       while (mem->res.usage > 0) {
> +       while (mem->res.pages > 0) {
>                ret = -EBUSY;
>                if (atomic_read(&mem->css.cgroup->count) > 0)
>                        goto out;
> @@ -1316,7 +1411,7 @@ try_to_free:
>        }
>        /* try to free all pages in this cgroup */
>        shrink = 1;
> -       while (nr_retries && mem->res.usage > 0) {
> +       while (nr_retries && mem->res.pages > 0) {
>                int progress;
>                progress = try_to_free_mem_cgroup_pages(mem,
>                                                  GFP_HIGHUSER_MOVABLE);
> @@ -1325,7 +1420,7 @@ try_to_free:
>
>        }
>        /* try move_account...there may be some *locked* pages. */
> -       if (mem->res.usage)
> +       if (mem->res.pages)
>                goto move_account;
>        ret = 0;
>        goto out;
> @@ -1333,13 +1428,43 @@ try_to_free:
>
>  static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
>  {
> -       return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
> -                                   cft->private);
> +       unsigned long long ret;
> +       struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
> +
> +       switch (cft->private) {
> +       case MEMCG_FILE_PAGE_LIMIT:
> +               ret = (unsigned long long)mem->res.pages_limit << PAGE_SHIFT;
> +               break;
> +       case MEMCG_FILE_PAGE_USAGE:
> +               ret = (unsigned long long)mem->res.pages << PAGE_SHIFT;
> +               break;
> +       case MEMCG_FILE_PAGE_MAX_USAGE:
> +               ret = (unsigned long long)mem->res.max_pages << PAGE_SHIFT;
> +               break;
> +       case MEMCG_FILE_FAILCNT:
> +               ret = (unsigned long long)mem->res.failcnt;
> +               break;
> +       default:
> +               BUG();
> +       }
> +       return ret;
>  }
>  /*
>  * The user of this function is...
>  * RES_LIMIT.
>  */
> +static int call_memparse(const char *buf, unsigned long long *val)
> +{
> +       char *end;
> +
> +       *val = memparse((char *)buf, &end);
> +       if (*end != '\0')
> +               return -EINVAL;
> +       *val = PAGE_ALIGN(*val);
> +       return 0;
> +}
> +
> +
>  static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
>                            const char *buffer)
>  {
> @@ -1348,9 +1473,9 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
>        int ret;
>
>        switch (cft->private) {
> -       case RES_LIMIT:
> +       case MEMCG_FILE_PAGE_LIMIT:
>                /* This function does all necessary parse...reuse it */
> -               ret = res_counter_memparse_write_strategy(buffer, &val);
> +               ret = call_memparse(buffer, &val);
>                if (!ret)
>                        ret = mem_cgroup_resize_limit(memcg, val);
>                break;
> @@ -1367,12 +1492,12 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
>
>        mem = mem_cgroup_from_cont(cont);
>        switch (event) {
> -       case RES_MAX_USAGE:
> -               res_counter_reset_max(&mem->res);
> -               break;
> -       case RES_FAILCNT:
> -               res_counter_reset_failcnt(&mem->res);
> +       case MEMCG_FILE_PAGE_MAX_USAGE:
> +       case MEMCG_FILE_FAILCNT:
> +               mem_counter_reset(mem, event);
>                break;
> +       default:
> +               BUG();
>        }
>        return 0;
>  }
> @@ -1436,24 +1561,24 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
>  static struct cftype mem_cgroup_files[] = {
>        {
>                .name = "usage_in_bytes",
> -               .private = RES_USAGE,
> +               .private = MEMCG_FILE_PAGE_USAGE,
>                .read_u64 = mem_cgroup_read,
>        },
>        {
>                .name = "max_usage_in_bytes",
> -               .private = RES_MAX_USAGE,
> +               .private = MEMCG_FILE_PAGE_MAX_USAGE,
>                .trigger = mem_cgroup_reset,
>                .read_u64 = mem_cgroup_read,
>        },
>        {
>                .name = "limit_in_bytes",
> -               .private = RES_LIMIT,
> +               .private = MEMCG_FILE_PAGE_LIMIT,
>                .write_string = mem_cgroup_write,
>                .read_u64 = mem_cgroup_read,
>        },
>        {
>                .name = "failcnt",
> -               .private = RES_FAILCNT,
> +               .private = MEMCG_FILE_FAILCNT,
>                .trigger = mem_cgroup_reset,
>                .read_u64 = mem_cgroup_read,
>        },
> @@ -1578,7 +1703,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>                        return ERR_PTR(-ENOMEM);
>        }
>
> -       res_counter_init(&mem->res);
> +       mem_counter_init(mem);
>
>        for_each_node_state(node, N_POSSIBLE)
>                if (alloc_mem_cgroup_per_zone_info(mem, node))
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-20 19:53   ` Paul Menage
@ 2008-10-21  1:14     ` KAMEZAWA Hiroyuki
  2008-10-21  1:29       ` Paul Menage
  2008-10-21  5:30       ` Balbir Singh
  0 siblings, 2 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  1:14 UTC (permalink / raw)
  To: Paul Menage; +Cc: Daisuke Nishimura, linux-mm, balbir

On Mon, 20 Oct 2008 12:53:58 -0700
"Paul Menage" <menage@google.com> wrote:

> Can't we do this in a more generic way, rather than duplicating a lot
> of functionality from res_counter?
> 
> You're trying to track:
> 
> - mem usage
> - mem limit
> - swap usage
> - swap+mem usage
> - swap+mem limit
> 
> And ensuring that:
> 
> - mem usage < mem limit
> - swap+mem usage < swap+mem limit
> 
> Could we somehow represent this as a pair of resource counters, one
> for mem and one for swap+mem that are linked together?
> 

1. It's harmful to increase the size of the *generic* res_counter. So, modifying
   res_counter only for us is not an option.
2. The operation should be done under a lock. We have to do
   -page + swap atomically, at least.
3. We want to pack all members into one cache line; multiple res_counters
   are no good.
4. I hate res_counter ;)

> Maybe have an "aggregate" pointer in a res_counter that points to
> another res_counter that sums some number of counters; both the mem
> and the swap res_counter objects for a cgroup would point to the
> mem+swap res_counter for their aggregate. Adjusting the usage of a
> counter would also adjust its aggregate (or fail if adjusting the
> aggregate failed).
> 
It's complicated. 

Thanks,
-Kame


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  1:14     ` KAMEZAWA Hiroyuki
@ 2008-10-21  1:29       ` Paul Menage
  2008-10-21  1:49         ` KAMEZAWA Hiroyuki
  2008-10-21  5:30       ` Balbir Singh
  1 sibling, 1 reply; 60+ messages in thread
From: Paul Menage @ 2008-10-21  1:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Daisuke Nishimura, linux-mm, balbir

On Mon, Oct 20, 2008 at 6:14 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> 1. It's harmful to increase size of *generic* res_counter. So, modifing
>   res_counter only for us is not a choice.

Adding an extra pointer to a per-cgroup structure isn't particularly harmful.

> 2. Operation should be done under a lock. We have to do
>   -page + swap in atomic, at least.

How bad would things really be if you did something like the code below?

if (charge_swap()) {
  uncharge_mem();
} else {
  return -ENOMEM;
}

It's true that this introduces a tiny race whereby a single swap-in
page allocation that might have succeeded could fail, but if you're
that close to the limit your cgroup is heading for an OOM anyway.

> 3. We want to pack all member into a cache-line, multiple res_counter
>   is no good.

As I said previously, if we do a prefetch on the aggregated
res_counter before we touch any fields in the basic counter, then in
theory we should never have to wait on a cache miss on the aggregated
counter - either we have no misses (if both were in cache) or we fetch
both lines concurrently (if neither were in cache). Do you think that
reasoning is invalid?

>
>> Maybe have an "aggregate" pointer in a res_counter that points to
>> another res_counter that sums some number of counters; both the mem
>> and the swap res_counter objects for a cgroup would point to the
>> mem+swap res_counter for their aggregate. Adjusting the usage of a
>> counter would also adjust its aggregate (or fail if adjusting the
>> aggregate failed).
>>
> It's complicated.

Agreed, it's a bit more complicated than defining a new structure and
code that's very reminiscent of res_counter. But it does solve the
problem of aggregating across multiple resource types and multiple
children in a generic way.

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  1:29       ` Paul Menage
@ 2008-10-21  1:49         ` KAMEZAWA Hiroyuki
  2008-10-21  2:15           ` Paul Menage
  2008-10-21  2:20           ` Paul Menage
  0 siblings, 2 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  1:49 UTC (permalink / raw)
  To: Paul Menage; +Cc: Daisuke Nishimura, linux-mm, balbir

On Mon, 20 Oct 2008 18:29:28 -0700
"Paul Menage" <menage@google.com> wrote:
> How bad would things really be if you did something like the code below?
> 
> if (charge_swap()) {
>   uncharge_mem();
> } else {
>   return -ENOMEM;
> }
Nitpick: charge_swap() always has to succeed (because it is a mem+swap counter).
So,

  BUG_ON(charge_swap());
or
  charge_swap_and_uncharge_mem();

is necessary. Swap itself has no limit.

> > 3. We want to pack all member into a cache-line, multiple res_counter
> >   is no good.
> 
> As I said previously, if we do a prefetch on the aggregated
> res_counter before we touch any fields in the basic counter, then in
> theory we should never have to wait on a cache miss on the aggregated
> counter - either we have no misses (if both were in cache) or we fetch
> both lines concurrently (if neither were in cache). Do you think that
> reasoning is invalid?

res_counter's operation is very short.
  take a lock => add and compare. => unlock.

So, I suspect there is not enough runway to do a prefetch.
(Considering memcg, we can place 2 counters on one cache line by putting the 2
 counters on an aligned line.)


> >
> >> Maybe have an "aggregate" pointer in a res_counter that points to
> >> another res_counter that sums some number of counters; both the mem
> >> and the swap res_counter objects for a cgroup would point to the
> >> mem+swap res_counter for their aggregate. Adjusting the usage of a
> >> counter would also adjust its aggregate (or fail if adjusting the
> >> aggregate failed).
> >>
> > It's complicated.
> 
> Agreed, it's a bit more complicated than defining a new structure and
> code that's very reminiscent of res_counter. But it does solve the
> problem of aggregating across multiple resource types and multiple
> children in a generic way.
> 
If you give me NACK, maybe I have to try that..
(But I believe aggregated child-parent counter will be verrrry slow.)
 
BTW, can we have *unsigned long* version of res_counter ?
memcg doesn't need *unsigned long long*.

And as a separate discussion, I'd like to optimize res_counter with per-cpu data.
This will be impossible with counters tied together by a pointer.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  1:49         ` KAMEZAWA Hiroyuki
@ 2008-10-21  2:15           ` Paul Menage
  2008-10-21  2:50             ` KAMEZAWA Hiroyuki
  2008-10-21  2:20           ` Paul Menage
  1 sibling, 1 reply; 60+ messages in thread
From: Paul Menage @ 2008-10-21  2:15 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Daisuke Nishimura, linux-mm, balbir

On Mon, Oct 20, 2008 at 6:49 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> res_counter's operation is very short.
>  take a lock => add and compare. => unlock.
>
> So, I wonder there is not enough runway to do prefetch.

Sorry, let me be clearer. I'm assuming that since a write operation on
the base counter will generally be accompanied by a write operation on
the aggregate counter, that one of the following is true:

- neither cache line is in a M or E state in our cache. So the
prefetchw on the aggregate counter proceeds in parallel to the stall
on fetching the base counter, and there's no additional delay to
access the aggregate counter.

- both cache lines are in a M or E state in our cache, so there are no
misses on either counter.

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  1:49         ` KAMEZAWA Hiroyuki
  2008-10-21  2:15           ` Paul Menage
@ 2008-10-21  2:20           ` Paul Menage
  2008-10-21  3:03             ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 60+ messages in thread
From: Paul Menage @ 2008-10-21  2:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Daisuke Nishimura, linux-mm, balbir

On Mon, Oct 20, 2008 at 6:49 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> If you give me NACK, maybe I have to try that..
> (But I believe aggregated child-parent counter will be verrrry slow.)

If it's really impossible to implement the aggregated version without
a significant performance hit then that might be a reason to have a
separate counter class. But I'd rather have a clean generic solution
if we can manage it.

> BTW, can we have *unsigned long* version of res_counter ?
> memcg doesn't need *unsigned long long*.

Potentially - but how often is a read-only operation on the
performance sensitive path? Don't most fast-path operations that
involve a res_counter have an update on the res_counter when they
succeed? In which case you have to pull the cache line into a Modified
state anyway.

>
> And as another discussion, I'd like optimize res_counter by per_cpu.

What were you thinking of doing for this?

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  2:15           ` Paul Menage
@ 2008-10-21  2:50             ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  2:50 UTC (permalink / raw)
  To: Paul Menage; +Cc: Daisuke Nishimura, linux-mm, balbir

On Mon, 20 Oct 2008 19:15:37 -0700
"Paul Menage" <menage@google.com> wrote:

> On Mon, Oct 20, 2008 at 6:49 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> >
> > res_counter's operation is very short.
> >  take a lock => add and compare. => unlock.
> >
> > So, I wonder there is not enough runway to do prefetch.
> 
> Sorry, let me be clearer. I'm assuming that since a write operation on
> the base counter will generally be accompanied by a write operation on
> the aggregate counter, that one of the following is true:
> 
That's not true.  The Mem+Swap controller has the following ops.

   charge:  page++
            compare page < mem_limit.
            compare page+swap < memsw_limit.
   unmap/cache delete:
            page--
   swapout:
            page--
            swap++
   swap_free:
            swap--

....there is no *aggregate* counter; we just have a limit on the total.

But Ok, to get Ack, I'll have to do something.

2 counter version will be..

   charge: page++   compare page < mem_limit.
           memsw++  compare memsw < memsw_limit
   unmap/cache-delete:
           page--
   swapout:
           no change.
   swap_free:
           memsw--

No need for *aggregate* counter. just call charge twice.


> - neither cache line is in a M or E state in our cache. So the
> prefetchw on the aggregate counter proceeds in parallel to the stall
> on fetching the base counter, and there's no additional delay to
> access the aggregate counter.
> 


   CPU 0                CPU1
  prefetchw()
                      prefetchw()
  spinlock()
                      spinlock()
  win-spinlock
  cache-miss.
                      cache-miss.

Mem+Swap counter will rarely see this. But if you want to use an aggregate counter
for parent/child, you'll see that *prefetchw* is no help.
The parent is busier than the child.

> - both cache lines are in a M or E state in our cache, so there are no
> misses on either counter.
>
But that adds the cost of an unnecessary prefetch.


Maybe I'll just use 2 res_counters. More updates will be done later if necessary.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  2:20           ` Paul Menage
@ 2008-10-21  3:03             ` KAMEZAWA Hiroyuki
  2008-10-21  6:30               ` Paul Menage
  0 siblings, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  3:03 UTC (permalink / raw)
  To: Paul Menage; +Cc: Daisuke Nishimura, linux-mm, balbir

On Mon, 20 Oct 2008 19:20:16 -0700
"Paul Menage" <menage@google.com> wrote:

> On Mon, Oct 20, 2008 at 6:49 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > If you give me NACK, maybe I have to try that..
> > (But I believe aggregated child-parent counter will be verrrry slow.)
> 
> If it's really impossible to implement the aggregated version without
> a significant performance hit then that might be a reason to have a
> separate counter class. But I'd rather have a clean generic solution
> if we can manage it.
> 
I think we can't do it without a performance hit. Considering parent<->child counters,
the parent is busier than the child if usage is propagated from child to parent. So,
prefetch will be just a small help.

> > BTW, can we have *unsigned long* version of res_counter ?
> > memcg doesn't need *unsigned long long*.
> 
> Potentially - but how often is a read-only operation on the
> performance sensitive path? Don't most fast-path operations that
> involve a res_counter have an update on the res_counter when they
> succeed? In which case you have to pull the cache line into a Modified
> state anyway.
>
I don't like *unsigned long long* just because we have to do the following
=
   res->usage < *some number*
=
or
=
   val = res->usage.
=
always under lock because usage is unsigned long long.

> >
> > And as another discussion, I'd like optimize res_counter by per_cpu.
> 
> What were you thinking of doing for this?
> 
just an idea. I believe a process has locality to a res_counter.

==
  struct res_counter_cache {
	unsigned long cache;
	struct res_counter	*res;
  }
  DEFINE_PER_CPU(res_counter_cache, pcp_memcg_res);

  res_counter_charge(struct res_counter *res, struct res_counter_cache *cache, num)
  {
	if (cache->res == res && cache->res >= num) {
		cache->cache -= num;
		return 0;
	} else if (cache->res != res) {
		/* forget cache */
		spin_lock(cache->res);
		cache->res->usage -= cache->cache;
		cache->cache = NULL;
		spin_unlock(cache->res);
  	}
        ....
	cache->cache = res;
  }
  ....
==

But I don't have even a fragment of real code yet.

Thanks,
-Kame













--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  1:14     ` KAMEZAWA Hiroyuki
  2008-10-21  1:29       ` Paul Menage
@ 2008-10-21  5:30       ` Balbir Singh
  2008-10-21  5:39         ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 60+ messages in thread
From: Balbir Singh @ 2008-10-21  5:30 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Paul Menage, Daisuke Nishimura, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Mon, 20 Oct 2008 12:53:58 -0700
> "Paul Menage" <menage@google.com> wrote:
> 
>> Can't we do this in a more generic way, rather than duplicating a lot
>> of functionality from res_counter?
>>
>> You're trying to track:
>>
>> - mem usage
>> - mem limit
>> - swap usage
>> - swap+mem usage
>> - swap+mem limit
>>
>> And ensuring that:
>>
>> - mem usage < mem limit
>> - swap+mem usage < swap+mem limit
>>
>> Could we somehow represent this as a pair of resource counters, one
>> for mem and one for swap+mem that are linked together?
>>
> 
> 1. It's harmful to increase size of *generic* res_counter. So, modifing
>    res_counter only for us is not a choice.
> 2. Operation should be done under a lock. We have to do 
>    -page + swap in atomic, at least.
> 3. We want to pack all member into a cache-line, multiple res_counter
>    is no good.
> 4. I hate res_counter ;)
> 

What do you hate about it? I'll review the patchset in detail (I am currently
unwell, but I'll definitely take a look later).

>> Maybe have an "aggregate" pointer in a res_counter that points to
>> another res_counter that sums some number of counters; both the mem
>> and the swap res_counter objects for a cgroup would point to the
>> mem+swap res_counter for their aggregate. Adjusting the usage of a
>> counter would also adjust its aggregate (or fail if adjusting the
>> aggregate failed).
>>
> It's complicated. 

It seems complicated and for hierarchies we'll do a simple charge up approach
(we've agreed upon the fact that hierarchies are expensive and deep hierarchies
most definitely are)

-- 
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  5:30       ` Balbir Singh
@ 2008-10-21  5:39         ` KAMEZAWA Hiroyuki
  2008-10-21  6:20           ` [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000 Li Zefan
  0 siblings, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  5:39 UTC (permalink / raw)
  To: balbir; +Cc: Paul Menage, Daisuke Nishimura, linux-mm

On Tue, 21 Oct 2008 11:00:41 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > 1. It's harmful to increase size of *generic* res_counter. So, modifing
> >    res_counter only for us is not a choice.
> > 2. Operation should be done under a lock. We have to do 
> >    -page + swap in atomic, at least.
> > 3. We want to pack all member into a cache-line, multiple res_counter
> >    is no good.
> > 4. I hate res_counter ;)
> > 
> 
> What do you hate about it? I'll review the patchset in detail (I am currently
> unwell, but I'll definitely take a look later).
> 
Just because I feel this kind of *generic* counter can be an obstacle to
doing aggressive, special optimization for some resources. But I don't want to
argue this now. 

I'll rewrite this and avoid adding a new mem_counter (and use 2 res_counters instead).

The core logic will not change very much, but....
Anyway, I'll go the way which doesn't bother anyone.

BTW, "allocate all page_cgroup at boot" patch goes to Linus' git. Wow.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  5:39         ` KAMEZAWA Hiroyuki
@ 2008-10-21  6:20           ` Li Zefan
  2008-10-21  6:25             ` KAMEZAWA Hiroyuki
  2008-10-21  6:54             ` KAMEZAWA Hiroyuki
  0 siblings, 2 replies; 60+ messages in thread
From: Li Zefan @ 2008-10-21  6:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm

> BTW, "allocate all page_cgroup at boot" patch goes to Linus' git. Wow.
> 

But it seems this patch causes a kernel panic at system boot ... (or maybe one of the other
memcg patches?)

I wrote down the panic manually:

BUG: unable to handle kernel NULL pointer dereference at 00000000
IP: page_cgroup_zoneinfo + 0xa

Call Trace:
? mem_cgroup_charge_common + 0x17d
? mem_cgroup_charge
? add_to_page_cache_locked
? add_to_page_cache_lru
? find_or_create_page
? __getblk
? ext3_get_inode_loc
? ext3_iget
? ext3_lookup

Tell me if you need extra information.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  6:20           ` [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000 Li Zefan
@ 2008-10-21  6:25             ` KAMEZAWA Hiroyuki
  2008-10-21  6:28               ` Li Zefan
  2008-10-21  6:54             ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  6:25 UTC (permalink / raw)
  To: Li Zefan; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm

On Tue, 21 Oct 2008 14:20:27 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> > BTW, "allocate all page_cgroup at boot" patch goes to Linus' git. Wow.
> > 
> 
> But seems this patch causes kernel panic at system boot ... (or maybe one of other
> memcg patches?)
> 
> I wrote down the panic manually:
> 
> BUG: unable to handle kernel NULL pointer dereference at 00000000
> IP: page_cgroup_zoneinfo + 0xa
> 
> Call Trace:
> ? mem_cgroup_charge_common + 0x17d
> ? mem_cgroup_charge
> ? add_to_page_cache_locked
> ? add_to_page_cache_lru
> ? find_or_create_page
> ? __getblk
> ? ext3_get_inode_loc
> ? ext3_iget
> ? ext3_lookup
> 
> Tell me if you need extra information.
> 
This shows how few testers -mm has ... Is this on x86?
Could you show me your config?
And what happens if you boot with cgroup_disable=memory?

Thanks,
-Kame


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  6:25             ` KAMEZAWA Hiroyuki
@ 2008-10-21  6:28               ` Li Zefan
  2008-10-21  6:38                 ` Daisuke Nishimura
  0 siblings, 1 reply; 60+ messages in thread
From: Li Zefan @ 2008-10-21  6:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm

[-- Attachment #1: Type: text/plain, Size: 910 bytes --]

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 14:20:27 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
>>> BTW, "allocate all page_cgroup at boot" patch goes to Linus' git. Wow.
>>>
>> But seems this patch causes kernel panic at system boot ... (or maybe one of other
>> memcg patches?)
>>
>> I wrote down the panic manually:
>>
>> BUG: unable to handle kernel NULL pointer dereference at 00000000
>> IP: page_cgroup_zoneinfo + 0xa
>>
>> Call Trace:
>> ? mem_cgroup_charge_common + 0x17d
>> ? mem_cgroup_charge
>> ? add_to_page_cache_locked
>> ? add_to_page_cache_lru
>> ? find_or_create_page
>> ? __getblk
>> ? ext3_get_inode_loc
>> ? ext3_iget
>> ? ext3_lookup
>>
>> Tell me if you need extra information.
>>
> This shows how small testers in -mm ...this is on x86 ?

Yes, x86_32

> Could you show me your config ? 

attached

> and what happens if cgroup_disable=memory ?
> 

then booted up successfully


[-- Attachment #2: kernel_config --]
[-- Type: text/plain, Size: 66260 bytes --]

#
# Automatically generated make config: don't edit
# Linux kernel version: 2.6.27
# Tue Oct 21 08:05:53 2008
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
# CONFIG_X86_64 is not set
CONFIG_X86=y
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CMOS_UPDATE=y
CONFIG_CLOCKSOURCE_WATCHDOG=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_HAVE_LATENCYTOP_SUPPORT=y
CONFIG_FAST_CMPXCHG_LOCAL=y
CONFIG_MMU=y
CONFIG_ZONE_DMA=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_IOMAP=y
CONFIG_GENERIC_BUG=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
# CONFIG_GENERIC_TIME_VSYSCALL is not set
CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
# CONFIG_ZONE_DMA32 is not set
CONFIG_ARCH_POPULATES_NODE_MAP=y
# CONFIG_AUDIT_ARCH is not set
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
CONFIG_GENERIC_HARDIRQS=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_X86_SMP=y
CONFIG_X86_32_SMP=y
CONFIG_X86_HT=y
CONFIG_X86_BIOS_REBOOT=y
CONFIG_X86_TRAMPOLINE=y
CONFIG_KTIME_SCALAR=y
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"

#
# General setup
#
CONFIG_EXPERIMENTAL=y
CONFIG_LOCK_KERNEL=y
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_BSD_PROCESS_ACCT=y
# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
CONFIG_TASK_XACCT=y
CONFIG_TASK_IO_ACCOUNTING=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_TREE=y
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=17
CONFIG_CGROUPS=y
CONFIG_CGROUP_DEBUG=y
CONFIG_CGROUP_NS=y
CONFIG_CGROUP_FREEZER=y
CONFIG_CGROUP_DEVICE=y
CONFIG_CPUSETS=y
CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
CONFIG_GROUP_SCHED=y
CONFIG_FAIR_GROUP_SCHED=y
CONFIG_RT_GROUP_SCHED=y
# CONFIG_USER_SCHED is not set
CONFIG_CGROUP_SCHED=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_RESOURCE_COUNTERS=y
CONFIG_MM_OWNER=y
CONFIG_CGROUP_MEM_RES_CTLR=y
CONFIG_SYSFS_DEPRECATED=y
CONFIG_SYSFS_DEPRECATED_V2=y
CONFIG_PROC_PID_CPUSET=y
CONFIG_RELAY=y
CONFIG_NAMESPACES=y
# CONFIG_UTS_NS is not set
# CONFIG_IPC_NS is not set
CONFIG_USER_NS=y
# CONFIG_PID_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
# CONFIG_EMBEDDED is not set
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_EXTRA_PASS=y
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_PCSPKR_PLATFORM=y
CONFIG_COMPAT_BRK=y
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_ANON_INODES=y
CONFIG_EPOLL=y
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
CONFIG_EVENTFD=y
CONFIG_SHMEM=y
CONFIG_AIO=y
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_SLUB_DEBUG=y
# CONFIG_SLAB is not set
CONFIG_SLUB=y
# CONFIG_SLOB is not set
CONFIG_PROFILING=y
# CONFIG_MARKERS is not set
CONFIG_OPROFILE=m
# CONFIG_OPROFILE_IBS is not set
CONFIG_HAVE_OPROFILE=y
CONFIG_KPROBES=y
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
CONFIG_KRETPROBES=y
CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_HAVE_GENERIC_DMA_COHERENT=y
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
# CONFIG_MODULE_FORCE_LOAD is not set
CONFIG_MODULE_UNLOAD=y
# CONFIG_MODULE_FORCE_UNLOAD is not set
# CONFIG_MODVERSIONS is not set
# CONFIG_MODULE_SRCVERSION_ALL is not set
CONFIG_KMOD=y
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
CONFIG_LBD=y
CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_LSF=y
CONFIG_BLK_DEV_BSG=y
# CONFIG_BLK_DEV_INTEGRITY is not set

#
# IO Schedulers
#
CONFIG_IOSCHED_NOOP=y
CONFIG_IOSCHED_AS=m
CONFIG_IOSCHED_DEADLINE=m
CONFIG_IOSCHED_CFQ=y
# CONFIG_DEFAULT_AS is not set
# CONFIG_DEFAULT_DEADLINE is not set
CONFIG_DEFAULT_CFQ=y
# CONFIG_DEFAULT_NOOP is not set
CONFIG_DEFAULT_IOSCHED="cfq"
CONFIG_CLASSIC_RCU=y
CONFIG_FREEZER=y

#
# Processor type and features
#
CONFIG_TICK_ONESHOT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
# CONFIG_X86_PC is not set
# CONFIG_X86_ELAN is not set
# CONFIG_X86_VOYAGER is not set
CONFIG_X86_GENERICARCH=y
# CONFIG_X86_NUMAQ is not set
# CONFIG_X86_SUMMIT is not set
# CONFIG_X86_ES7000 is not set
# CONFIG_X86_BIGSMP is not set
# CONFIG_X86_VSMP is not set
# CONFIG_X86_RDC321X is not set
CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
# CONFIG_PARAVIRT_GUEST is not set
# CONFIG_MEMTEST is not set
CONFIG_X86_CYCLONE_TIMER=y
# CONFIG_M386 is not set
# CONFIG_M486 is not set
# CONFIG_M586 is not set
# CONFIG_M586TSC is not set
# CONFIG_M586MMX is not set
CONFIG_M686=y
# CONFIG_MPENTIUMII is not set
# CONFIG_MPENTIUMIII is not set
# CONFIG_MPENTIUMM is not set
# CONFIG_MPENTIUM4 is not set
# CONFIG_MK6 is not set
# CONFIG_MK7 is not set
# CONFIG_MK8 is not set
# CONFIG_MCRUSOE is not set
# CONFIG_MEFFICEON is not set
# CONFIG_MWINCHIPC6 is not set
# CONFIG_MWINCHIP3D is not set
# CONFIG_MGEODEGX1 is not set
# CONFIG_MGEODE_LX is not set
# CONFIG_MCYRIXIII is not set
# CONFIG_MVIAC3_2 is not set
# CONFIG_MVIAC7 is not set
# CONFIG_MPSC is not set
# CONFIG_MCORE2 is not set
# CONFIG_GENERIC_CPU is not set
CONFIG_X86_GENERIC=y
CONFIG_X86_CPU=y
CONFIG_X86_CMPXCHG=y
CONFIG_X86_L1_CACHE_SHIFT=7
CONFIG_X86_XADD=y
CONFIG_X86_PPRO_FENCE=y
CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_INVLPG=y
CONFIG_X86_BSWAP=y
CONFIG_X86_POPAD_OK=y
CONFIG_X86_INTEL_USERCOPY=y
CONFIG_X86_USE_PPRO_CHECKSUM=y
CONFIG_X86_TSC=y
CONFIG_X86_CMOV=y
CONFIG_X86_MINIMUM_CPU_FAMILY=4
CONFIG_X86_DEBUGCTLMSR=y
CONFIG_CPU_SUP_INTEL=y
CONFIG_CPU_SUP_CYRIX_32=y
CONFIG_CPU_SUP_AMD=y
CONFIG_CPU_SUP_CENTAUR_32=y
CONFIG_CPU_SUP_TRANSMETA_32=y
CONFIG_CPU_SUP_UMC_32=y
CONFIG_X86_DS=y
CONFIG_X86_PTRACE_BTS=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
# CONFIG_IOMMU_HELPER is not set
CONFIG_NR_CPUS=32
CONFIG_SCHED_SMT=y
CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
CONFIG_X86_LOCAL_APIC=y
CONFIG_X86_IO_APIC=y
CONFIG_X86_MCE=y
# CONFIG_X86_MCE_NONFATAL is not set
CONFIG_X86_MCE_P4THERMAL=y
CONFIG_VM86=y
# CONFIG_TOSHIBA is not set
# CONFIG_I8K is not set
# CONFIG_X86_REBOOTFIXUPS is not set
# CONFIG_MICROCODE is not set
CONFIG_X86_MSR=m
CONFIG_X86_CPUID=m
# CONFIG_NOHIGHMEM is not set
CONFIG_HIGHMEM4G=y
# CONFIG_HIGHMEM64G is not set
CONFIG_PAGE_OFFSET=0xC0000000
CONFIG_HIGHMEM=y
# CONFIG_ARCH_PHYS_ADDR_T_64BIT is not set
CONFIG_ARCH_FLATMEM_ENABLE=y
CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_ARCH_SELECT_MEMORY_MODEL=y
CONFIG_SELECT_MEMORY_MODEL=y
CONFIG_FLATMEM_MANUAL=y
# CONFIG_DISCONTIGMEM_MANUAL is not set
# CONFIG_SPARSEMEM_MANUAL is not set
CONFIG_FLATMEM=y
CONFIG_FLAT_NODE_MEM_MAP=y
CONFIG_SPARSEMEM_STATIC=y
CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_RESOURCES_64BIT=y
# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_UNEVICTABLE_LRU=y
CONFIG_HIGHPTE=y
# CONFIG_X86_CHECK_BIOS_CORRUPTION is not set
CONFIG_X86_RESERVE_LOW_64K=y
# CONFIG_MATH_EMULATION is not set
CONFIG_MTRR=y
CONFIG_MTRR_SANITIZER=y
CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=0
CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=1
# CONFIG_X86_PAT is not set
CONFIG_EFI=y
# CONFIG_IRQBALANCE is not set
CONFIG_SECCOMP=y
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
CONFIG_HZ_1000=y
CONFIG_HZ=1000
CONFIG_SCHED_HRTICK=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
# CONFIG_KEXEC_JUMP is not set
CONFIG_PHYSICAL_START=0x1000000
CONFIG_RELOCATABLE=y
CONFIG_PHYSICAL_ALIGN=0x400000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
# CONFIG_CMDLINE_BOOL is not set
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y

#
# Power management options
#
CONFIG_PM=y
CONFIG_PM_DEBUG=y
# CONFIG_PM_VERBOSE is not set
CONFIG_CAN_PM_TRACE=y
# CONFIG_PM_TRACE_RTC is not set
CONFIG_PM_SLEEP_SMP=y
CONFIG_PM_SLEEP=y
CONFIG_SUSPEND=y
CONFIG_SUSPEND_FREEZER=y
CONFIG_HIBERNATION=y
CONFIG_PM_STD_PARTITION=""
CONFIG_ACPI=y
CONFIG_ACPI_SLEEP=y
CONFIG_ACPI_PROCFS=y
CONFIG_ACPI_PROCFS_POWER=y
CONFIG_ACPI_SYSFS_POWER=y
# CONFIG_ACPI_PROC_EVENT is not set
CONFIG_ACPI_AC=m
# CONFIG_ACPI_BATTERY is not set
CONFIG_ACPI_BUTTON=m
CONFIG_ACPI_VIDEO=m
CONFIG_ACPI_FAN=y
CONFIG_ACPI_DOCK=y
# CONFIG_ACPI_BAY is not set
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_ACPI_THERMAL=y
# CONFIG_ACPI_WMI is not set
# CONFIG_ACPI_ASUS is not set
# CONFIG_ACPI_TOSHIBA is not set
# CONFIG_ACPI_CUSTOM_DSDT is not set
CONFIG_ACPI_BLACKLIST_YEAR=1999
# CONFIG_ACPI_DEBUG is not set
CONFIG_ACPI_EC=y
# CONFIG_ACPI_PCI_SLOT is not set
CONFIG_ACPI_POWER=y
CONFIG_ACPI_SYSTEM=y
CONFIG_X86_PM_TIMER=y
CONFIG_ACPI_CONTAINER=y
# CONFIG_ACPI_SBS is not set
CONFIG_X86_APM_BOOT=y
CONFIG_APM=y
# CONFIG_APM_IGNORE_USER_SUSPEND is not set
# CONFIG_APM_DO_ENABLE is not set
CONFIG_APM_CPU_IDLE=y
# CONFIG_APM_DISPLAY_BLANK is not set
# CONFIG_APM_ALLOW_INTS is not set
# CONFIG_APM_REAL_MODE_POWER_OFF is not set

#
# CPU Frequency scaling
#
CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_TABLE=y
CONFIG_CPU_FREQ_DEBUG=y
CONFIG_CPU_FREQ_STAT=m
CONFIG_CPU_FREQ_STAT_DETAILS=y
# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
CONFIG_CPU_FREQ_GOV_POWERSAVE=m
CONFIG_CPU_FREQ_GOV_USERSPACE=y
CONFIG_CPU_FREQ_GOV_ONDEMAND=m
CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m

#
# CPUFreq processor drivers
#
# CONFIG_X86_ACPI_CPUFREQ is not set
# CONFIG_X86_POWERNOW_K6 is not set
CONFIG_X86_POWERNOW_K7=y
CONFIG_X86_POWERNOW_K7_ACPI=y
CONFIG_X86_POWERNOW_K8=y
CONFIG_X86_POWERNOW_K8_ACPI=y
# CONFIG_X86_GX_SUSPMOD is not set
# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
CONFIG_X86_SPEEDSTEP_ICH=y
CONFIG_X86_SPEEDSTEP_SMI=y
CONFIG_X86_P4_CLOCKMOD=m
# CONFIG_X86_CPUFREQ_NFORCE2 is not set
CONFIG_X86_LONGRUN=y
# CONFIG_X86_LONGHAUL is not set
# CONFIG_X86_E_POWERSAVER is not set

#
# shared options
#
# CONFIG_X86_ACPI_CPUFREQ_PROC_INTF is not set
CONFIG_X86_SPEEDSTEP_LIB=y
# CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK is not set
CONFIG_CPU_IDLE=y
CONFIG_CPU_IDLE_GOV_LADDER=y
CONFIG_CPU_IDLE_GOV_MENU=y

#
# Bus options (PCI etc.)
#
CONFIG_PCI=y
# CONFIG_PCI_GOBIOS is not set
# CONFIG_PCI_GOMMCONFIG is not set
# CONFIG_PCI_GODIRECT is not set
# CONFIG_PCI_GOOLPC is not set
CONFIG_PCI_GOANY=y
CONFIG_PCI_BIOS=y
CONFIG_PCI_DIRECT=y
CONFIG_PCI_MMCONFIG=y
CONFIG_PCI_DOMAINS=y
CONFIG_PCIEPORTBUS=y
CONFIG_HOTPLUG_PCI_PCIE=m
CONFIG_PCIEAER=y
# CONFIG_PCIEASPM is not set
CONFIG_ARCH_SUPPORTS_MSI=y
CONFIG_PCI_MSI=y
CONFIG_PCI_LEGACY=y
# CONFIG_PCI_DEBUG is not set
CONFIG_HT_IRQ=y
CONFIG_ISA_DMA_API=y
CONFIG_ISA=y
# CONFIG_EISA is not set
# CONFIG_MCA is not set
# CONFIG_SCx200 is not set
# CONFIG_OLPC is not set
CONFIG_PCCARD=y
# CONFIG_PCMCIA_DEBUG is not set
CONFIG_PCMCIA=y
CONFIG_PCMCIA_LOAD_CIS=y
CONFIG_PCMCIA_IOCTL=y
CONFIG_CARDBUS=y

#
# PC-card bridges
#
CONFIG_YENTA=y
CONFIG_YENTA_O2=y
CONFIG_YENTA_RICOH=y
CONFIG_YENTA_TI=y
CONFIG_YENTA_ENE_TUNE=y
CONFIG_YENTA_TOSHIBA=y
# CONFIG_PD6729 is not set
# CONFIG_I82092 is not set
# CONFIG_I82365 is not set
# CONFIG_TCIC is not set
CONFIG_PCMCIA_PROBE=y
CONFIG_PCCARD_NONSTATIC=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_FAKE=m
# CONFIG_HOTPLUG_PCI_COMPAQ is not set
# CONFIG_HOTPLUG_PCI_IBM is not set
CONFIG_HOTPLUG_PCI_ACPI=m
CONFIG_HOTPLUG_PCI_ACPI_IBM=m
# CONFIG_HOTPLUG_PCI_CPCI is not set
# CONFIG_HOTPLUG_PCI_SHPC is not set

#
# Executable file formats / Emulations
#
CONFIG_BINFMT_ELF=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_HAVE_AOUT=y
# CONFIG_BINFMT_AOUT is not set
CONFIG_BINFMT_MISC=y
CONFIG_NET=y

#
# Networking options
#
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
# CONFIG_NET_KEY is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_ASK_IP_FIB_HASH=y
# CONFIG_IP_FIB_TRIE is not set
CONFIG_IP_FIB_HASH=y
CONFIG_IP_MULTIPLE_TABLES=y
CONFIG_IP_ROUTE_MULTIPATH=y
CONFIG_IP_ROUTE_VERBOSE=y
# CONFIG_IP_PNP is not set
CONFIG_NET_IPIP=m
# CONFIG_NET_IPGRE is not set
CONFIG_IP_MROUTE=y
CONFIG_IP_PIMSM_V1=y
CONFIG_IP_PIMSM_V2=y
# CONFIG_ARPD is not set
CONFIG_SYN_COOKIES=y
# CONFIG_INET_AH is not set
# CONFIG_INET_ESP is not set
# CONFIG_INET_IPCOMP is not set
# CONFIG_INET_XFRM_TUNNEL is not set
CONFIG_INET_TUNNEL=m
# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
# CONFIG_INET_XFRM_MODE_TUNNEL is not set
# CONFIG_INET_XFRM_MODE_BEET is not set
CONFIG_INET_LRO=m
CONFIG_INET_DIAG=m
CONFIG_INET_TCP_DIAG=m
CONFIG_TCP_CONG_ADVANCED=y
CONFIG_TCP_CONG_BIC=m
CONFIG_TCP_CONG_CUBIC=y
# CONFIG_TCP_CONG_WESTWOOD is not set
# CONFIG_TCP_CONG_HTCP is not set
CONFIG_TCP_CONG_HSTCP=m
CONFIG_TCP_CONG_HYBLA=m
# CONFIG_TCP_CONG_VEGAS is not set
CONFIG_TCP_CONG_SCALABLE=m
CONFIG_TCP_CONG_LP=m
# CONFIG_TCP_CONG_VENO is not set
# CONFIG_TCP_CONG_YEAH is not set
CONFIG_TCP_CONG_ILLINOIS=m
# CONFIG_DEFAULT_BIC is not set
CONFIG_DEFAULT_CUBIC=y
# CONFIG_DEFAULT_HTCP is not set
# CONFIG_DEFAULT_VEGAS is not set
# CONFIG_DEFAULT_WESTWOOD is not set
# CONFIG_DEFAULT_RENO is not set
CONFIG_DEFAULT_TCP_CONG="cubic"
# CONFIG_TCP_MD5SIG is not set
# CONFIG_IPV6 is not set
CONFIG_NETWORK_SECMARK=y
# CONFIG_NETFILTER is not set
# CONFIG_IP_DCCP is not set
# CONFIG_IP_SCTP is not set
# CONFIG_TIPC is not set
CONFIG_ATM=m
CONFIG_ATM_CLIP=m
# CONFIG_ATM_CLIP_NO_ICMP is not set
CONFIG_ATM_LANE=m
# CONFIG_ATM_MPOA is not set
CONFIG_ATM_BR2684=m
# CONFIG_ATM_BR2684_IPFILTER is not set
CONFIG_STP=m
CONFIG_BRIDGE=m
# CONFIG_NET_DSA is not set
CONFIG_VLAN_8021Q=m
# CONFIG_VLAN_8021Q_GVRP is not set
# CONFIG_DECNET is not set
CONFIG_LLC=m
# CONFIG_LLC2 is not set
CONFIG_IPX=m
# CONFIG_IPX_INTERN is not set
# CONFIG_ATALK is not set
# CONFIG_X25 is not set
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
CONFIG_WAN_ROUTER=m
CONFIG_NET_SCHED=y

#
# Queueing/Scheduling
#
CONFIG_NET_SCH_CBQ=m
# CONFIG_NET_SCH_HTB is not set
# CONFIG_NET_SCH_HFSC is not set
CONFIG_NET_SCH_ATM=m
CONFIG_NET_SCH_PRIO=m
# CONFIG_NET_SCH_MULTIQ is not set
CONFIG_NET_SCH_RED=m
# CONFIG_NET_SCH_SFQ is not set
# CONFIG_NET_SCH_TEQL is not set
# CONFIG_NET_SCH_TBF is not set
# CONFIG_NET_SCH_GRED is not set
# CONFIG_NET_SCH_DSMARK is not set
CONFIG_NET_SCH_NETEM=m
CONFIG_NET_SCH_INGRESS=m

#
# Classification
#
CONFIG_NET_CLS=y
CONFIG_NET_CLS_BASIC=m
CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_ROUTE=y
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
CONFIG_CLS_U32_PERF=y
CONFIG_CLS_U32_MARK=y
CONFIG_NET_CLS_RSVP=m
# CONFIG_NET_CLS_RSVP6 is not set
# CONFIG_NET_CLS_FLOW is not set
CONFIG_NET_EMATCH=y
CONFIG_NET_EMATCH_STACK=32
CONFIG_NET_EMATCH_CMP=m
CONFIG_NET_EMATCH_NBYTE=m
CONFIG_NET_EMATCH_U32=m
CONFIG_NET_EMATCH_META=m
CONFIG_NET_EMATCH_TEXT=m
CONFIG_NET_CLS_ACT=y
CONFIG_NET_ACT_POLICE=y
CONFIG_NET_ACT_GACT=m
CONFIG_GACT_PROB=y
CONFIG_NET_ACT_MIRRED=m
# CONFIG_NET_ACT_NAT is not set
# CONFIG_NET_ACT_PEDIT is not set
# CONFIG_NET_ACT_SIMP is not set
# CONFIG_NET_ACT_SKBEDIT is not set
CONFIG_NET_CLS_IND=y
CONFIG_NET_SCH_FIFO=y

#
# Network testing
#
# CONFIG_NET_PKTGEN is not set
# CONFIG_NET_TCPPROBE is not set
# CONFIG_HAMRADIO is not set
# CONFIG_CAN is not set
CONFIG_IRDA=m

#
# IrDA protocols
#
CONFIG_IRLAN=m
CONFIG_IRNET=m
CONFIG_IRCOMM=m
# CONFIG_IRDA_ULTRA is not set

#
# IrDA options
#
CONFIG_IRDA_CACHE_LAST_LSAP=y
CONFIG_IRDA_FAST_RR=y
# CONFIG_IRDA_DEBUG is not set

#
# Infrared-port device drivers
#

#
# SIR device drivers
#
CONFIG_IRTTY_SIR=m

#
# Dongle support
#
CONFIG_DONGLE=y
CONFIG_ESI_DONGLE=m
# CONFIG_ACTISYS_DONGLE is not set
# CONFIG_TEKRAM_DONGLE is not set
# CONFIG_TOIM3232_DONGLE is not set
# CONFIG_LITELINK_DONGLE is not set
# CONFIG_MA600_DONGLE is not set
# CONFIG_GIRBIL_DONGLE is not set
# CONFIG_MCP2120_DONGLE is not set
# CONFIG_OLD_BELKIN_DONGLE is not set
# CONFIG_ACT200L_DONGLE is not set
# CONFIG_KINGSUN_DONGLE is not set
# CONFIG_KSDAZZLE_DONGLE is not set
# CONFIG_KS959_DONGLE is not set

#
# FIR device drivers
#
CONFIG_USB_IRDA=m
# CONFIG_SIGMATEL_FIR is not set
# CONFIG_NSC_FIR is not set
# CONFIG_WINBOND_FIR is not set
# CONFIG_TOSHIBA_FIR is not set
# CONFIG_SMC_IRCC_FIR is not set
# CONFIG_ALI_FIR is not set
# CONFIG_VLSI_FIR is not set
CONFIG_VIA_FIR=m
# CONFIG_MCS_FIR is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set
# CONFIG_PHONET is not set
CONFIG_FIB_RULES=y
# CONFIG_WIRELESS is not set
# CONFIG_RFKILL is not set
# CONFIG_NET_9P is not set

#
# Device Drivers
#

#
# Generic Driver Options
#
CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
CONFIG_STANDALONE=y
CONFIG_PREVENT_FIRMWARE_BUILD=y
CONFIG_FW_LOADER=y
CONFIG_FIRMWARE_IN_KERNEL=y
CONFIG_EXTRA_FIRMWARE=""
# CONFIG_DEBUG_DRIVER is not set
CONFIG_DEBUG_DEVRES=y
# CONFIG_SYS_HYPERVISOR is not set
CONFIG_CONNECTOR=y
CONFIG_PROC_EVENTS=y
CONFIG_MTD=m
# CONFIG_MTD_DEBUG is not set
CONFIG_MTD_CONCAT=m
CONFIG_MTD_PARTITIONS=y
# CONFIG_MTD_REDBOOT_PARTS is not set
# CONFIG_MTD_AR7_PARTS is not set

#
# User Modules And Translation Layers
#
CONFIG_MTD_CHAR=m
CONFIG_MTD_BLKDEVS=m
CONFIG_MTD_BLOCK=m
CONFIG_MTD_BLOCK_RO=m
# CONFIG_FTL is not set
# CONFIG_NFTL is not set
# CONFIG_INFTL is not set
# CONFIG_RFD_FTL is not set
# CONFIG_SSFDC is not set
# CONFIG_MTD_OOPS is not set

#
# RAM/ROM/Flash chip drivers
#
CONFIG_MTD_CFI=m
CONFIG_MTD_JEDECPROBE=m
CONFIG_MTD_GEN_PROBE=m
# CONFIG_MTD_CFI_ADV_OPTIONS is not set
CONFIG_MTD_MAP_BANK_WIDTH_1=y
CONFIG_MTD_MAP_BANK_WIDTH_2=y
CONFIG_MTD_MAP_BANK_WIDTH_4=y
# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
CONFIG_MTD_CFI_I1=y
CONFIG_MTD_CFI_I2=y
# CONFIG_MTD_CFI_I4 is not set
# CONFIG_MTD_CFI_I8 is not set
CONFIG_MTD_CFI_INTELEXT=m
# CONFIG_MTD_CFI_AMDSTD is not set
# CONFIG_MTD_CFI_STAA is not set
CONFIG_MTD_CFI_UTIL=m
CONFIG_MTD_RAM=m
CONFIG_MTD_ROM=m
CONFIG_MTD_ABSENT=m

#
# Mapping drivers for chip access
#
CONFIG_MTD_COMPLEX_MAPPINGS=y
# CONFIG_MTD_PHYSMAP is not set
# CONFIG_MTD_SC520CDP is not set
# CONFIG_MTD_NETSC520 is not set
# CONFIG_MTD_TS5500 is not set
# CONFIG_MTD_SBC_GXX is not set
# CONFIG_MTD_AMD76XROM is not set
# CONFIG_MTD_ICHXROM is not set
CONFIG_MTD_ESB2ROM=m
# CONFIG_MTD_CK804XROM is not set
CONFIG_MTD_SCB2_FLASH=m
# CONFIG_MTD_NETtel is not set
# CONFIG_MTD_DILNETPC is not set
# CONFIG_MTD_L440GX is not set
CONFIG_MTD_PCI=m
# CONFIG_MTD_INTEL_VR_NOR is not set
# CONFIG_MTD_PLATRAM is not set

#
# Self-contained MTD device drivers
#
CONFIG_MTD_PMC551=m
# CONFIG_MTD_PMC551_BUGFIX is not set
# CONFIG_MTD_PMC551_DEBUG is not set
# CONFIG_MTD_SLRAM is not set
# CONFIG_MTD_PHRAM is not set
# CONFIG_MTD_MTDRAM is not set
CONFIG_MTD_BLOCK2MTD=m

#
# Disk-On-Chip Device Drivers
#
# CONFIG_MTD_DOC2000 is not set
# CONFIG_MTD_DOC2001 is not set
# CONFIG_MTD_DOC2001PLUS is not set
# CONFIG_MTD_NAND is not set
# CONFIG_MTD_ONENAND is not set

#
# UBI - Unsorted block images
#
# CONFIG_MTD_UBI is not set
CONFIG_PARPORT=m
CONFIG_PARPORT_PC=m
CONFIG_PARPORT_SERIAL=m
# CONFIG_PARPORT_PC_FIFO is not set
# CONFIG_PARPORT_PC_SUPERIO is not set
CONFIG_PARPORT_PC_PCMCIA=m
# CONFIG_PARPORT_GSC is not set
# CONFIG_PARPORT_AX88796 is not set
CONFIG_PARPORT_1284=y
CONFIG_PARPORT_NOT_PC=y
CONFIG_PNP=y
# CONFIG_PNP_DEBUG is not set

#
# Protocols
#
CONFIG_ISAPNP=y
# CONFIG_PNPBIOS is not set
CONFIG_PNPACPI=y
CONFIG_BLK_DEV=y
# CONFIG_BLK_DEV_FD is not set
# CONFIG_BLK_DEV_XD is not set
CONFIG_PARIDE=m

#
# Parallel IDE high-level drivers
#
CONFIG_PARIDE_PD=m
CONFIG_PARIDE_PCD=m
CONFIG_PARIDE_PF=m
# CONFIG_PARIDE_PT is not set
CONFIG_PARIDE_PG=m

#
# Parallel IDE protocol modules
#
# CONFIG_PARIDE_ATEN is not set
# CONFIG_PARIDE_BPCK is not set
# CONFIG_PARIDE_BPCK6 is not set
# CONFIG_PARIDE_COMM is not set
# CONFIG_PARIDE_DSTR is not set
# CONFIG_PARIDE_FIT2 is not set
# CONFIG_PARIDE_FIT3 is not set
# CONFIG_PARIDE_EPAT is not set
# CONFIG_PARIDE_EPIA is not set
# CONFIG_PARIDE_FRIQ is not set
# CONFIG_PARIDE_FRPW is not set
# CONFIG_PARIDE_KBIC is not set
# CONFIG_PARIDE_KTTI is not set
# CONFIG_PARIDE_ON20 is not set
# CONFIG_PARIDE_ON26 is not set
# CONFIG_BLK_CPQ_DA is not set
# CONFIG_BLK_CPQ_CISS_DA is not set
# CONFIG_BLK_DEV_DAC960 is not set
# CONFIG_BLK_DEV_UMEM is not set
# CONFIG_BLK_DEV_COW_COMMON is not set
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_NBD=m
# CONFIG_BLK_DEV_SX8 is not set
# CONFIG_BLK_DEV_UB is not set
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=16384
# CONFIG_BLK_DEV_XIP is not set
# CONFIG_CDROM_PKTCDVD is not set
# CONFIG_ATA_OVER_ETH is not set
# CONFIG_BLK_DEV_HD is not set
CONFIG_MISC_DEVICES=y
# CONFIG_IBM_ASM is not set
# CONFIG_PHANTOM is not set
CONFIG_EEPROM_93CX6=m
# CONFIG_SGI_IOC4 is not set
# CONFIG_TIFM_CORE is not set
# CONFIG_ACER_WMI is not set
# CONFIG_ASUS_LAPTOP is not set
# CONFIG_FUJITSU_LAPTOP is not set
# CONFIG_TC1100_WMI is not set
# CONFIG_MSI_LAPTOP is not set
# CONFIG_COMPAL_LAPTOP is not set
# CONFIG_SONY_LAPTOP is not set
# CONFIG_THINKPAD_ACPI is not set
# CONFIG_INTEL_MENLOW is not set
# CONFIG_ENCLOSURE_SERVICES is not set
# CONFIG_HP_ILO is not set
CONFIG_HAVE_IDE=y
# CONFIG_IDE is not set

#
# SCSI device support
#
# CONFIG_RAID_ATTRS is not set
CONFIG_SCSI=m
CONFIG_SCSI_DMA=y
CONFIG_SCSI_TGT=m
CONFIG_SCSI_NETLINK=y
CONFIG_SCSI_PROC_FS=y

#
# SCSI support type (disk, tape, CD-ROM)
#
CONFIG_BLK_DEV_SD=m
# CONFIG_CHR_DEV_ST is not set
# CONFIG_CHR_DEV_OSST is not set
CONFIG_BLK_DEV_SR=m
CONFIG_BLK_DEV_SR_VENDOR=y
CONFIG_CHR_DEV_SG=m
CONFIG_CHR_DEV_SCH=m

#
# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
#
CONFIG_SCSI_MULTI_LUN=y
# CONFIG_SCSI_CONSTANTS is not set
CONFIG_SCSI_LOGGING=y
CONFIG_SCSI_SCAN_ASYNC=y
CONFIG_SCSI_WAIT_SCAN=m

#
# SCSI Transports
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
# CONFIG_SCSI_FC_TGT_ATTRS is not set
CONFIG_SCSI_ISCSI_ATTRS=m
CONFIG_SCSI_SAS_ATTRS=m
CONFIG_SCSI_SAS_LIBSAS=m
CONFIG_SCSI_SAS_ATA=y
CONFIG_SCSI_SAS_HOST_SMP=y
# CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set
CONFIG_SCSI_SRP_ATTRS=m
# CONFIG_SCSI_SRP_TGT_ATTRS is not set
CONFIG_SCSI_LOWLEVEL=y
CONFIG_ISCSI_TCP=m
CONFIG_BLK_DEV_3W_XXXX_RAID=m
CONFIG_SCSI_3W_9XXX=m
# CONFIG_SCSI_7000FASST is not set
CONFIG_SCSI_ACARD=m
# CONFIG_SCSI_AHA152X is not set
# CONFIG_SCSI_AHA1542 is not set
# CONFIG_SCSI_AACRAID is not set
CONFIG_SCSI_AIC7XXX=m
CONFIG_AIC7XXX_CMDS_PER_DEVICE=4
CONFIG_AIC7XXX_RESET_DELAY_MS=15000
# CONFIG_AIC7XXX_DEBUG_ENABLE is not set
CONFIG_AIC7XXX_DEBUG_MASK=0
# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set
CONFIG_SCSI_AIC7XXX_OLD=m
CONFIG_SCSI_AIC79XX=m
CONFIG_AIC79XX_CMDS_PER_DEVICE=4
CONFIG_AIC79XX_RESET_DELAY_MS=15000
# CONFIG_AIC79XX_DEBUG_ENABLE is not set
CONFIG_AIC79XX_DEBUG_MASK=0
# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
CONFIG_SCSI_AIC94XX=m
# CONFIG_AIC94XX_DEBUG is not set
# CONFIG_SCSI_DPT_I2O is not set
CONFIG_SCSI_ADVANSYS=m
# CONFIG_SCSI_IN2000 is not set
# CONFIG_SCSI_ARCMSR is not set
CONFIG_MEGARAID_NEWGEN=y
CONFIG_MEGARAID_MM=m
CONFIG_MEGARAID_MAILBOX=m
CONFIG_MEGARAID_LEGACY=m
CONFIG_MEGARAID_SAS=m
CONFIG_SCSI_HPTIOP=m
CONFIG_SCSI_BUSLOGIC=m
# CONFIG_SCSI_FLASHPOINT is not set
# CONFIG_SCSI_DMX3191D is not set
# CONFIG_SCSI_DTC3280 is not set
# CONFIG_SCSI_EATA is not set
# CONFIG_SCSI_FUTURE_DOMAIN is not set
CONFIG_SCSI_GDTH=m
# CONFIG_SCSI_GENERIC_NCR5380 is not set
# CONFIG_SCSI_GENERIC_NCR5380_MMIO is not set
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
CONFIG_SCSI_INIA100=m
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
# CONFIG_SCSI_IZIP_SLOW_CTR is not set
# CONFIG_SCSI_MVSAS is not set
# CONFIG_SCSI_NCR53C406A is not set
CONFIG_SCSI_STEX=m
CONFIG_SCSI_SYM53C8XX_2=m
CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
CONFIG_SCSI_SYM53C8XX_MMIO=y
# CONFIG_SCSI_IPR is not set
# CONFIG_SCSI_PAS16 is not set
# CONFIG_SCSI_QLOGIC_FAS is not set
# CONFIG_SCSI_QLOGIC_1280 is not set
# CONFIG_SCSI_QLA_FC is not set
# CONFIG_SCSI_QLA_ISCSI is not set
# CONFIG_SCSI_LPFC is not set
# CONFIG_SCSI_SYM53C416 is not set
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_T128 is not set
# CONFIG_SCSI_U14_34F is not set
# CONFIG_SCSI_ULTRASTOR is not set
# CONFIG_SCSI_NSP32 is not set
# CONFIG_SCSI_DEBUG is not set
# CONFIG_SCSI_SRP is not set
CONFIG_SCSI_LOWLEVEL_PCMCIA=y
# CONFIG_PCMCIA_AHA152X is not set
# CONFIG_PCMCIA_FDOMAIN is not set
# CONFIG_PCMCIA_NINJA_SCSI is not set
CONFIG_PCMCIA_QLOGIC=m
# CONFIG_PCMCIA_SYM53C500 is not set
# CONFIG_SCSI_DH is not set
CONFIG_ATA=m
# CONFIG_ATA_NONSTANDARD is not set
CONFIG_ATA_ACPI=y
CONFIG_SATA_PMP=y
CONFIG_SATA_AHCI=m
# CONFIG_SATA_SIL24 is not set
CONFIG_ATA_SFF=y
# CONFIG_SATA_SVW is not set
CONFIG_ATA_PIIX=m
# CONFIG_SATA_MV is not set
CONFIG_SATA_NV=m
# CONFIG_PDC_ADMA is not set
# CONFIG_SATA_QSTOR is not set
# CONFIG_SATA_PROMISE is not set
# CONFIG_SATA_SX4 is not set
# CONFIG_SATA_SIL is not set
CONFIG_SATA_SIS=m
# CONFIG_SATA_ULI is not set
# CONFIG_SATA_VIA is not set
# CONFIG_SATA_VITESSE is not set
# CONFIG_SATA_INIC162X is not set
# CONFIG_PATA_ACPI is not set
# CONFIG_PATA_ALI is not set
# CONFIG_PATA_AMD is not set
# CONFIG_PATA_ARTOP is not set
CONFIG_PATA_ATIIXP=m
# CONFIG_PATA_CMD640_PCI is not set
# CONFIG_PATA_CMD64X is not set
# CONFIG_PATA_CS5520 is not set
# CONFIG_PATA_CS5530 is not set
# CONFIG_PATA_CS5535 is not set
# CONFIG_PATA_CS5536 is not set
# CONFIG_PATA_CYPRESS is not set
# CONFIG_PATA_EFAR is not set
CONFIG_ATA_GENERIC=m
# CONFIG_PATA_HPT366 is not set
# CONFIG_PATA_HPT37X is not set
# CONFIG_PATA_HPT3X2N is not set
# CONFIG_PATA_HPT3X3 is not set
# CONFIG_PATA_ISAPNP is not set
# CONFIG_PATA_IT821X is not set
# CONFIG_PATA_IT8213 is not set
# CONFIG_PATA_JMICRON is not set
# CONFIG_PATA_LEGACY is not set
# CONFIG_PATA_TRIFLEX is not set
# CONFIG_PATA_MARVELL is not set
CONFIG_PATA_MPIIX=m
# CONFIG_PATA_OLDPIIX is not set
# CONFIG_PATA_NETCELL is not set
# CONFIG_PATA_NINJA32 is not set
# CONFIG_PATA_NS87410 is not set
# CONFIG_PATA_NS87415 is not set
# CONFIG_PATA_OPTI is not set
# CONFIG_PATA_OPTIDMA is not set
CONFIG_PATA_PCMCIA=m
# CONFIG_PATA_PDC_OLD is not set
# CONFIG_PATA_QDI is not set
# CONFIG_PATA_RADISYS is not set
# CONFIG_PATA_RZ1000 is not set
# CONFIG_PATA_SC1200 is not set
# CONFIG_PATA_SERVERWORKS is not set
# CONFIG_PATA_PDC2027X is not set
# CONFIG_PATA_SIL680 is not set
CONFIG_PATA_SIS=m
CONFIG_PATA_VIA=m
# CONFIG_PATA_WINBOND is not set
# CONFIG_PATA_WINBOND_VLB is not set
# CONFIG_PATA_SCH is not set
CONFIG_MD=y
CONFIG_BLK_DEV_MD=m
# CONFIG_MD_LINEAR is not set
# CONFIG_MD_RAID0 is not set
# CONFIG_MD_RAID1 is not set
# CONFIG_MD_RAID10 is not set
CONFIG_MD_RAID456=m
CONFIG_MD_RAID5_RESHAPE=y
# CONFIG_MD_MULTIPATH is not set
# CONFIG_MD_FAULTY is not set
CONFIG_BLK_DEV_DM=m
CONFIG_DM_DEBUG=y
# CONFIG_DM_CRYPT is not set
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
CONFIG_DM_ZERO=m
CONFIG_DM_MULTIPATH=m
# CONFIG_DM_DELAY is not set
# CONFIG_DM_UEVENT is not set
CONFIG_FUSION=y
CONFIG_FUSION_SPI=m
CONFIG_FUSION_FC=m
# CONFIG_FUSION_SAS is not set
CONFIG_FUSION_MAX_SGE=40
CONFIG_FUSION_CTL=m
CONFIG_FUSION_LAN=m
CONFIG_FUSION_LOGGING=y

#
# IEEE 1394 (FireWire) support
#

#
# Enable only one of the two stacks, unless you know what you are doing
#
CONFIG_FIREWIRE=m
CONFIG_FIREWIRE_OHCI=m
CONFIG_FIREWIRE_OHCI_DEBUG=y
CONFIG_FIREWIRE_SBP2=m
# CONFIG_IEEE1394 is not set
CONFIG_I2O=m
# CONFIG_I2O_LCT_NOTIFY_ON_CHANGES is not set
CONFIG_I2O_EXT_ADAPTEC=y
CONFIG_I2O_CONFIG=m
CONFIG_I2O_CONFIG_OLD_IOCTL=y
CONFIG_I2O_BUS=m
CONFIG_I2O_BLOCK=m
CONFIG_I2O_SCSI=m
CONFIG_I2O_PROC=m
# CONFIG_MACINTOSH_DRIVERS is not set
CONFIG_NETDEVICES=y
CONFIG_IFB=m
CONFIG_DUMMY=m
CONFIG_BONDING=m
# CONFIG_MACVLAN is not set
# CONFIG_EQUALIZER is not set
CONFIG_TUN=m
# CONFIG_VETH is not set
CONFIG_NET_SB1000=m
# CONFIG_ARCNET is not set
CONFIG_PHYLIB=m

#
# MII PHY device drivers
#
# CONFIG_MARVELL_PHY is not set
# CONFIG_DAVICOM_PHY is not set
# CONFIG_QSEMI_PHY is not set
CONFIG_LXT_PHY=m
# CONFIG_CICADA_PHY is not set
# CONFIG_VITESSE_PHY is not set
# CONFIG_SMSC_PHY is not set
# CONFIG_BROADCOM_PHY is not set
# CONFIG_ICPLUS_PHY is not set
# CONFIG_REALTEK_PHY is not set
# CONFIG_MDIO_BITBANG is not set
CONFIG_NET_ETHERNET=y
CONFIG_MII=m
# CONFIG_HAPPYMEAL is not set
# CONFIG_SUNGEM is not set
# CONFIG_CASSINI is not set
CONFIG_NET_VENDOR_3COM=y
# CONFIG_EL1 is not set
# CONFIG_EL2 is not set
# CONFIG_ELPLUS is not set
# CONFIG_EL16 is not set
CONFIG_EL3=m
# CONFIG_3C515 is not set
CONFIG_VORTEX=m
CONFIG_TYPHOON=m
# CONFIG_LANCE is not set
CONFIG_NET_VENDOR_SMC=y
# CONFIG_WD80x3 is not set
# CONFIG_ULTRA is not set
# CONFIG_SMC9194 is not set
# CONFIG_NET_VENDOR_RACAL is not set
CONFIG_NET_TULIP=y
CONFIG_DE2104X=m
CONFIG_TULIP=m
# CONFIG_TULIP_MWI is not set
CONFIG_TULIP_MMIO=y
# CONFIG_TULIP_NAPI is not set
CONFIG_DE4X5=m
CONFIG_WINBOND_840=m
CONFIG_DM9102=m
CONFIG_ULI526X=m
CONFIG_PCMCIA_XIRCOM=m
# CONFIG_AT1700 is not set
# CONFIG_DEPCA is not set
# CONFIG_HP100 is not set
CONFIG_NET_ISA=y
# CONFIG_E2100 is not set
# CONFIG_EWRK3 is not set
# CONFIG_EEXPRESS is not set
# CONFIG_EEXPRESS_PRO is not set
# CONFIG_HPLAN_PLUS is not set
# CONFIG_HPLAN is not set
# CONFIG_LP486E is not set
# CONFIG_ETH16I is not set
CONFIG_NE2000=m
# CONFIG_ZNET is not set
# CONFIG_SEEQ8005 is not set
# CONFIG_IBM_NEW_EMAC_ZMII is not set
# CONFIG_IBM_NEW_EMAC_RGMII is not set
# CONFIG_IBM_NEW_EMAC_TAH is not set
# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
CONFIG_NET_PCI=y
CONFIG_PCNET32=m
CONFIG_AMD8111_ETH=m
CONFIG_ADAPTEC_STARFIRE=m
# CONFIG_AC3200 is not set
# CONFIG_APRICOT is not set
CONFIG_B44=m
CONFIG_B44_PCI_AUTOSELECT=y
CONFIG_B44_PCICORE_AUTOSELECT=y
CONFIG_B44_PCI=y
CONFIG_FORCEDETH=m
CONFIG_FORCEDETH_NAPI=y
# CONFIG_CS89x0 is not set
# CONFIG_EEPRO100 is not set
CONFIG_E100=m
# CONFIG_FEALNX is not set
# CONFIG_NATSEMI is not set
CONFIG_NE2K_PCI=m
# CONFIG_8139CP is not set
CONFIG_8139TOO=m
# CONFIG_8139TOO_PIO is not set
# CONFIG_8139TOO_TUNE_TWISTER is not set
CONFIG_8139TOO_8129=y
# CONFIG_8139_OLD_RX_RESET is not set
# CONFIG_R6040 is not set
CONFIG_SIS900=m
# CONFIG_EPIC100 is not set
# CONFIG_SUNDANCE is not set
# CONFIG_TLAN is not set
CONFIG_VIA_RHINE=m
CONFIG_VIA_RHINE_MMIO=y
# CONFIG_SC92031 is not set
CONFIG_NET_POCKET=y
CONFIG_ATP=m
CONFIG_DE600=m
CONFIG_DE620=m
# CONFIG_ATL2 is not set
CONFIG_NETDEV_1000=y
CONFIG_ACENIC=m
# CONFIG_ACENIC_OMIT_TIGON_I is not set
# CONFIG_DL2K is not set
CONFIG_E1000=m
CONFIG_E1000E=m
# CONFIG_IP1000 is not set
# CONFIG_IGB is not set
# CONFIG_NS83820 is not set
# CONFIG_HAMACHI is not set
# CONFIG_YELLOWFIN is not set
CONFIG_R8169=m
CONFIG_R8169_VLAN=y
# CONFIG_SIS190 is not set
CONFIG_SKGE=m
# CONFIG_SKGE_DEBUG is not set
CONFIG_SKY2=m
# CONFIG_SKY2_DEBUG is not set
CONFIG_VIA_VELOCITY=m
# CONFIG_TIGON3 is not set
# CONFIG_BNX2 is not set
# CONFIG_QLA3XXX is not set
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
# CONFIG_JME is not set
# CONFIG_NETDEV_10000 is not set
# CONFIG_TR is not set

#
# Wireless LAN
#
# CONFIG_WLAN_PRE80211 is not set
# CONFIG_WLAN_80211 is not set
# CONFIG_IWLWIFI_LEDS is not set

#
# USB Network Adapters
#
# CONFIG_USB_CATC is not set
# CONFIG_USB_KAWETH is not set
# CONFIG_USB_PEGASUS is not set
# CONFIG_USB_RTL8150 is not set
CONFIG_USB_USBNET=m
CONFIG_USB_NET_AX8817X=m
CONFIG_USB_NET_CDCETHER=m
CONFIG_USB_NET_DM9601=m
# CONFIG_USB_NET_SMSC95XX is not set
CONFIG_USB_NET_GL620A=m
CONFIG_USB_NET_NET1080=m
# CONFIG_USB_NET_PLUSB is not set
# CONFIG_USB_NET_MCS7830 is not set
# CONFIG_USB_NET_RNDIS_HOST is not set
CONFIG_USB_NET_CDC_SUBSET=m
CONFIG_USB_ALI_M5632=y
CONFIG_USB_AN2720=y
CONFIG_USB_BELKIN=y
CONFIG_USB_ARMLINUX=y
CONFIG_USB_EPSON2888=y
CONFIG_USB_KC2190=y
# CONFIG_USB_NET_ZAURUS is not set
CONFIG_NET_PCMCIA=y
# CONFIG_PCMCIA_3C589 is not set
# CONFIG_PCMCIA_3C574 is not set
# CONFIG_PCMCIA_FMVJ18X is not set
CONFIG_PCMCIA_PCNET=m
CONFIG_PCMCIA_NMCLAN=m
CONFIG_PCMCIA_SMC91C92=m
# CONFIG_PCMCIA_XIRC2PS is not set
# CONFIG_PCMCIA_AXNET is not set
# CONFIG_WAN is not set
CONFIG_ATM_DRIVERS=y
# CONFIG_ATM_DUMMY is not set
CONFIG_ATM_TCP=m
# CONFIG_ATM_LANAI is not set
# CONFIG_ATM_ENI is not set
# CONFIG_ATM_FIRESTREAM is not set
# CONFIG_ATM_ZATM is not set
# CONFIG_ATM_NICSTAR is not set
CONFIG_ATM_IDT77252=m
# CONFIG_ATM_IDT77252_DEBUG is not set
# CONFIG_ATM_IDT77252_RCV_ALL is not set
CONFIG_ATM_IDT77252_USE_SUNI=y
# CONFIG_ATM_AMBASSADOR is not set
# CONFIG_ATM_HORIZON is not set
# CONFIG_ATM_IA is not set
# CONFIG_ATM_FORE200E is not set
# CONFIG_ATM_HE is not set
CONFIG_FDDI=y
# CONFIG_DEFXX is not set
# CONFIG_SKFP is not set
# CONFIG_HIPPI is not set
CONFIG_PLIP=m
CONFIG_PPP=m
CONFIG_PPP_MULTILINK=y
CONFIG_PPP_FILTER=y
CONFIG_PPP_ASYNC=m
CONFIG_PPP_SYNC_TTY=m
CONFIG_PPP_DEFLATE=m
# CONFIG_PPP_BSDCOMP is not set
# CONFIG_PPP_MPPE is not set
CONFIG_PPPOE=m
# CONFIG_PPPOATM is not set
# CONFIG_PPPOL2TP is not set
CONFIG_SLIP=m
CONFIG_SLIP_COMPRESSED=y
CONFIG_SLHC=m
CONFIG_SLIP_SMART=y
# CONFIG_SLIP_MODE_SLIP6 is not set
CONFIG_NET_FC=y
CONFIG_NETCONSOLE=m
# CONFIG_NETCONSOLE_DYNAMIC is not set
CONFIG_NETPOLL=y
CONFIG_NETPOLL_TRAP=y
CONFIG_NET_POLL_CONTROLLER=y
# CONFIG_ISDN is not set
# CONFIG_PHONE is not set

#
# Input device support
#
CONFIG_INPUT=y
CONFIG_INPUT_FF_MEMLESS=y
CONFIG_INPUT_POLLDEV=m

#
# Userland interfaces
#
CONFIG_INPUT_MOUSEDEV=y
# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
# CONFIG_INPUT_JOYDEV is not set
CONFIG_INPUT_EVDEV=y
# CONFIG_INPUT_EVBUG is not set

#
# Input Device Drivers
#
CONFIG_INPUT_KEYBOARD=y
CONFIG_KEYBOARD_ATKBD=y
# CONFIG_KEYBOARD_SUNKBD is not set
# CONFIG_KEYBOARD_LKKBD is not set
# CONFIG_KEYBOARD_XTKBD is not set
# CONFIG_KEYBOARD_NEWTON is not set
# CONFIG_KEYBOARD_STOWAWAY is not set
CONFIG_INPUT_MOUSE=y
CONFIG_MOUSE_PS2=y
CONFIG_MOUSE_PS2_ALPS=y
CONFIG_MOUSE_PS2_LOGIPS2PP=y
CONFIG_MOUSE_PS2_SYNAPTICS=y
CONFIG_MOUSE_PS2_LIFEBOOK=y
CONFIG_MOUSE_PS2_TRACKPOINT=y
# CONFIG_MOUSE_PS2_TOUCHKIT is not set
CONFIG_MOUSE_SERIAL=m
CONFIG_MOUSE_APPLETOUCH=m
# CONFIG_MOUSE_BCM5974 is not set
# CONFIG_MOUSE_INPORT is not set
# CONFIG_MOUSE_LOGIBM is not set
# CONFIG_MOUSE_PC110PAD is not set
CONFIG_MOUSE_VSXXXAA=m
# CONFIG_INPUT_JOYSTICK is not set
# CONFIG_INPUT_TABLET is not set
# CONFIG_INPUT_TOUCHSCREEN is not set
CONFIG_INPUT_MISC=y
# CONFIG_INPUT_PCSPKR is not set
# CONFIG_INPUT_APANEL is not set
# CONFIG_INPUT_WISTRON_BTNS is not set
# CONFIG_INPUT_ATLAS_BTNS is not set
# CONFIG_INPUT_ATI_REMOTE is not set
# CONFIG_INPUT_ATI_REMOTE2 is not set
# CONFIG_INPUT_KEYSPAN_REMOTE is not set
# CONFIG_INPUT_POWERMATE is not set
# CONFIG_INPUT_YEALINK is not set
# CONFIG_INPUT_CM109 is not set
CONFIG_INPUT_UINPUT=m

#
# Hardware I/O ports
#
CONFIG_SERIO=y
CONFIG_SERIO_I8042=y
CONFIG_SERIO_SERPORT=y
# CONFIG_SERIO_CT82C710 is not set
# CONFIG_SERIO_PARKBD is not set
# CONFIG_SERIO_PCIPS2 is not set
CONFIG_SERIO_LIBPS2=y
CONFIG_SERIO_RAW=m
# CONFIG_GAMEPORT is not set

#
# Character devices
#
CONFIG_VT=y
CONFIG_CONSOLE_TRANSLATIONS=y
CONFIG_VT_CONSOLE=y
CONFIG_HW_CONSOLE=y
CONFIG_VT_HW_CONSOLE_BINDING=y
CONFIG_DEVKMEM=y
CONFIG_SERIAL_NONSTANDARD=y
# CONFIG_COMPUTONE is not set
CONFIG_ROCKETPORT=m
CONFIG_CYCLADES=m
# CONFIG_CYZ_INTR is not set
# CONFIG_DIGIEPCA is not set
# CONFIG_ESPSERIAL is not set
# CONFIG_MOXA_INTELLIO is not set
# CONFIG_MOXA_SMARTIO is not set
# CONFIG_ISI is not set
# CONFIG_SYNCLINK is not set
CONFIG_SYNCLINKMP=m
CONFIG_SYNCLINK_GT=m
# CONFIG_N_HDLC is not set
# CONFIG_RISCOM8 is not set
# CONFIG_SPECIALIX is not set
# CONFIG_SX is not set
# CONFIG_RIO is not set
# CONFIG_STALDRV is not set
# CONFIG_NOZOMI is not set

#
# Serial drivers
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_FIX_EARLYCON_MEM=y
CONFIG_SERIAL_8250_PCI=y
CONFIG_SERIAL_8250_PNP=y
CONFIG_SERIAL_8250_CS=m
CONFIG_SERIAL_8250_NR_UARTS=32
CONFIG_SERIAL_8250_RUNTIME_UARTS=4
CONFIG_SERIAL_8250_EXTENDED=y
CONFIG_SERIAL_8250_MANY_PORTS=y
# CONFIG_SERIAL_8250_FOURPORT is not set
# CONFIG_SERIAL_8250_ACCENT is not set
# CONFIG_SERIAL_8250_BOCA is not set
# CONFIG_SERIAL_8250_EXAR_ST16C554 is not set
# CONFIG_SERIAL_8250_HUB6 is not set
CONFIG_SERIAL_8250_SHARE_IRQ=y
CONFIG_SERIAL_8250_DETECT_IRQ=y
CONFIG_SERIAL_8250_RSA=y

#
# Non-8250 serial port support
#
CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
CONFIG_SERIAL_JSM=m
CONFIG_UNIX98_PTYS=y
# CONFIG_LEGACY_PTYS is not set
CONFIG_PRINTER=m
CONFIG_LP_CONSOLE=y
CONFIG_PPDEV=m
CONFIG_IPMI_HANDLER=m
# CONFIG_IPMI_PANIC_EVENT is not set
CONFIG_IPMI_DEVICE_INTERFACE=m
CONFIG_IPMI_SI=m
CONFIG_IPMI_WATCHDOG=m
CONFIG_IPMI_POWEROFF=m
CONFIG_HW_RANDOM=y
CONFIG_HW_RANDOM_INTEL=m
CONFIG_HW_RANDOM_AMD=m
CONFIG_HW_RANDOM_GEODE=m
CONFIG_HW_RANDOM_VIA=m
CONFIG_NVRAM=y
CONFIG_RTC=y
# CONFIG_DTLK is not set
# CONFIG_R3964 is not set
# CONFIG_APPLICOM is not set
# CONFIG_SONYPI is not set

#
# PCMCIA character devices
#
# CONFIG_SYNCLINK_CS is not set
CONFIG_CARDMAN_4000=m
CONFIG_CARDMAN_4040=m
# CONFIG_IPWIRELESS is not set
CONFIG_MWAVE=m
# CONFIG_PC8736x_GPIO is not set
# CONFIG_NSC_GPIO is not set
# CONFIG_CS5535_GPIO is not set
# CONFIG_RAW_DRIVER is not set
CONFIG_HPET=y
# CONFIG_HPET_MMAP is not set
CONFIG_HANGCHECK_TIMER=m
# CONFIG_TCG_TPM is not set
# CONFIG_TELCLOCK is not set
CONFIG_DEVPORT=y
CONFIG_I2C=m
CONFIG_I2C_BOARDINFO=y
CONFIG_I2C_CHARDEV=m
CONFIG_I2C_HELPER_AUTO=y
CONFIG_I2C_ALGOBIT=m
CONFIG_I2C_ALGOPCA=m

#
# I2C Hardware Bus support
#

#
# PC SMBus host controller drivers
#
CONFIG_I2C_ALI1535=m
CONFIG_I2C_ALI1563=m
CONFIG_I2C_ALI15X3=m
CONFIG_I2C_AMD756=m
CONFIG_I2C_AMD756_S4882=m
# CONFIG_I2C_AMD8111 is not set
CONFIG_I2C_I801=m
# CONFIG_I2C_ISCH is not set
CONFIG_I2C_PIIX4=m
CONFIG_I2C_NFORCE2=m
# CONFIG_I2C_NFORCE2_S4985 is not set
# CONFIG_I2C_SIS5595 is not set
# CONFIG_I2C_SIS630 is not set
# CONFIG_I2C_SIS96X is not set
CONFIG_I2C_VIA=m
CONFIG_I2C_VIAPRO=m

#
# I2C system bus drivers (mostly embedded / system-on-chip)
#
# CONFIG_I2C_OCORES is not set
CONFIG_I2C_SIMTEC=m

#
# External I2C/SMBus adapter drivers
#
CONFIG_I2C_PARPORT=m
CONFIG_I2C_PARPORT_LIGHT=m
# CONFIG_I2C_TAOS_EVM is not set
# CONFIG_I2C_TINY_USB is not set

#
# Graphics adapter I2C/DDC channel drivers
#
CONFIG_I2C_VOODOO3=m

#
# Other I2C/SMBus bus drivers
#
CONFIG_I2C_PCA_ISA=m
# CONFIG_I2C_PCA_PLATFORM is not set
CONFIG_I2C_STUB=m
# CONFIG_SCx200_ACB is not set

#
# Miscellaneous I2C Chip support
#
# CONFIG_DS1682 is not set
# CONFIG_AT24 is not set
CONFIG_SENSORS_EEPROM=m
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_PCF8575 is not set
# CONFIG_SENSORS_PCA9539 is not set
# CONFIG_SENSORS_PCF8591 is not set
CONFIG_SENSORS_MAX6875=m
# CONFIG_SENSORS_TSL2550 is not set
# CONFIG_I2C_DEBUG_CORE is not set
# CONFIG_I2C_DEBUG_ALGO is not set
# CONFIG_I2C_DEBUG_BUS is not set
# CONFIG_I2C_DEBUG_CHIP is not set
# CONFIG_SPI is not set
CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
# CONFIG_GPIOLIB is not set
# CONFIG_W1 is not set
CONFIG_POWER_SUPPLY=y
# CONFIG_POWER_SUPPLY_DEBUG is not set
# CONFIG_PDA_POWER is not set
# CONFIG_BATTERY_DS2760 is not set
# CONFIG_BATTERY_BQ27x00 is not set
CONFIG_HWMON=m
CONFIG_HWMON_VID=m
# CONFIG_SENSORS_ABITUGURU is not set
# CONFIG_SENSORS_ABITUGURU3 is not set
# CONFIG_SENSORS_AD7414 is not set
CONFIG_SENSORS_AD7418=m
# CONFIG_SENSORS_ADM1021 is not set
# CONFIG_SENSORS_ADM1025 is not set
# CONFIG_SENSORS_ADM1026 is not set
# CONFIG_SENSORS_ADM1029 is not set
# CONFIG_SENSORS_ADM1031 is not set
# CONFIG_SENSORS_ADM9240 is not set
# CONFIG_SENSORS_ADT7470 is not set
# CONFIG_SENSORS_ADT7473 is not set
# CONFIG_SENSORS_K8TEMP is not set
# CONFIG_SENSORS_ASB100 is not set
# CONFIG_SENSORS_ATXP1 is not set
# CONFIG_SENSORS_DS1621 is not set
# CONFIG_SENSORS_I5K_AMB is not set
# CONFIG_SENSORS_F71805F is not set
# CONFIG_SENSORS_F71882FG is not set
# CONFIG_SENSORS_F75375S is not set
# CONFIG_SENSORS_FSCHER is not set
# CONFIG_SENSORS_FSCPOS is not set
# CONFIG_SENSORS_FSCHMD is not set
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
CONFIG_SENSORS_CORETEMP=m
# CONFIG_SENSORS_IBMAEM is not set
# CONFIG_SENSORS_IBMPEX is not set
# CONFIG_SENSORS_IT87 is not set
# CONFIG_SENSORS_LM63 is not set
# CONFIG_SENSORS_LM75 is not set
# CONFIG_SENSORS_LM77 is not set
# CONFIG_SENSORS_LM78 is not set
# CONFIG_SENSORS_LM80 is not set
# CONFIG_SENSORS_LM83 is not set
# CONFIG_SENSORS_LM85 is not set
# CONFIG_SENSORS_LM87 is not set
# CONFIG_SENSORS_LM90 is not set
# CONFIG_SENSORS_LM92 is not set
# CONFIG_SENSORS_LM93 is not set
# CONFIG_SENSORS_MAX1619 is not set
# CONFIG_SENSORS_MAX6650 is not set
# CONFIG_SENSORS_PC87360 is not set
# CONFIG_SENSORS_PC87427 is not set
CONFIG_SENSORS_SIS5595=m
# CONFIG_SENSORS_DME1737 is not set
# CONFIG_SENSORS_SMSC47M1 is not set
# CONFIG_SENSORS_SMSC47M192 is not set
# CONFIG_SENSORS_SMSC47B397 is not set
# CONFIG_SENSORS_ADS7828 is not set
# CONFIG_SENSORS_THMC50 is not set
CONFIG_SENSORS_VIA686A=m
CONFIG_SENSORS_VT1211=m
CONFIG_SENSORS_VT8231=m
# CONFIG_SENSORS_W83781D is not set
# CONFIG_SENSORS_W83791D is not set
# CONFIG_SENSORS_W83792D is not set
# CONFIG_SENSORS_W83793 is not set
# CONFIG_SENSORS_W83L785TS is not set
# CONFIG_SENSORS_W83L786NG is not set
# CONFIG_SENSORS_W83627HF is not set
# CONFIG_SENSORS_W83627EHF is not set
CONFIG_SENSORS_HDAPS=m
# CONFIG_SENSORS_APPLESMC is not set
# CONFIG_HWMON_DEBUG_CHIP is not set
CONFIG_THERMAL=y
# CONFIG_WATCHDOG is not set

#
# Sonics Silicon Backplane
#
CONFIG_SSB_POSSIBLE=y
CONFIG_SSB=m
CONFIG_SSB_SPROM=y
CONFIG_SSB_PCIHOST_POSSIBLE=y
CONFIG_SSB_PCIHOST=y
# CONFIG_SSB_B43_PCI_BRIDGE is not set
CONFIG_SSB_PCMCIAHOST_POSSIBLE=y
CONFIG_SSB_PCMCIAHOST=y
# CONFIG_SSB_DEBUG is not set
CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y
CONFIG_SSB_DRIVER_PCICORE=y

#
# Multifunction device drivers
#
# CONFIG_MFD_CORE is not set
# CONFIG_MFD_SM501 is not set
# CONFIG_HTC_PASIC3 is not set
# CONFIG_MFD_TMIO is not set
# CONFIG_MFD_WM8400 is not set
# CONFIG_MFD_WM8350_I2C is not set

#
# Multimedia devices
#

#
# Multimedia core support
#
CONFIG_VIDEO_DEV=m
CONFIG_VIDEO_V4L2_COMMON=m
CONFIG_VIDEO_ALLOW_V4L1=y
CONFIG_VIDEO_V4L1_COMPAT=y
# CONFIG_DVB_CORE is not set
CONFIG_VIDEO_MEDIA=m

#
# Multimedia drivers
#
# CONFIG_MEDIA_ATTACH is not set
CONFIG_MEDIA_TUNER=m
# CONFIG_MEDIA_TUNER_CUSTOMIZE is not set
CONFIG_MEDIA_TUNER_SIMPLE=m
CONFIG_MEDIA_TUNER_TDA8290=m
CONFIG_MEDIA_TUNER_TDA9887=m
CONFIG_MEDIA_TUNER_TEA5761=m
CONFIG_MEDIA_TUNER_TEA5767=m
CONFIG_MEDIA_TUNER_MT20XX=m
CONFIG_MEDIA_TUNER_XC2028=m
CONFIG_MEDIA_TUNER_XC5000=m
CONFIG_VIDEO_V4L2=m
CONFIG_VIDEO_V4L1=m
CONFIG_VIDEOBUF_GEN=m
CONFIG_VIDEOBUF_DMA_SG=m
CONFIG_VIDEO_BTCX=m
CONFIG_VIDEO_IR=m
CONFIG_VIDEO_TVEEPROM=m
CONFIG_VIDEO_TUNER=m
CONFIG_VIDEO_CAPTURE_DRIVERS=y
# CONFIG_VIDEO_ADV_DEBUG is not set
# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set
# CONFIG_VIDEO_HELPER_CHIPS_AUTO is not set
CONFIG_VIDEO_IR_I2C=m

#
# Encoders/decoders and other helper chips
#

#
# Audio decoders
#
CONFIG_VIDEO_TVAUDIO=m
CONFIG_VIDEO_TDA7432=m
CONFIG_VIDEO_TDA9840=m
CONFIG_VIDEO_TDA9875=m
CONFIG_VIDEO_TEA6415C=m
CONFIG_VIDEO_TEA6420=m
CONFIG_VIDEO_MSP3400=m
# CONFIG_VIDEO_CS5345 is not set
CONFIG_VIDEO_CS53L32A=m
CONFIG_VIDEO_M52790=m
CONFIG_VIDEO_TLV320AIC23B=m
CONFIG_VIDEO_WM8775=m
CONFIG_VIDEO_WM8739=m
CONFIG_VIDEO_VP27SMPX=m

#
# Video decoders
#
CONFIG_VIDEO_BT819=m
CONFIG_VIDEO_BT856=m
CONFIG_VIDEO_BT866=m
CONFIG_VIDEO_KS0127=m
CONFIG_VIDEO_OV7670=m
# CONFIG_VIDEO_TCM825X is not set
CONFIG_VIDEO_SAA7110=m
CONFIG_VIDEO_SAA7111=m
CONFIG_VIDEO_SAA7114=m
CONFIG_VIDEO_SAA711X=m
CONFIG_VIDEO_SAA717X=m
CONFIG_VIDEO_SAA7191=m
CONFIG_VIDEO_TVP5150=m
CONFIG_VIDEO_VPX3220=m

#
# Video and audio decoders
#
CONFIG_VIDEO_CX25840=m

#
# MPEG video encoders
#
CONFIG_VIDEO_CX2341X=m

#
# Video encoders
#
CONFIG_VIDEO_SAA7127=m
CONFIG_VIDEO_SAA7185=m
CONFIG_VIDEO_ADV7170=m
CONFIG_VIDEO_ADV7175=m

#
# Video improvement chips
#
CONFIG_VIDEO_UPD64031A=m
CONFIG_VIDEO_UPD64083=m
# CONFIG_VIDEO_VIVI is not set
CONFIG_VIDEO_BT848=m
# CONFIG_VIDEO_SAA6588 is not set
# CONFIG_VIDEO_PMS is not set
# CONFIG_VIDEO_BWQCAM is not set
# CONFIG_VIDEO_CQCAM is not set
# CONFIG_VIDEO_W9966 is not set
CONFIG_VIDEO_CPIA=m
CONFIG_VIDEO_CPIA_PP=m
CONFIG_VIDEO_CPIA_USB=m
CONFIG_VIDEO_CPIA2=m
# CONFIG_VIDEO_SAA5246A is not set
# CONFIG_VIDEO_SAA5249 is not set
# CONFIG_VIDEO_STRADIS is not set
CONFIG_VIDEO_ZORAN=m
# CONFIG_VIDEO_ZORAN_DC30 is not set
CONFIG_VIDEO_ZORAN_ZR36060=m
CONFIG_VIDEO_ZORAN_BUZ=m
# CONFIG_VIDEO_ZORAN_DC10 is not set
CONFIG_VIDEO_ZORAN_LML33=m
# CONFIG_VIDEO_ZORAN_LML33R10 is not set
# CONFIG_VIDEO_ZORAN_AVS6EYES is not set
# CONFIG_VIDEO_SAA7134 is not set
# CONFIG_VIDEO_MXB is not set
# CONFIG_VIDEO_HEXIUM_ORION is not set
# CONFIG_VIDEO_HEXIUM_GEMINI is not set
# CONFIG_VIDEO_CX88 is not set
CONFIG_VIDEO_IVTV=m
# CONFIG_VIDEO_FB_IVTV is not set
# CONFIG_VIDEO_CAFE_CCIC is not set
# CONFIG_SOC_CAMERA is not set
# CONFIG_V4L_USB_DRIVERS is not set
CONFIG_RADIO_ADAPTERS=y
# CONFIG_RADIO_CADET is not set
# CONFIG_RADIO_RTRACK is not set
# CONFIG_RADIO_RTRACK2 is not set
# CONFIG_RADIO_AZTECH is not set
# CONFIG_RADIO_GEMTEK is not set
# CONFIG_RADIO_GEMTEK_PCI is not set
CONFIG_RADIO_MAXIRADIO=m
CONFIG_RADIO_MAESTRO=m
# CONFIG_RADIO_SF16FMI is not set
# CONFIG_RADIO_SF16FMR2 is not set
# CONFIG_RADIO_TERRATEC is not set
# CONFIG_RADIO_TRUST is not set
# CONFIG_RADIO_TYPHOON is not set
# CONFIG_RADIO_ZOLTRIX is not set
CONFIG_USB_DSBR=m
# CONFIG_USB_SI470X is not set
# CONFIG_USB_MR800 is not set
CONFIG_DAB=y
CONFIG_USB_DABUSB=m

#
# Graphics support
#
CONFIG_AGP=y
CONFIG_AGP_ALI=y
CONFIG_AGP_ATI=y
# CONFIG_AGP_AMD is not set
# CONFIG_AGP_AMD64 is not set
CONFIG_AGP_INTEL=y
CONFIG_AGP_NVIDIA=y
CONFIG_AGP_SIS=y
# CONFIG_AGP_SWORKS is not set
CONFIG_AGP_VIA=y
CONFIG_AGP_EFFICEON=y
CONFIG_DRM=m
CONFIG_DRM_TDFX=m
CONFIG_DRM_R128=m
CONFIG_DRM_RADEON=m
CONFIG_DRM_I810=m
CONFIG_DRM_I830=m
CONFIG_DRM_I915=m
# CONFIG_DRM_MGA is not set
CONFIG_DRM_SIS=m
# CONFIG_DRM_VIA is not set
# CONFIG_DRM_SAVAGE is not set
CONFIG_VGASTATE=m
CONFIG_VIDEO_OUTPUT_CONTROL=m
CONFIG_FB=y
# CONFIG_FIRMWARE_EDID is not set
CONFIG_FB_DDC=m
CONFIG_FB_BOOT_VESA_SUPPORT=y
CONFIG_FB_CFB_FILLRECT=y
CONFIG_FB_CFB_COPYAREA=y
CONFIG_FB_CFB_IMAGEBLIT=y
# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
# CONFIG_FB_SYS_FILLRECT is not set
# CONFIG_FB_SYS_COPYAREA is not set
# CONFIG_FB_SYS_IMAGEBLIT is not set
# CONFIG_FB_FOREIGN_ENDIAN is not set
# CONFIG_FB_SYS_FOPS is not set
CONFIG_FB_SVGALIB=m
# CONFIG_FB_MACMODES is not set
CONFIG_FB_BACKLIGHT=y
CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y

#
# Frame buffer hardware drivers
#
# CONFIG_FB_CIRRUS is not set
# CONFIG_FB_PM2 is not set
# CONFIG_FB_CYBER2000 is not set
# CONFIG_FB_ARC is not set
# CONFIG_FB_ASILIANT is not set
# CONFIG_FB_IMSTT is not set
# CONFIG_FB_VGA16 is not set
# CONFIG_FB_UVESA is not set
CONFIG_FB_VESA=y
# CONFIG_FB_EFI is not set
# CONFIG_FB_N411 is not set
# CONFIG_FB_HGA is not set
# CONFIG_FB_S1D13XXX is not set
CONFIG_FB_NVIDIA=m
CONFIG_FB_NVIDIA_I2C=y
# CONFIG_FB_NVIDIA_DEBUG is not set
CONFIG_FB_NVIDIA_BACKLIGHT=y
# CONFIG_FB_RIVA is not set
# CONFIG_FB_I810 is not set
# CONFIG_FB_LE80578 is not set
# CONFIG_FB_INTEL is not set
# CONFIG_FB_MATROX is not set
CONFIG_FB_RADEON=m
CONFIG_FB_RADEON_I2C=y
CONFIG_FB_RADEON_BACKLIGHT=y
# CONFIG_FB_RADEON_DEBUG is not set
# CONFIG_FB_ATY128 is not set
# CONFIG_FB_ATY is not set
CONFIG_FB_S3=m
CONFIG_FB_SAVAGE=m
CONFIG_FB_SAVAGE_I2C=y
CONFIG_FB_SAVAGE_ACCEL=y
# CONFIG_FB_SIS is not set
# CONFIG_FB_VIA is not set
# CONFIG_FB_NEOMAGIC is not set
# CONFIG_FB_KYRO is not set
# CONFIG_FB_3DFX is not set
# CONFIG_FB_VOODOO1 is not set
# CONFIG_FB_VT8623 is not set
# CONFIG_FB_CYBLA is not set
CONFIG_FB_TRIDENT=m
CONFIG_FB_TRIDENT_ACCEL=y
# CONFIG_FB_ARK is not set
# CONFIG_FB_PM3 is not set
# CONFIG_FB_CARMINE is not set
# CONFIG_FB_GEODE is not set
# CONFIG_FB_VIRTUAL is not set
# CONFIG_FB_METRONOME is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_LCD_CLASS_DEVICE=m
# CONFIG_LCD_ILI9320 is not set
# CONFIG_LCD_PLATFORM is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
# CONFIG_BACKLIGHT_CORGI is not set
CONFIG_BACKLIGHT_PROGEAR=m
# CONFIG_BACKLIGHT_MBP_NVIDIA is not set

#
# Display device support
#
CONFIG_DISPLAY_SUPPORT=m

#
# Display hardware drivers
#

#
# Console display driver support
#
CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
# CONFIG_MDA_CONSOLE is not set
CONFIG_DUMMY_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
# CONFIG_FONTS is not set
CONFIG_FONT_8x8=y
CONFIG_FONT_8x16=y
CONFIG_LOGO=y
# CONFIG_LOGO_LINUX_MONO is not set
# CONFIG_LOGO_LINUX_VGA16 is not set
CONFIG_LOGO_LINUX_CLUT224=y
# CONFIG_SOUND is not set
CONFIG_HID_SUPPORT=y
CONFIG_HID=y
CONFIG_HID_DEBUG=y
# CONFIG_HIDRAW is not set

#
# USB Input Devices
#
CONFIG_USB_HID=y
CONFIG_HID_PID=y
CONFIG_USB_HIDDEV=y

#
# Special HID drivers
#
CONFIG_HID_COMPAT=y
CONFIG_HID_A4TECH=y
CONFIG_HID_APPLE=y
CONFIG_HID_BELKIN=y
CONFIG_HID_BRIGHT=y
CONFIG_HID_CHERRY=y
CONFIG_HID_CHICONY=y
CONFIG_HID_CYPRESS=y
CONFIG_HID_DELL=y
CONFIG_HID_EZKEY=y
CONFIG_HID_GYRATION=y
CONFIG_HID_LOGITECH=y
# CONFIG_LOGITECH_FF is not set
# CONFIG_LOGIRUMBLEPAD2_FF is not set
CONFIG_HID_MICROSOFT=y
CONFIG_HID_MONTEREY=y
CONFIG_HID_PANTHERLORD=y
# CONFIG_PANTHERLORD_FF is not set
CONFIG_HID_PETALYNX=y
CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
CONFIG_HID_SUNPLUS=y
# CONFIG_THRUSTMASTER_FF is not set
# CONFIG_ZEROPLUS_FF is not set
CONFIG_USB_SUPPORT=y
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
CONFIG_USB_ARCH_HAS_EHCI=y
CONFIG_USB=y
# CONFIG_USB_DEBUG is not set
# CONFIG_USB_ANNOUNCE_NEW_DEVICES is not set

#
# Miscellaneous USB options
#
CONFIG_USB_DEVICEFS=y
# CONFIG_USB_DEVICE_CLASS is not set
# CONFIG_USB_DYNAMIC_MINORS is not set
CONFIG_USB_SUSPEND=y
# CONFIG_USB_OTG is not set
# CONFIG_USB_MON is not set

#
# USB Host Controller Drivers
#
# CONFIG_USB_C67X00_HCD is not set
CONFIG_USB_EHCI_HCD=m
CONFIG_USB_EHCI_ROOT_HUB_TT=y
CONFIG_USB_EHCI_TT_NEWSCHED=y
# CONFIG_USB_ISP116X_HCD is not set
# CONFIG_USB_ISP1760_HCD is not set
CONFIG_USB_OHCI_HCD=m
# CONFIG_USB_OHCI_HCD_SSB is not set
# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set
# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set
CONFIG_USB_OHCI_LITTLE_ENDIAN=y
CONFIG_USB_UHCI_HCD=m
# CONFIG_USB_U132_HCD is not set
# CONFIG_USB_SL811_HCD is not set
# CONFIG_USB_R8A66597_HCD is not set

#
# USB Device Class drivers
#
# CONFIG_USB_ACM is not set
# CONFIG_USB_PRINTER is not set
# CONFIG_USB_WDM is not set
# CONFIG_USB_TMC is not set

#
# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
#

#
# may also be needed; see USB_STORAGE Help for more information
#
CONFIG_USB_STORAGE=m
# CONFIG_USB_STORAGE_DEBUG is not set
CONFIG_USB_STORAGE_DATAFAB=y
CONFIG_USB_STORAGE_FREECOM=y
# CONFIG_USB_STORAGE_ISD200 is not set
CONFIG_USB_STORAGE_DPCM=y
CONFIG_USB_STORAGE_USBAT=y
# CONFIG_USB_STORAGE_SDDR09 is not set
# CONFIG_USB_STORAGE_SDDR55 is not set
# CONFIG_USB_STORAGE_JUMPSHOT is not set
# CONFIG_USB_STORAGE_ALAUDA is not set
# CONFIG_USB_STORAGE_ONETOUCH is not set
# CONFIG_USB_STORAGE_KARMA is not set
# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
# CONFIG_USB_LIBUSUAL is not set

#
# USB Imaging devices
#
# CONFIG_USB_MDC800 is not set
# CONFIG_USB_MICROTEK is not set

#
# USB port drivers
#
CONFIG_USB_USS720=m
CONFIG_USB_SERIAL=m
CONFIG_USB_EZUSB=y
CONFIG_USB_SERIAL_GENERIC=y
# CONFIG_USB_SERIAL_AIRCABLE is not set
# CONFIG_USB_SERIAL_ARK3116 is not set
# CONFIG_USB_SERIAL_BELKIN is not set
# CONFIG_USB_SERIAL_CH341 is not set
# CONFIG_USB_SERIAL_WHITEHEAT is not set
# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set
CONFIG_USB_SERIAL_CP2101=m
# CONFIG_USB_SERIAL_CYPRESS_M8 is not set
CONFIG_USB_SERIAL_EMPEG=m
# CONFIG_USB_SERIAL_FTDI_SIO is not set
# CONFIG_USB_SERIAL_FUNSOFT is not set
# CONFIG_USB_SERIAL_VISOR is not set
# CONFIG_USB_SERIAL_IPAQ is not set
# CONFIG_USB_SERIAL_IR is not set
# CONFIG_USB_SERIAL_EDGEPORT is not set
# CONFIG_USB_SERIAL_EDGEPORT_TI is not set
# CONFIG_USB_SERIAL_GARMIN is not set
# CONFIG_USB_SERIAL_IPW is not set
# CONFIG_USB_SERIAL_IUU is not set
# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set
CONFIG_USB_SERIAL_KEYSPAN=m
# CONFIG_USB_SERIAL_KEYSPAN_MPR is not set
# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set
# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set
# CONFIG_USB_SERIAL_KEYSPAN_USA28XA is not set
# CONFIG_USB_SERIAL_KEYSPAN_USA28XB is not set
# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set
# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set
# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set
CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
# CONFIG_USB_SERIAL_KLSI is not set
# CONFIG_USB_SERIAL_KOBIL_SCT is not set
# CONFIG_USB_SERIAL_MCT_U232 is not set
# CONFIG_USB_SERIAL_MOS7720 is not set
# CONFIG_USB_SERIAL_MOS7840 is not set
# CONFIG_USB_SERIAL_MOTOROLA is not set
# CONFIG_USB_SERIAL_NAVMAN is not set
# CONFIG_USB_SERIAL_PL2303 is not set
# CONFIG_USB_SERIAL_OTI6858 is not set
# CONFIG_USB_SERIAL_SPCP8X5 is not set
# CONFIG_USB_SERIAL_HP4X is not set
# CONFIG_USB_SERIAL_SAFE is not set
# CONFIG_USB_SERIAL_SIERRAWIRELESS is not set
# CONFIG_USB_SERIAL_TI is not set
# CONFIG_USB_SERIAL_CYBERJACK is not set
# CONFIG_USB_SERIAL_XIRCOM is not set
# CONFIG_USB_SERIAL_OPTION is not set
# CONFIG_USB_SERIAL_OMNINET is not set
# CONFIG_USB_SERIAL_DEBUG is not set

#
# USB Miscellaneous drivers
#
# CONFIG_USB_EMI62 is not set
# CONFIG_USB_EMI26 is not set
# CONFIG_USB_ADUTUX is not set
# CONFIG_USB_SEVSEG is not set
# CONFIG_USB_RIO500 is not set
# CONFIG_USB_LEGOTOWER is not set
# CONFIG_USB_LCD is not set
# CONFIG_USB_BERRY_CHARGE is not set
# CONFIG_USB_LED is not set
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
# CONFIG_USB_PHIDGET is not set
# CONFIG_USB_IDMOUSE is not set
CONFIG_USB_FTDI_ELAN=m
# CONFIG_USB_APPLEDISPLAY is not set
# CONFIG_USB_SISUSBVGA is not set
# CONFIG_USB_LD is not set
# CONFIG_USB_TRANCEVIBRATOR is not set
# CONFIG_USB_IOWARRIOR is not set
# CONFIG_USB_TEST is not set
# CONFIG_USB_ISIGHTFW is not set
# CONFIG_USB_VST is not set
# CONFIG_USB_ATM is not set
# CONFIG_USB_GADGET is not set
# CONFIG_MMC is not set
# CONFIG_MEMSTICK is not set
CONFIG_NEW_LEDS=y
CONFIG_LEDS_CLASS=y

#
# LED drivers
#
# CONFIG_LEDS_PCA9532 is not set
# CONFIG_LEDS_CLEVO_MAIL is not set
# CONFIG_LEDS_PCA955X is not set

#
# LED Triggers
#
CONFIG_LEDS_TRIGGERS=y
CONFIG_LEDS_TRIGGER_TIMER=m
# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
# CONFIG_ACCESSIBILITY is not set
CONFIG_INFINIBAND=m
CONFIG_INFINIBAND_USER_MAD=m
# CONFIG_INFINIBAND_USER_ACCESS is not set
CONFIG_INFINIBAND_ADDR_TRANS=y
# CONFIG_INFINIBAND_MTHCA is not set
# CONFIG_INFINIBAND_AMSO1100 is not set
# CONFIG_MLX4_INFINIBAND is not set
# CONFIG_INFINIBAND_NES is not set
# CONFIG_INFINIBAND_IPOIB is not set
CONFIG_INFINIBAND_SRP=m
# CONFIG_INFINIBAND_ISER is not set
# CONFIG_EDAC is not set
# CONFIG_RTC_CLASS is not set
# CONFIG_DMADEVICES is not set
# CONFIG_AUXDISPLAY is not set
CONFIG_UIO=m
# CONFIG_UIO_CIF is not set
# CONFIG_UIO_PDRV is not set
# CONFIG_UIO_PDRV_GENIRQ is not set
# CONFIG_UIO_SMX is not set
# CONFIG_UIO_SERCOS3 is not set
# CONFIG_STAGING is not set

#
# Firmware Drivers
#
CONFIG_EDD=m
# CONFIG_EDD_OFF is not set
CONFIG_FIRMWARE_MEMMAP=y
CONFIG_EFI_VARS=y
# CONFIG_DELL_RBU is not set
# CONFIG_DCDBAS is not set
CONFIG_DMIID=y
# CONFIG_ISCSI_IBFT_FIND is not set

#
# File systems
#
CONFIG_EXT2_FS=m
# CONFIG_EXT2_FS_XATTR is not set
CONFIG_EXT2_FS_XIP=y
CONFIG_EXT3_FS=m
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
# CONFIG_EXT4_FS is not set
CONFIG_FS_XIP=y
CONFIG_JBD=m
# CONFIG_JBD_DEBUG is not set
CONFIG_FS_MBCACHE=m
# CONFIG_REISERFS_FS is not set
# CONFIG_JFS_FS is not set
CONFIG_FS_POSIX_ACL=y
CONFIG_FILE_LOCKING=y
# CONFIG_XFS_FS is not set
# CONFIG_GFS2_FS is not set
# CONFIG_OCFS2_FS is not set
CONFIG_DNOTIFY=y
CONFIG_INOTIFY=y
CONFIG_INOTIFY_USER=y
# CONFIG_QUOTA is not set
# CONFIG_AUTOFS_FS is not set
CONFIG_AUTOFS4_FS=m
CONFIG_FUSE_FS=m
CONFIG_GENERIC_ACL=y

#
# CD-ROM/DVD Filesystems
#
CONFIG_ISO9660_FS=y
CONFIG_JOLIET=y
CONFIG_ZISOFS=y
CONFIG_UDF_FS=y
CONFIG_UDF_NLS=y

#
# DOS/FAT/NT Filesystems
#
CONFIG_FAT_FS=m
CONFIG_MSDOS_FS=m
CONFIG_VFAT_FS=m
CONFIG_FAT_DEFAULT_CODEPAGE=437
CONFIG_FAT_DEFAULT_IOCHARSET="ascii"
# CONFIG_NTFS_FS is not set

#
# Pseudo filesystems
#
CONFIG_PROC_FS=y
CONFIG_PROC_KCORE=y
CONFIG_PROC_VMCORE=y
CONFIG_PROC_SYSCTL=y
CONFIG_PROC_PAGE_MONITOR=y
CONFIG_SYSFS=y
CONFIG_TMPFS=y
CONFIG_TMPFS_POSIX_ACL=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_CONFIGFS_FS=m

#
# Miscellaneous filesystems
#
# CONFIG_ADFS_FS is not set
# CONFIG_AFFS_FS is not set
# CONFIG_ECRYPT_FS is not set
# CONFIG_HFS_FS is not set
# CONFIG_HFSPLUS_FS is not set
# CONFIG_BEFS_FS is not set
# CONFIG_BFS_FS is not set
# CONFIG_EFS_FS is not set
# CONFIG_JFFS2_FS is not set
CONFIG_CRAMFS=m
# CONFIG_VXFS_FS is not set
# CONFIG_MINIX_FS is not set
# CONFIG_OMFS_FS is not set
# CONFIG_HPFS_FS is not set
# CONFIG_QNX4FS_FS is not set
CONFIG_ROMFS_FS=m
# CONFIG_SYSV_FS is not set
CONFIG_UFS_FS=m
# CONFIG_UFS_FS_WRITE is not set
# CONFIG_UFS_DEBUG is not set
CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=m
CONFIG_NFS_V3=y
CONFIG_NFS_V3_ACL=y
CONFIG_NFS_V4=y
# CONFIG_NFSD is not set
CONFIG_LOCKD=m
CONFIG_LOCKD_V4=y
CONFIG_NFS_ACL_SUPPORT=m
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=m
CONFIG_SUNRPC_GSS=m
CONFIG_SUNRPC_XPRT_RDMA=m
# CONFIG_SUNRPC_REGISTER_V4 is not set
CONFIG_RPCSEC_GSS_KRB5=m
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
# CONFIG_CIFS is not set
# CONFIG_NCP_FS is not set
# CONFIG_CODA_FS is not set
# CONFIG_AFS_FS is not set

#
# Partition Types
#
CONFIG_PARTITION_ADVANCED=y
# CONFIG_ACORN_PARTITION is not set
# CONFIG_OSF_PARTITION is not set
# CONFIG_AMIGA_PARTITION is not set
# CONFIG_ATARI_PARTITION is not set
# CONFIG_MAC_PARTITION is not set
CONFIG_MSDOS_PARTITION=y
CONFIG_BSD_DISKLABEL=y
# CONFIG_MINIX_SUBPARTITION is not set
# CONFIG_SOLARIS_X86_PARTITION is not set
# CONFIG_UNIXWARE_DISKLABEL is not set
# CONFIG_LDM_PARTITION is not set
# CONFIG_SGI_PARTITION is not set
# CONFIG_ULTRIX_PARTITION is not set
# CONFIG_SUN_PARTITION is not set
# CONFIG_KARMA_PARTITION is not set
CONFIG_EFI_PARTITION=y
# CONFIG_SYSV68_PARTITION is not set
CONFIG_NLS=y
CONFIG_NLS_DEFAULT="utf8"
CONFIG_NLS_CODEPAGE_437=y
# CONFIG_NLS_CODEPAGE_737 is not set
# CONFIG_NLS_CODEPAGE_775 is not set
CONFIG_NLS_CODEPAGE_850=m
CONFIG_NLS_CODEPAGE_852=m
# CONFIG_NLS_CODEPAGE_855 is not set
# CONFIG_NLS_CODEPAGE_857 is not set
# CONFIG_NLS_CODEPAGE_860 is not set
# CONFIG_NLS_CODEPAGE_861 is not set
# CONFIG_NLS_CODEPAGE_862 is not set
CONFIG_NLS_CODEPAGE_863=m
# CONFIG_NLS_CODEPAGE_864 is not set
# CONFIG_NLS_CODEPAGE_865 is not set
# CONFIG_NLS_CODEPAGE_866 is not set
# CONFIG_NLS_CODEPAGE_869 is not set
CONFIG_NLS_CODEPAGE_936=m
CONFIG_NLS_CODEPAGE_950=m
CONFIG_NLS_CODEPAGE_932=m
# CONFIG_NLS_CODEPAGE_949 is not set
# CONFIG_NLS_CODEPAGE_874 is not set
CONFIG_NLS_ISO8859_8=m
CONFIG_NLS_CODEPAGE_1250=m
CONFIG_NLS_CODEPAGE_1251=m
CONFIG_NLS_ASCII=y
# CONFIG_NLS_ISO8859_1 is not set
# CONFIG_NLS_ISO8859_2 is not set
# CONFIG_NLS_ISO8859_3 is not set
# CONFIG_NLS_ISO8859_4 is not set
# CONFIG_NLS_ISO8859_5 is not set
# CONFIG_NLS_ISO8859_6 is not set
# CONFIG_NLS_ISO8859_7 is not set
# CONFIG_NLS_ISO8859_9 is not set
# CONFIG_NLS_ISO8859_13 is not set
# CONFIG_NLS_ISO8859_14 is not set
# CONFIG_NLS_ISO8859_15 is not set
# CONFIG_NLS_KOI8_R is not set
# CONFIG_NLS_KOI8_U is not set
CONFIG_NLS_UTF8=m
# CONFIG_DLM is not set

#
# Kernel hacking
#
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
# CONFIG_PRINTK_TIME is not set
# CONFIG_ENABLE_WARN_DEPRECATED is not set
# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
CONFIG_DEBUG_FS=y
CONFIG_HEADERS_CHECK=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_SHIRQ=y
CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
CONFIG_SCHED_DEBUG=y
CONFIG_SCHEDSTATS=y
CONFIG_TIMER_STATS=y
# CONFIG_DEBUG_OBJECTS is not set
# CONFIG_SLUB_DEBUG_ON is not set
# CONFIG_SLUB_STATS is not set
# CONFIG_DEBUG_RT_MUTEXES is not set
# CONFIG_RT_MUTEX_TESTER is not set
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCKDEP=y
# CONFIG_LOCK_STAT is not set
CONFIG_DEBUG_LOCKDEP=y
CONFIG_TRACE_IRQFLAGS=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
CONFIG_STACKTRACE=y
# CONFIG_DEBUG_KOBJECT is not set
CONFIG_DEBUG_HIGHMEM=y
CONFIG_DEBUG_BUGVERBOSE=y
CONFIG_DEBUG_INFO=y
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_VIRTUAL is not set
# CONFIG_DEBUG_WRITECOUNT is not set
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_DEBUG_LIST=y
# CONFIG_DEBUG_SG is not set
CONFIG_FRAME_POINTER=y
# CONFIG_BOOT_PRINTK_DELAY is not set
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_RCU_CPU_STALL_DETECTOR is not set
# CONFIG_KPROBES_SANITY_TEST is not set
# CONFIG_BACKTRACE_SELF_TEST is not set
# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_HAVE_FTRACE=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
# CONFIG_FTRACE is not set
# CONFIG_IRQSOFF_TRACER is not set
# CONFIG_SYSPROF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_CONTEXT_SWITCH_TRACER is not set
# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set
# CONFIG_FIREWIRE_OHCI_REMOTE_DMA is not set
# CONFIG_BUILD_DOCSRC is not set
# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
CONFIG_SAMPLES=y
CONFIG_SAMPLE_KOBJECT=m
CONFIG_SAMPLE_KPROBES=m
CONFIG_SAMPLE_KRETPROBES=m
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
# CONFIG_STRICT_DEVMEM is not set
CONFIG_X86_VERBOSE_BOOTUP=y
CONFIG_EARLY_PRINTK=y
# CONFIG_EARLY_PRINTK_DBGP is not set
# CONFIG_DEBUG_STACKOVERFLOW is not set
# CONFIG_DEBUG_STACK_USAGE is not set
# CONFIG_DEBUG_PAGEALLOC is not set
# CONFIG_DEBUG_PER_CPU_MAPS is not set
# CONFIG_X86_PTDUMP is not set
CONFIG_DEBUG_RODATA=y
# CONFIG_DEBUG_RODATA_TEST is not set
# CONFIG_DEBUG_NX_TEST is not set
CONFIG_4KSTACKS=y
CONFIG_DOUBLEFAULT=y
# CONFIG_MMIOTRACE is not set
CONFIG_IO_DELAY_TYPE_0X80=0
CONFIG_IO_DELAY_TYPE_0XED=1
CONFIG_IO_DELAY_TYPE_UDELAY=2
CONFIG_IO_DELAY_TYPE_NONE=3
CONFIG_IO_DELAY_0X80=y
# CONFIG_IO_DELAY_0XED is not set
# CONFIG_IO_DELAY_UDELAY is not set
# CONFIG_IO_DELAY_NONE is not set
CONFIG_DEFAULT_IO_DELAY_TYPE=0
# CONFIG_DEBUG_BOOT_PARAMS is not set
# CONFIG_CPA_DEBUG is not set
# CONFIG_OPTIMIZE_INLINING is not set

#
# Security options
#
CONFIG_KEYS=y
CONFIG_KEYS_DEBUG_PROC_KEYS=y
# CONFIG_SECURITY is not set
# CONFIG_SECURITYFS is not set
# CONFIG_SECURITY_FILE_CAPABILITIES is not set
CONFIG_XOR_BLOCKS=m
CONFIG_ASYNC_CORE=m
CONFIG_ASYNC_MEMCPY=m
CONFIG_ASYNC_XOR=m
CONFIG_CRYPTO=y

#
# Crypto core or helper
#
# CONFIG_CRYPTO_FIPS is not set
CONFIG_CRYPTO_ALGAPI=y
CONFIG_CRYPTO_AEAD=y
CONFIG_CRYPTO_BLKCIPHER=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_RNG=y
CONFIG_CRYPTO_MANAGER=y
# CONFIG_CRYPTO_GF128MUL is not set
CONFIG_CRYPTO_NULL=m
# CONFIG_CRYPTO_CRYPTD is not set
# CONFIG_CRYPTO_AUTHENC is not set
# CONFIG_CRYPTO_TEST is not set

#
# Authenticated Encryption with Associated Data
#
# CONFIG_CRYPTO_CCM is not set
# CONFIG_CRYPTO_GCM is not set
# CONFIG_CRYPTO_SEQIV is not set

#
# Block modes
#
CONFIG_CRYPTO_CBC=m
# CONFIG_CRYPTO_CTR is not set
# CONFIG_CRYPTO_CTS is not set
# CONFIG_CRYPTO_ECB is not set
# CONFIG_CRYPTO_LRW is not set
# CONFIG_CRYPTO_PCBC is not set
# CONFIG_CRYPTO_XTS is not set

#
# Hash modes
#
CONFIG_CRYPTO_HMAC=y
# CONFIG_CRYPTO_XCBC is not set

#
# Digest
#
CONFIG_CRYPTO_CRC32C=y
# CONFIG_CRYPTO_CRC32C_INTEL is not set
CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MD5=y
# CONFIG_CRYPTO_MICHAEL_MIC is not set
# CONFIG_CRYPTO_RMD128 is not set
# CONFIG_CRYPTO_RMD160 is not set
# CONFIG_CRYPTO_RMD256 is not set
# CONFIG_CRYPTO_RMD320 is not set
CONFIG_CRYPTO_SHA1=y
CONFIG_CRYPTO_SHA256=m
CONFIG_CRYPTO_SHA512=m
# CONFIG_CRYPTO_TGR192 is not set
# CONFIG_CRYPTO_WP512 is not set

#
# Ciphers
#
CONFIG_CRYPTO_AES=m
CONFIG_CRYPTO_AES_586=m
# CONFIG_CRYPTO_ANUBIS is not set
# CONFIG_CRYPTO_ARC4 is not set
# CONFIG_CRYPTO_BLOWFISH is not set
# CONFIG_CRYPTO_CAMELLIA is not set
# CONFIG_CRYPTO_CAST5 is not set
# CONFIG_CRYPTO_CAST6 is not set
CONFIG_CRYPTO_DES=m
# CONFIG_CRYPTO_FCRYPT is not set
# CONFIG_CRYPTO_KHAZAD is not set
# CONFIG_CRYPTO_SALSA20 is not set
# CONFIG_CRYPTO_SALSA20_586 is not set
# CONFIG_CRYPTO_SEED is not set
# CONFIG_CRYPTO_SERPENT is not set
# CONFIG_CRYPTO_TEA is not set
# CONFIG_CRYPTO_TWOFISH is not set
# CONFIG_CRYPTO_TWOFISH_586 is not set

#
# Compression
#
# CONFIG_CRYPTO_DEFLATE is not set
# CONFIG_CRYPTO_LZO is not set

#
# Random Number Generation
#
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRYPTO_HW=y
CONFIG_CRYPTO_DEV_PADLOCK=m
CONFIG_CRYPTO_DEV_PADLOCK_AES=m
CONFIG_CRYPTO_DEV_PADLOCK_SHA=m
CONFIG_CRYPTO_DEV_GEODE=m
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
CONFIG_HAVE_KVM=y
# CONFIG_VIRTUALIZATION is not set

#
# Library routines
#
CONFIG_BITREVERSE=y
CONFIG_GENERIC_FIND_FIRST_BIT=y
CONFIG_GENERIC_FIND_NEXT_BIT=y
CONFIG_CRC_CCITT=m
CONFIG_CRC16=m
# CONFIG_CRC_T10DIF is not set
CONFIG_CRC_ITU_T=y
CONFIG_CRC32=y
# CONFIG_CRC7 is not set
CONFIG_LIBCRC32C=y
CONFIG_AUDIT_GENERIC=y
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=m
CONFIG_TEXTSEARCH=y
CONFIG_TEXTSEARCH_KMP=m
CONFIG_TEXTSEARCH_BM=m
CONFIG_TEXTSEARCH_FSM=m
CONFIG_PLIST=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [PATCH -mm 1/5] memcg: replace res_counter
  2008-10-21  3:03             ` KAMEZAWA Hiroyuki
@ 2008-10-21  6:30               ` Paul Menage
  0 siblings, 0 replies; 60+ messages in thread
From: Paul Menage @ 2008-10-21  6:30 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Daisuke Nishimura, linux-mm, balbir

On Mon, Oct 20, 2008 at 8:03 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> I think we can't do this without a performance hit. Considering a parent<->child counter,
> the parent is busier than the child if usage is propagated from child to parent. So,
> prefetch will be just a small help.

You're right, this argument isn't valid in the case of a parent-child counter.

> I don't like *unsigned long long* just because we have to do following
> =
>   res->usage < *some number*
> =
> or
> =
>   val = res->usage.
> =
> always under lock because usage is unsigned long long.

That's true. But isn't the first case going to be accompanied
generally by an increment, for which you'd need to do an atomic
operation anyway? And the second case is most likely a read from
userspace, which isn't on the fast path.

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  6:28               ` Li Zefan
@ 2008-10-21  6:38                 ` Daisuke Nishimura
  0 siblings, 0 replies; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-21  6:38 UTC (permalink / raw)
  To: Li Zefan; +Cc: nishimura, KAMEZAWA Hiroyuki, balbir, Paul Menage, linux-mm

On Tue, 21 Oct 2008 14:28:37 +0800, Li Zefan <lizf@cn.fujitsu.com> wrote:
> KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 14:20:27 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > 
> >>> BTW, "allocate all page_cgroup at boot" patch goes to Linus' git. Wow.
> >>>
> >> But seems this patch causes kernel panic at system boot ... (or maybe one of other
> >> memcg patches?)
> >>
> >> I wrote down the panic manually:
> >>
> >> BUG: unable to handle kernel NULL pointer dereference at 00000000
> >> IP: page_cgroup_zoneinfo + 0xa
> >>
> >> Call Trace:
> >> ? mem_cgroup_charge_common + 0x17d
> >> ? mem_cgroup_charge
> >> ? add_to_page_cache_locked
> >> ? add_to_page_cache_lru
> >> ? find_or_create_page
> >> ? __getblk
> >> ? ext3_get_inode_loc
> >> ? ext3_iget
> >> ? ext3_lookup
> >>
> >> Tell me if you need extra information.
> >>
> > This shows how small testers in -mm ...this is on x86 ?
> 
> Yes, x86_32
> 
> > Could you show me your config ? 
> 
> attached
> 
Hmm... I tested mmotm-2008-10-16-18-58 + memcg update v7 on x86_32 yesterday,
but it worked fine.

I'll try with your config.


Thanks,
Daisuke Nishimura.

> > and what happens if cgroup_disable=memory ?
> > 
> 
> then booted up successfully
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  6:20           ` [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000 Li Zefan
  2008-10-21  6:25             ` KAMEZAWA Hiroyuki
@ 2008-10-21  6:54             ` KAMEZAWA Hiroyuki
  2008-10-21  7:04               ` Li Zefan
  1 sibling, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  6:54 UTC (permalink / raw)
  To: Li Zefan; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm

On Tue, 21 Oct 2008 14:20:27 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> > BTW, "allocate all page_cgroup at boot" patch goes to Linus' git. Wow.
> > 
> 
> But seems this patch causes kernel panic at system boot ... (or maybe one of other
> memcg patches?)
> 
> I wrote down the panic manually:
> 
> BUG: unable to handle kernel NULL pointer dereference at 00000000
> IP: page_cgroup_zoneinfo + 0xa
> 
> Call Trace:
> ? mem_cgroup_charge_common + 0x17d
> ? mem_cgroup_charge
> ? add_to_page_cache_locked
> ? add_to_page_cache_lru
> ? find_or_create_page
> ? __getblk
> ? ext3_get_inode_loc
> ? ext3_iget
> ? ext3_lookup
> 
> Tell me if you need extra information.
> 
Hmm, it's curious, but page_cgroup->page seems to be NULL...
Could you show the disassembled code of page_cgroup_zoneinfo()?

BTW, does this always happen?

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  6:54             ` KAMEZAWA Hiroyuki
@ 2008-10-21  7:04               ` Li Zefan
  2008-10-21  7:16                 ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 60+ messages in thread
From: Li Zefan @ 2008-10-21  7:04 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 14:20:27 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
>>> BTW, "allocate all page_cgroup at boot" patch goes to Linus' git. Wow.
>>>
>> But seems this patch causes kernel panic at system boot ... (or maybe one of other
>> memcg patches?)
>>
>> I wrote down the panic manually:
>>
>> BUG: unable to handle kernel NULL pointer dereference at 00000000
>> IP: page_cgroup_zoneinfo + 0xa
>>
>> Call Trace:
>> ? mem_cgroup_charge_common + 0x17d
>> ? mem_cgroup_charge
>> ? add_to_page_cache_locked
>> ? add_to_page_cache_lru
>> ? find_or_create_page
>> ? __getblk
>> ? ext3_get_inode_loc
>> ? ext3_iget
>> ? ext3_lookup
>>
>> Tell me if you need extra information.
>>
> Hmm, it's curious but page_cgroup->page seems to be NULL...
> Could you show disassemble code of page_cgroup_zoneinfo() ?
> 

You mean this ?

000881c8 <page_cgroup_zoneinfo>:
   881c8:       55                      push   %ebp
   881c9:       8b 50 04                mov    0x4(%eax),%edx
   881cc:       8b 40 08                mov    0x8(%eax),%eax
   881cf:       89 e5                   mov    %esp,%ebp
   881d1:       5d                      pop    %ebp
   881d2:       8b 00                   mov    (%eax),%eax
   881d4:       c1 e8 1e                shr    $0x1e,%eax
   881d7:       6b c0 58                imul   $0x58,%eax,%eax
   881da:       03 42 48                add    0x48(%edx),%eax
   881dd:       c3                      ret

> BTW, this happens always ?
> 

Sometimes it just freezes, and sometimes it panics. It never boots successfully.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  7:04               ` Li Zefan
@ 2008-10-21  7:16                 ` KAMEZAWA Hiroyuki
  2008-10-21  7:21                   ` Li Zefan
  0 siblings, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  7:16 UTC (permalink / raw)
  To: Li Zefan; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm

On Tue, 21 Oct 2008 15:04:15 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> You mean this ?
> 
> 000881c8 <page_cgroup_zoneinfo>:
>    881c8:       55                      push   %ebp
>    881c9:       8b 50 04                mov    0x4(%eax),%edx
>    881cc:       8b 40 08                mov    0x8(%eax),%eax
>    881cf:       89 e5                   mov    %esp,%ebp
>    881d1:       5d                      pop    %ebp
>    881d2:       8b 00                   mov    (%eax),%eax
>    881d4:       c1 e8 1e                shr    $0x1e,%eax
>    881d7:       6b c0 58                imul   $0x58,%eax,%eax
>    881da:       03 42 48                add    0x48(%edx),%eax
>    881dd:       c3                      ret
> 
Yes, thank you. This is helpful. From this, the page_cgroup->page pointer is NULL,
and page_zid() or something similar trips over it when it dereferences the page.

So it seems the problem is either in page_cgroup.c::page_cgroup_init(), or
page_cgroup->page is being cleared somewhere.. Hmm..

Could you show /var/log/dmesg?
It may include lines like the following:

= (this is x86-64)
sizeof(struct page) = 96
Zone PFN ranges:
  DMA      0x00000000 -> 0x00001000
  DMA32    0x00001000 -> 0x00100000
  Normal   0x00100000 -> 0x00a40000
Movable zone start PFN for each node
early_node_map[4] active PFN ranges
    0: 0x00000000 -> 0x0000009e
    0: 0x00000100 -> 0x000bfee0
    0: 0x000bff00 -> 0x000bff80
    0: 0x00100000 -> 0x00a40000
On node 0 totalpages: 10485502
  DMA zone: 96 pages used for memmap
  DMA zone: 102 pages reserved
  DMA zone: 3800 pages, LIFO batch:0
  DMA32 zone: 24480 pages used for memmap
  DMA32 zone: 757696 pages, LIFO batch:31
  Normal zone: 227328 pages used for memmap
  Normal zone: 9472000 pages, LIFO batch:31
  Movable zone: 0 pages used for memmap
.....

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  7:16                 ` KAMEZAWA Hiroyuki
@ 2008-10-21  7:21                   ` Li Zefan
  2008-10-21  8:18                     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 60+ messages in thread
From: Li Zefan @ 2008-10-21  7:21 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1101 bytes --]

> Yes. thank you. This is helpful. From this, page_cgroup->page pointer is NULL.
> And page_zid() or some kicks it..
> 
> Then, it seems problem is in page_cgroup.c::page_cgroup_init() or
> page_cgroup()->page is cleared..Hmm..
> 
> could you show /var/log/dmesg ?
> It may includes following kinds of line
> 
> = (this is x86-64)
> sizeof(struct page) = 96
> Zone PFN ranges:
>   DMA      0x00000000 -> 0x00001000
>   DMA32    0x00001000 -> 0x00100000
>   Normal   0x00100000 -> 0x00a40000
> Movable zone start PFN for each node
> early_node_map[4] active PFN ranges
>     0: 0x00000000 -> 0x0000009e
>     0: 0x00000100 -> 0x000bfee0
>     0: 0x000bff00 -> 0x000bff80
>     0: 0x00100000 -> 0x00a40000
> On node 0 totalpages: 10485502
>   DMA zone: 96 pages used for memmap
>   DMA zone: 102 pages reserved
>   DMA zone: 3800 pages, LIFO batch:0
>   DMA32 zone: 24480 pages used for memmap
>   DMA32 zone: 757696 pages, LIFO batch:31
>   Normal zone: 227328 pages used for memmap
>   Normal zone: 9472000 pages, LIFO batch:31
>   Movable zone: 0 pages used for memmap
> .....
> 

dmesg is attached.

[-- Attachment #2: dmesg.txt --]
[-- Type: text/plain, Size: 20340 bytes --]

BIOS EBDA/lowmem at: 0009f400/0009f400
Initializing cgroup subsys cpuset
Initializing cgroup subsys cpu
Linux version 2.6.27 (root@localhost.localdomain) (gcc version 4.1.2 20070925 (Red Hat 4.1.2-33)) #296 SMP Tue Oct 21 15:07:29 CST 2008
KERNEL supported cpus:
  Intel GenuineIntel
  AMD AuthenticAMD
  NSC Geode by NSC
  Cyrix CyrixInstead
  Centaur CentaurHauls
  Transmeta GenuineTMx86
  Transmeta TransmetaCPU
  UMC UMC UMC UMC
BIOS-provided physical RAM map:
 BIOS-e820: 0000000000000000 - 000000000009f400 (usable)
 BIOS-e820: 000000000009f400 - 00000000000a0000 (reserved)
 BIOS-e820: 00000000000f0000 - 0000000000100000 (reserved)
 BIOS-e820: 0000000000100000 - 000000003bff0000 (usable)
 BIOS-e820: 000000003bff0000 - 000000003bff3000 (ACPI NVS)
 BIOS-e820: 000000003bff3000 - 000000003c000000 (ACPI data)
 BIOS-e820: 00000000fec00000 - 0000000100000000 (reserved)
DMI 2.3 present.
Phoenix BIOS detected: BIOS may corrupt low RAM, working it around.
last_pfn = 0x3bff0 max_arch_pfn = 0x100000
kernel direct mapping tables up to 373fe000 @ 10000-16000
RAMDISK: 37d11000 - 37fef3e6
Allocated new RAMDISK: 00100000 - 003de3e6
Move RAMDISK from 0000000037d11000 - 0000000037fef3e5 to 00100000 - 003de3e5
ACPI: RSDP 000F7560, 0014 (r0 AWARD )
ACPI: RSDT 3BFF3040, 002C (r1 AWARD  AWRDACPI 42302E31 AWRD        0)
ACPI: FACP 3BFF30C0, 0074 (r1 AWARD  AWRDACPI 42302E31 AWRD        0)
ACPI: DSDT 3BFF3180, 3ABC (r1 AWARD  AWRDACPI     1000 MSFT  100000E)
ACPI: FACS 3BFF0000, 0040
ACPI: APIC 3BFF6C80, 0084 (r1 AWARD  AWRDACPI 42302E31 AWRD        0)
ACPI: DMI detected: Acer
ACPI: Local APIC address 0xfee00000
75MB HIGHMEM available.
883MB LOWMEM available.
  mapped low ram: 0 - 373fe000
  low ram: 00000000 - 373fe000
  bootmap 00012000 - 00018e80
(9 early reservations) ==> bootmem [0000000000 - 00373fe000]
  #0 [0000000000 - 0000001000]   BIOS data page ==> [0000000000 - 0000001000]
  #1 [0000001000 - 0000002000]    EX TRAMPOLINE ==> [0000001000 - 0000002000]
  #2 [0000006000 - 0000007000]       TRAMPOLINE ==> [0000006000 - 0000007000]
  #3 [0000400000 - 0000bce334]    TEXT DATA BSS ==> [0000400000 - 0000bce334]
  #4 [0000bcf000 - 0000bd3000]    INIT_PG_TABLE ==> [0000bcf000 - 0000bd3000]
  #5 [000009f400 - 0000100000]    BIOS reserved ==> [000009f400 - 0000100000]
  #6 [0000010000 - 0000012000]          PGTABLE ==> [0000010000 - 0000012000]
  #7 [0000100000 - 00003de3e6]      NEW RAMDISK ==> [0000100000 - 00003de3e6]
  #8 [0000012000 - 0000019000]          BOOTMAP ==> [0000012000 - 0000019000]
found SMP MP-table at [c00f5ad0] 000f5ad0
Zone PFN ranges:
  DMA      0x00000010 -> 0x00001000
  Normal   0x00001000 -> 0x000373fe
  HighMem  0x000373fe -> 0x0003bff0
Movable zone start PFN for each node
early_node_map[2] active PFN ranges
    0: 0x00000010 -> 0x0000009f
    0: 0x00000100 -> 0x0003bff0
On node 0 totalpages: 245631
free_area_init_node: node 0, pgdat c0731a00, node_mem_map c1000340
  DMA zone: 52 pages used for memmap
  DMA zone: 0 pages reserved
  DMA zone: 3931 pages, LIFO batch:0
  Normal zone: 2821 pages used for memmap
  Normal zone: 219385 pages, LIFO batch:31
  HighMem zone: 247 pages used for memmap
  HighMem zone: 19195 pages, LIFO batch:3
  Movable zone: 0 pages used for memmap
Using APIC driver default
ACPI: PM-Timer IO Port: 0x1008
ACPI: Local APIC address 0xfee00000
ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled)
ACPI: LAPIC (acpi_id[0x02] lapic_id[0x02] disabled)
ACPI: LAPIC (acpi_id[0x03] lapic_id[0x03] disabled)
ACPI: LAPIC_NMI (acpi_id[0x00] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x01] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x02] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x03] high edge lint[0x1])
ACPI: IOAPIC (id[0x04] address[0xfec00000] gsi_base[0])
IOAPIC[0]: apic_id 4, version 17, address 0xfec00000, GSI 0-23
ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 dfl dfl)
ACPI: IRQ0 used by override.
ACPI: IRQ2 used by override.
ACPI: IRQ9 used by override.
Enabling APIC mode:  Flat.  Using 1 I/O APICs
Using ACPI (MADT) for SMP configuration information
SMP: Allowing 4 CPUs, 2 hotplug CPUs
mapped APIC to ffffb000 (fee00000)
mapped IOAPIC to ffffa000 (fec00000)
PM: Registered nosave memory: 000000000009f000 - 00000000000a0000
PM: Registered nosave memory: 00000000000a0000 - 00000000000f0000
PM: Registered nosave memory: 00000000000f0000 - 0000000000100000
Allocating PCI resources starting at 40000000 (gap: 3c000000:c2c00000)
PERCPU: Allocating 32796 bytes of per cpu data
NR_CPUS: 32, nr_cpu_ids: 4, nr_node_ids 1
Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 242511
Kernel command line: ro root=LABEL=/ rhgb quiet cgroup_disable=memory
Disabling memory control group subsystem
Enabling fast FPU save and restore... done.
Enabling unmasked SIMD FPU exception support... done.
Initializing CPU#0
CPU 0 irqstacks, hard=c07c2000 soft=c07a2000
PID hash table entries: 4096 (order: 12, 16384 bytes)
Fast TSC calibration using PIT
Detected 2800.135 MHz processor.
Console: colour VGA+ 80x25
console [tty0] enabled
Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
... MAX_LOCKDEP_SUBCLASSES:    8
... MAX_LOCK_DEPTH:          48
... MAX_LOCKDEP_KEYS:        8191
... CLASSHASH_SIZE:           4096
... MAX_LOCKDEP_ENTRIES:     8192
... MAX_LOCKDEP_CHAINS:      16384
... CHAINHASH_SIZE:          8192
 memory used by lock dependency info: 2335 kB
 per task-struct memory footprint: 1152 bytes
Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
Memory: 957788k/982976k available (2113k kernel code, 24500k reserved, 1262k data, 312k init, 77768k highmem)
virtual kernel memory layout:
    fixmap  : 0xffc58000 - 0xfffff000   (3740 kB)
    pkmap   : 0xff400000 - 0xff800000   (4096 kB)
    vmalloc : 0xf7bfe000 - 0xff3fe000   ( 120 MB)
    lowmem  : 0xc0000000 - 0xf73fe000   ( 883 MB)
      .init : 0xc0751000 - 0xc079f000   ( 312 kB)
      .data : 0xc0610798 - 0xc074c218   (1262 kB)
      .text : 0xc0400000 - 0xc0610798   (2113 kB)
Checking if this processor honours the WP bit even in supervisor mode...Ok.
SLUB: Genslabs=12, HWalign=128, Order=0-3, MinObjects=0, CPUs=4, Nodes=1
Calibrating delay loop (skipped), value calculated using timer frequency.. 5600.27 BogoMIPS (lpj=2800135)
Mount-cache hash table entries: 512
Initializing cgroup subsys debug
Initializing cgroup subsys ns
Initializing cgroup subsys cpuacct
Initializing cgroup subsys memory
allocated 4914560 bytes of page_cgroup
please try cgroup_disable=memory option if you don't want
Initializing cgroup subsys devices
Initializing cgroup subsys freezer
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 1024K
CPU: Physical Processor ID: 0
CPU: Processor Core ID: 0
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#0.
CPU0: Intel P4/Xeon Extended MCE MSRs (24) available
CPU0: Thermal monitoring enabled
using mwait in idle threads.
Checking 'hlt' instruction... OK.
ACPI: Core revision 20080609
ENABLING IO-APIC IRQs
..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1
CPU0: Intel(R) Pentium(R) D CPU 2.80GHz stepping 04
lockdep: fixing up alternatives.
CPU 1 irqstacks, hard=c07c3000 soft=c07a3000
Booting processor 1/1 ip 6000
Initializing CPU#1
Calibrating delay using timer specific routine.. 5599.27 BogoMIPS (lpj=2799635)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 1024K
CPU: Physical Processor ID: 0
CPU: Processor Core ID: 1
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#1.
CPU1: Intel P4/Xeon Extended MCE MSRs (24) available
CPU1: Thermal monitoring enabled
CPU1: Intel(R) Pentium(R) D CPU 2.80GHz stepping 04
checking TSC synchronization [CPU#0 -> CPU#1]: passed.
Brought up 2 CPUs
Total of 2 processors activated (11199.54 BogoMIPS).
CPU0 attaching sched-domain:
 domain 0: span 0-1 level CPU
  groups: 0 1
CPU1 attaching sched-domain:
 domain 0: span 0-1 level CPU
  groups: 1 0
net_namespace: 384 bytes
NET: Registered protocol family 16
No dock devices found.
ACPI: bus type pci registered
PCI: PCI BIOS revision 2.10 entry at 0xfbda0, last bus=1
PCI: Using configuration type 1 for base access
mtrr: your CPUs had inconsistent fixed MTRR settings
mtrr: probably your BIOS does not setup all CPUs.
mtrr: corrected configuration.
ACPI: EC: Look up EC in DSDT
ACPI: Interpreter enabled
ACPI: (supports S0 S3 S4 S5)
ACPI: Using IOAPIC for interrupt routing
ACPI: PCI Root Bridge [PCI0] (0000:00)
PCI: 0000:00:00.0 reg 10 32bit mmio: [0xd0000000-0xd7ffffff]
PCI: 0000:00:02.5 reg 10 io port: [0x1f0-0x1f7]
PCI: 0000:00:02.5 reg 14 io port: [0x3f4-0x3f7]
PCI: 0000:00:02.5 reg 18 io port: [0x170-0x177]
PCI: 0000:00:02.5 reg 1c io port: [0x374-0x377]
PCI: 0000:00:02.5 reg 20 io port: [0x4000-0x400f]
pci 0000:00:02.5: PME# supported from D3cold
pci 0000:00:02.5: PME# disabled
PCI: 0000:00:02.7 reg 10 io port: [0xd000-0xd0ff]
PCI: 0000:00:02.7 reg 14 io port: [0xd400-0xd47f]
pci 0000:00:02.7: supports D1
pci 0000:00:02.7: supports D2
pci 0000:00:02.7: PME# supported from D3hot D3cold
pci 0000:00:02.7: PME# disabled
PCI: 0000:00:03.0 reg 10 32bit mmio: [0xe1104000-0xe1104fff]
PCI: 0000:00:03.1 reg 10 32bit mmio: [0xe1100000-0xe1100fff]
PCI: 0000:00:03.2 reg 10 32bit mmio: [0xe1101000-0xe1101fff]
PCI: 0000:00:03.3 reg 10 32bit mmio: [0xe1102000-0xe1102fff]
pci 0000:00:03.3: PME# supported from D0 D3hot D3cold
pci 0000:00:03.3: PME# disabled
PCI: 0000:00:05.0 reg 10 io port: [0xd800-0xd807]
PCI: 0000:00:05.0 reg 14 io port: [0xdc00-0xdc03]
PCI: 0000:00:05.0 reg 18 io port: [0xe000-0xe007]
PCI: 0000:00:05.0 reg 1c io port: [0xe400-0xe403]
PCI: 0000:00:05.0 reg 20 io port: [0xe800-0xe80f]
pci 0000:00:05.0: PME# supported from D3cold
pci 0000:00:05.0: PME# disabled
PCI: 0000:00:0e.0 reg 10 io port: [0xec00-0xecff]
PCI: 0000:00:0e.0 reg 14 32bit mmio: [0xe1103000-0xe11030ff]
PCI: 0000:00:0e.0 reg 30 32bit mmio: [0x000000-0x01ffff]
pci 0000:00:0e.0: supports D1
pci 0000:00:0e.0: supports D2
pci 0000:00:0e.0: PME# supported from D1 D2 D3hot D3cold
pci 0000:00:0e.0: PME# disabled
PCI: 0000:01:00.0 reg 10 32bit mmio: [0xd8000000-0xdfffffff]
PCI: 0000:01:00.0 reg 14 32bit mmio: [0xe1000000-0xe101ffff]
PCI: 0000:01:00.0 reg 18 io port: [0xc000-0xc07f]
pci 0000:01:00.0: supports D1
pci 0000:01:00.0: supports D2
PCI: bridge 0000:00:01.0 io port: [0xc000-0xcfff]
PCI: bridge 0000:00:01.0 32bit mmio: [0xe1000000-0xe10fffff]
PCI: bridge 0000:00:01.0 32bit mmio pref: [0xd8000000-0xdfffffff]
bus 00 -> node 0
ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
ACPI: PCI Interrupt Link [LNKA] (IRQs 3 4 5 6 7 9 10 11 14 15) *0, disabled.
ACPI: PCI Interrupt Link [LNKB] (IRQs 3 4 5 6 7 9 10 *11 14 15)
ACPI: PCI Interrupt Link [LNKC] (IRQs 3 4 5 6 7 9 *10 11 14 15)
ACPI: PCI Interrupt Link [LNKD] (IRQs 3 4 5 6 7 9 10 11 14 15) *0, disabled.
ACPI: PCI Interrupt Link [LNKE] (IRQs 3 4 5 6 7 9 10 *11 14 15)
ACPI: PCI Interrupt Link [LNKF] (IRQs 3 4 5 *6 7 9 10 11 14 15)
ACPI: PCI Interrupt Link [LNKG] (IRQs 3 4 5 6 7 *9 10 11 14 15)
ACPI: PCI Interrupt Link [LNKH] (IRQs 3 4 *5 6 7 9 10 11 14 15)
usbcore: registered new interface driver usbfs
usbcore: registered new interface driver hub
usbcore: registered new device driver usb
PCI: Using ACPI for IRQ routing
pnp: PnP ACPI init
ACPI: bus type pnp registered
pnp: PnP ACPI: found 12 devices
ACPI: ACPI bus type pnp unregistered
system 00:00: iomem range 0xc8000-0xcbfff has been reserved
system 00:00: iomem range 0xf0000-0xf7fff could not be reserved
system 00:00: iomem range 0xf8000-0xfbfff could not be reserved
system 00:00: iomem range 0xfc000-0xfffff could not be reserved
system 00:00: iomem range 0x3bff0000-0x3bffffff could not be reserved
system 00:00: iomem range 0xffff0000-0xffffffff could not be reserved
system 00:00: iomem range 0x0-0x9ffff could not be reserved
system 00:00: iomem range 0x100000-0x3bfeffff could not be reserved
system 00:00: iomem range 0xffee0000-0xffefffff could not be reserved
system 00:00: iomem range 0xfffe0000-0xfffeffff could not be reserved
system 00:00: iomem range 0xfec00000-0xfecfffff could not be reserved
system 00:00: iomem range 0xfee00000-0xfeefffff could not be reserved
system 00:02: ioport range 0x4d0-0x4d1 has been reserved
system 00:02: ioport range 0x800-0x805 has been reserved
system 00:02: ioport range 0x290-0x297 has been reserved
system 00:02: ioport range 0x880-0x88f has been reserved
pci 0000:00:01.0: PCI bridge, secondary bus 0000:01
pci 0000:00:01.0:   IO window: 0xc000-0xcfff
pci 0000:00:01.0:   MEM window: 0xe1000000-0xe10fffff
pci 0000:00:01.0:   PREFETCH window: 0x000000d8000000-0x000000dfffffff
bus: 00 index 0 io port: [0x00-0xffff]
bus: 00 index 1 mmio: [0x000000-0xffffffff]
bus: 01 index 0 io port: [0xc000-0xcfff]
bus: 01 index 1 mmio: [0xe1000000-0xe10fffff]
bus: 01 index 2 mmio: [0xd8000000-0xdfffffff]
bus: 01 index 3 mmio: [0x0-0x0]
NET: Registered protocol family 2
IP route cache hash table entries: 32768 (order: 5, 131072 bytes)
TCP established hash table entries: 131072 (order: 8, 1048576 bytes)
TCP bind hash table entries: 65536 (order: 9, 2097152 bytes)
TCP: Hash tables configured (established 131072 bind 65536)
TCP reno registered
NET: Registered protocol family 1
checking if image is initramfs... it is
Freeing initrd memory: 2936k freed
apm: BIOS version 1.2 Flags 0x07 (Driver version 1.16ac)
apm: disabled - APM is not SMP safe.
audit: initializing netlink socket (disabled)
type=2000 audit(1224601943.564:1): initialized
highmem bounce pool size: 64 pages
HugeTLB registered 4 MB page size, pre-allocated 0 pages
msgmni has been set to 1724
alg: No test for stdrng (krng)
Block layer SCSI generic (bsg) driver version 0.4 loaded (major 253)
io scheduler noop registered
io scheduler cfq registered (default)
pci 0000:01:00.0: Boot video device
pci_hotplug: PCI Hot Plug PCI Core version: 0.5
fan PNP0C0B:00: registered as cooling_device0
ACPI: Fan [FAN] (on)
processor ACPI0007:00: registered as cooling_device1
processor ACPI0007:01: registered as cooling_device2
thermal LNXTHERM:01: registered as thermal_zone0
ACPI: Thermal Zone [THRM] (56 C)
isapnp: Scanning for PnP cards...
Switched to high resolution mode on CPU 1
Switched to high resolution mode on CPU 0
isapnp: No Plug & Play device found
Real Time Clock Driver v1.12ac
Non-volatile memory driver v1.2
Linux agpgart interface v0.103
agpgart-sis 0000:00:00.0: SiS chipset [1039/0661]
agpgart-sis 0000:00:00.0: AGP aperture is 128M @ 0xd0000000
Serial: 8250/16550 driver4 ports, IRQ sharing enabled
serial8250: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
serial8250: ttyS1 at I/O 0x2f8 (irq = 3) is a 16550A
00:07: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
00:08: ttyS1 at I/O 0x2f8 (irq = 3) is a 16550A
brd: module loaded
PNP: PS/2 Controller [PNP0303:PS2K,PNP0f13:PS2M] at 0x60,0x64 irq 1,12
serio: i8042 KBD port at 0x60,0x64 irq 1
serio: i8042 AUX port at 0x60,0x64 irq 12
mice: PS/2 mouse device common for all mice
cpuidle: using governor ladder
cpuidle: using governor menu
usbcore: registered new interface driver hiddev
usbcore: registered new interface driver usbhid
usbhid: v2.6:USB HID core driver
TCP cubic registered
NET: Registered protocol family 17
Using IPI No-Shortcut mode
registered taskstats version 1
Freeing unused kernel memory: 312k freed
Write protecting the kernel text: 2116k
Write protecting the kernel read-only data: 996k
ehci_hcd: USB 2.0 'Enhanced' Host Controller (EHCI) Driver
ehci_hcd 0000:00:03.3: PCI INT D -> GSI 23 (level, low) -> IRQ 23
ehci_hcd 0000:00:03.3: EHCI Host Controller
ehci_hcd 0000:00:03.3: new USB bus registered, assigned bus number 1
ehci_hcd 0000:00:03.3: cache line size of 128 is not supported
ehci_hcd 0000:00:03.3: irq 23, io mem 0xe1102000
ehci_hcd 0000:00:03.3: USB 2.0 started, EHCI 1.00
usb usb1: configuration #1 chosen from 1 choice
hub 1-0:1.0: USB hub found
hub 1-0:1.0: 8 ports detected
ohci_hcd: USB 1.1 'Open' Host Controller (OHCI) Driver
ohci_hcd 0000:00:03.0: PCI INT A -> GSI 20 (level, low) -> IRQ 20
ohci_hcd 0000:00:03.0: OHCI Host Controller
ohci_hcd 0000:00:03.0: new USB bus registered, assigned bus number 2
ohci_hcd 0000:00:03.0: irq 20, io mem 0xe1104000
usb usb2: configuration #1 chosen from 1 choice
hub 2-0:1.0: USB hub found
hub 2-0:1.0: 3 ports detected
ohci_hcd 0000:00:03.1: PCI INT B -> GSI 21 (level, low) -> IRQ 21
ohci_hcd 0000:00:03.1: OHCI Host Controller
ohci_hcd 0000:00:03.1: new USB bus registered, assigned bus number 3
ohci_hcd 0000:00:03.1: irq 21, io mem 0xe1100000
usb usb3: configuration #1 chosen from 1 choice
hub 3-0:1.0: USB hub found
hub 3-0:1.0: 3 ports detected
ohci_hcd 0000:00:03.2: PCI INT C -> GSI 22 (level, low) -> IRQ 22
ohci_hcd 0000:00:03.2: OHCI Host Controller
ohci_hcd 0000:00:03.2: new USB bus registered, assigned bus number 4
ohci_hcd 0000:00:03.2: irq 22, io mem 0xe1101000
usb usb4: configuration #1 chosen from 1 choice
hub 4-0:1.0: USB hub found
hub 4-0:1.0: 2 ports detected
uhci_hcd: USB Universal Host Controller Interface driver
SCSI subsystem initialized
Driver 'sd' needs updating - please use bus_type methods
libata version 3.00 loaded.
pata_sis 0000:00:02.5: version 0.5.2
pata_sis 0000:00:02.5: PCI INT A -> GSI 16 (level, low) -> IRQ 16
scsi0 : pata_sis
scsi1 : pata_sis
ata1: PATA max UDMA/133 cmd 0x1f0 ctl 0x3f6 bmdma 0x4000 irq 14
ata2: PATA max UDMA/133 cmd 0x170 ctl 0x376 bmdma 0x4008 irq 15
input: ImPS/2 Logitech Wheel Mouse as /class/input/input0
input: AT Translated Set 2 keyboard as /class/input/input1
sata_sis 0000:00:05.0: version 1.0
sata_sis 0000:00:05.0: PCI INT A -> GSI 17 (level, low) -> IRQ 17
sata_sis 0000:00:05.0: Detected SiS 180/181/964 chipset in SATA mode
scsi2 : sata_sis
scsi3 : sata_sis
ata3: SATA max UDMA/133 cmd 0xd800 ctl 0xdc00 bmdma 0xe800 irq 17
ata4: SATA max UDMA/133 cmd 0xe000 ctl 0xe400 bmdma 0xe808 irq 17
ata3: SATA link up 1.5 Gbps (SStatus 113 SControl 300)
ata3.00: ATA-7: ST3808110AS, 3.AAE, max UDMA/133
ata3.00: 156301488 sectors, multi 16: LBA48 NCQ (depth 0/32)
ata3.00: configured for UDMA/133
ata4: SATA link down (SStatus 0 SControl 300)
scsi 2:0:0:0: Direct-Access     ATA      ST3808110AS      3.AA PQ: 0 ANSI: 5
sd 2:0:0:0: [sda] 156301488 512-byte hardware sectors: (80.0GB/74.5GiB)
sd 2:0:0:0: [sda] Write Protect is off
sd 2:0:0:0: [sda] Mode Sense: 00 3a 00 00
sd 2:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
sd 2:0:0:0: [sda] 156301488 512-byte hardware sectors: (80.0GB/74.5GiB)
sd 2:0:0:0: [sda] Write Protect is off
sd 2:0:0:0: [sda] Mode Sense: 00 3a 00 00
sd 2:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
 sda: sda1 sda2 < sda5 sda6 sda7 sda8 sda9 >
sd 2:0:0:0: [sda] Attached SCSI disk
kjournald starting.  Commit interval 5 seconds
EXT3-fs: mounted filesystem with ordered data mode.
input: Power Button (FF) as /class/input/input2
ACPI: Power Button (FF) [PWRF]
input: Power Button (CM) as /class/input/input3
ACPI: Power Button (CM) [PWRB]
input: Sleep Button (CM) as /class/input/input4
ACPI: Sleep Button (CM) [FUTS]
sd 2:0:0:0: Attached scsi generic sg0 type 0
r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
r8169 0000:00:0e.0: PCI INT A -> GSI 18 (level, low) -> IRQ 18
r8169 0000:00:0e.0: no PCI Express capability
eth0: RTL8110s at 0xf7fd8000, 00:16:ec:2e:b7:e0, XID 04000000 IRQ 18
parport_pc 00:09: reported by Plug and Play ACPI
parport0: PC-style at 0x378 (0x778), irq 7 [PCSPP,TRISTATE]
device-mapper: ioctl: 4.14.0-ioctl (2008-04-23) initialised: dm-devel@redhat.com
EXT3 FS on sda9, internal journal
kjournald starting.  Commit interval 5 seconds
EXT3 FS on sda8, internal journal
EXT3-fs: mounted filesystem with ordered data mode.
Adding 1052216k swap on /dev/sda7.  Priority:-1 extents:1 across:1052216k
warning: process `kudzu' used the deprecated sysctl system call with 1.23.
r8169: eth0: link up
r8169: eth0: link up
warning: `dbus-daemon' uses 32-bit capabilities (legacy support in use)
virbr0: Dropping NETIF_F_UFO since no NETIF_F_HW_CSUM feature.
CPU0 attaching NULL sched-domain.
CPU1 attaching NULL sched-domain.
CPU0 attaching sched-domain:
 domain 0: span 0-1 level CPU
  groups: 0 1
CPU1 attaching sched-domain:
 domain 0: span 0-1 level CPU
  groups: 1 0

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  7:21                   ` Li Zefan
@ 2008-10-21  8:18                     ` KAMEZAWA Hiroyuki
  2008-10-21  8:34                       ` Mel Gorman
  2008-10-21  8:35                       ` Li Zefan
  0 siblings, 2 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  8:18 UTC (permalink / raw)
  To: Li Zefan; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 15:21:07 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:
> dmesg is attached.
> 
Thanks....I think I caught some. (added Mel Gorman to CC:)

NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.

So, if there is a hole between zones, node->spanned_pages doesn't mean
the length of the node's memmap.... (then, some holes can be skipped.)

OMG....Could you try this ? 

-Kame
==
NODE_DATA(nid)->node_spanned_pages doesn't mean the width of the node's
memory but means the sum of spanned_pages in all zones of the node.

alloc_node_page_cgroup() misunderstands it. This patch tries to use
the same algorithm as alloc_node_mem_map() for allocating page_cgroup()
for the node.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

 mm/page_cgroup.c |   17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

Index: linux-2.6.27/mm/page_cgroup.c
===================================================================
--- linux-2.6.27.orig/mm/page_cgroup.c
+++ linux-2.6.27/mm/page_cgroup.c
@@ -41,10 +41,18 @@ static int __init alloc_node_page_cgroup
 {
 	struct page_cgroup *base, *pc;
 	unsigned long table_size;
-	unsigned long start_pfn, nr_pages, index;
+	unsigned long start, end, start_pfn, nr_pages, index;
 
-	start_pfn = NODE_DATA(nid)->node_start_pfn;
-	nr_pages = NODE_DATA(nid)->node_spanned_pages;
+	/*
+	 * Instead of allocating page_cgroup for [start, end)
+	 * We allocate page_cgroup to the same size of mem_map.
+	 * See page_alloc.c::alloc_node_mem_map()
+	 */
+	start = NODE_DATA(nid)->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+	end = NODE_DATA(nid)->node_start_pfn
+			+ NODE_DATA(nid)->node_spanned_pages;
+	end = ALIGN(end, MAX_ORDER_NR_PAGES);
+	nr_pages = end - start;
 
 	table_size = sizeof(struct page_cgroup) * nr_pages;
 
@@ -52,6 +60,9 @@ static int __init alloc_node_page_cgroup
 			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 	if (!base)
 		return -ENOMEM;
+
+	start_pfn = NODE_DATA(nid)->node_start_pfn;
+	base = base + start_pfn - start;
 	for (index = 0; index < nr_pages; index++) {
 		pc = base + index;
 		__init_page_cgroup(pc, start_pfn + index);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  8:18                     ` KAMEZAWA Hiroyuki
@ 2008-10-21  8:34                       ` Mel Gorman
  2008-10-21  8:38                         ` KAMEZAWA Hiroyuki
  2008-10-21  8:35                       ` Li Zefan
  1 sibling, 1 reply; 60+ messages in thread
From: Mel Gorman @ 2008-10-21  8:34 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Li Zefan, balbir, Paul Menage, Daisuke Nishimura, linux-mm

On Tue, Oct 21, 2008 at 05:18:01PM +0900, KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 15:21:07 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> > dmesg is attached.
> > 
> Thanks....I think I caught some. (added Mel Gorman to CC:)
> 
> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> 
> So, If there is a hole between zone, node->spanned_pages doesn't mean
> length of node's memmap....(then, some hole can be skipped.)
> 

This is correct. pgdat->node_spanned_pages is the range of PFNs the node
covers. In some cases, this can even overlap other nodes. There can be
memory holes and there is no guarantee there is memmap present for the holes.
The number of actual pages is pgdat->node_present_pages.

> OMG....Could you try this ? 
> 
> -Kame
> ==
> NODE_DATA(nid)->node_spanned_pages doesn't means width of node's memory
> but means sum of spanned_pages in all zones of node.
> 

Does not necessarily mean that either. Conceivably there could be gaps
between the zones.

> alloc_node_page_cgroup() misunderstand it. This patch tries to use
> the same algorithm as alloc_node_mem_map() for allocating page_cgroup()
> for node.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
>  mm/page_cgroup.c |   17 ++++++++++++++---
>  1 file changed, 14 insertions(+), 3 deletions(-)
> 
> Index: linux-2.6.27/mm/page_cgroup.c
> ===================================================================
> --- linux-2.6.27.orig/mm/page_cgroup.c
> +++ linux-2.6.27/mm/page_cgroup.c
> @@ -41,10 +41,18 @@ static int __init alloc_node_page_cgroup
>  {
>  	struct page_cgroup *base, *pc;
>  	unsigned long table_size;
> -	unsigned long start_pfn, nr_pages, index;
> +	unsigned long start, end, start_pfn, nr_pages, index;
>  
> -	start_pfn = NODE_DATA(nid)->node_start_pfn;
> -	nr_pages = NODE_DATA(nid)->node_spanned_pages;
> +	/*
> +	 * Instead of allocating page_cgroup for [start, end)
> +	 * We allocate page_cgroup to the same size of mem_map.
> +	 * See page_alloc.c::alloc_node_mem_map()
> +	 */
> +	start = NODE_DATA(nid)->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
> +	end = NODE_DATA(nid)->node_start_pfn
> +			+ NODE_DATA(nid)->node_spanned_pages;
> +	end = ALIGN(end, MAX_ORDER_NR_PAGES);
> +	nr_pages = end - start;
>  

I don't know what this function is doing, but that will calculate nr_pages
to be the full width of a node, holes and all which is what I think you're
trying to do. Again, remember this could cover another node as you can have
a situation where the pfn ranges are

      node1_pages   |   node0_pages	|  node1_pages
start <---------------------------------------------->end

Maybe this is not a problem for you. It all depends on how you map a PFN
to a table. There is also a concern for memory usage as;

>  	table_size = sizeof(struct page_cgroup) * nr_pages;
>  

this is potentially a very large table.

> @@ -52,6 +60,9 @@ static int __init alloc_node_page_cgroup
>  			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>  	if (!base)
>  		return -ENOMEM;
> +
> +	start_pfn = NODE_DATA(nid)->node_start_pfn;
> +	base = base + start_pfn - start;
>  	for (index = 0; index < nr_pages; index++) {
>  		pc = base + index;
>  		__init_page_cgroup(pc, start_pfn + index);
> 

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  8:18                     ` KAMEZAWA Hiroyuki
  2008-10-21  8:34                       ` Mel Gorman
@ 2008-10-21  8:35                       ` Li Zefan
  2008-10-21  8:36                         ` KAMEZAWA Hiroyuki
  2008-10-21  8:57                         ` KAMEZAWA Hiroyuki
  1 sibling, 2 replies; 60+ messages in thread
From: Li Zefan @ 2008-10-21  8:35 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 15:21:07 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
>> dmesg is attached.
>>
> Thanks....I think I caught some. (added Mel Gorman to CC:)
> 
> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> 
> So, If there is a hole between zone, node->spanned_pages doesn't mean
> length of node's memmap....(then, some hole can be skipped.)
> 
> OMG....Could you try this ? 
> 

No luck, the same bug still exists. :(

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  8:35                       ` Li Zefan
@ 2008-10-21  8:36                         ` KAMEZAWA Hiroyuki
  2008-10-21  8:57                         ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  8:36 UTC (permalink / raw)
  To: Li Zefan; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 16:35:09 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 15:21:07 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> >> dmesg is attached.
> >>
> > Thanks....I think I caught some. (added Mel Gorman to CC:)
> > 
> > NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> > 
> > So, If there is a hole between zone, node->spanned_pages doesn't mean
> > length of node's memmap....(then, some hole can be skipped.)
> > 
> > OMG....Could you try this ? 
> > 
> 
> No luck, the same bug still exists. :(
> 
Thank you...

-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  8:34                       ` Mel Gorman
@ 2008-10-21  8:38                         ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  8:38 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Li Zefan, balbir, Paul Menage, Daisuke Nishimura, linux-mm

On Tue, 21 Oct 2008 09:34:54 +0100
mel@skynet.ie (Mel Gorman) wrote:

> On Tue, Oct 21, 2008 at 05:18:01PM +0900, KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 15:21:07 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > dmesg is attached.
> > > 
> > Thanks....I think I caught some. (added Mel Gorman to CC:)
> > 
> > NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> > 
> > So, If there is a hole between zone, node->spanned_pages doesn't mean
> > length of node's memmap....(then, some hole can be skipped.)
> > 
> 
> This is correct. pgdat->node_spanned_pages is the range of PFNs the node
> covers. In some cases, this can even overlap other nodes. There can be
> memory holes and there is no guarantee there is memmap present for the holes.
> The number of actual pages is pgdat->node_present_pages.
> 

Thank you for clarification.

> > OMG....Could you try this ? 
> > 
> > -Kame
> > ==
> > NODE_DATA(nid)->node_spanned_pages doesn't means width of node's memory
> > but means sum of spanned_pages in all zones of node.
> > 
> 
> Does not necessarily mean that either. Conceivably there could be gaps
> between the zones.
> 
I see.

> > alloc_node_page_cgroup() misunderstand it. This patch tries to use
> > the same algorithm as alloc_node_mem_map() for allocating page_cgroup()
> > for node.
> > 
> > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> > 
> >  mm/page_cgroup.c |   17 ++++++++++++++---
> >  1 file changed, 14 insertions(+), 3 deletions(-)
> > 
> > Index: linux-2.6.27/mm/page_cgroup.c
> > ===================================================================
> > --- linux-2.6.27.orig/mm/page_cgroup.c
> > +++ linux-2.6.27/mm/page_cgroup.c
> > @@ -41,10 +41,18 @@ static int __init alloc_node_page_cgroup
> >  {
> >  	struct page_cgroup *base, *pc;
> >  	unsigned long table_size;
> > -	unsigned long start_pfn, nr_pages, index;
> > +	unsigned long start, end, start_pfn, nr_pages, index;
> >  
> > -	start_pfn = NODE_DATA(nid)->node_start_pfn;
> > -	nr_pages = NODE_DATA(nid)->node_spanned_pages;
> > +	/*
> > +	 * Instead of allocating page_cgroup for [start, end)
> > +	 * We allocate page_cgroup to the same size of mem_map.
> > +	 * See page_alloc.c::alloc_node_mem_map()
> > +	 */
> > +	start = NODE_DATA(nid)->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
> > +	end = NODE_DATA(nid)->node_start_pfn
> > +			+ NODE_DATA(nid)->node_spanned_pages;
> > +	end = ALIGN(end, MAX_ORDER_NR_PAGES);
> > +	nr_pages = end - start;
> >  
> 
> I don't know what this function is doing, but that will calculate nr_pages
> to be the full width of a node, holes and all which is what I think you're
> trying to do. Again, remember this could cover another node as you can have
> a situation where the pfn ranges are
> 
>       node1_pages   |   node0_pages	|  node1_pages
> start <---------------------------------------------->end
> 
> Maybe this is not a problem for you. It all depends on how you map a PFN
> to a table. There is also a concern for memory usage as;
> 
> >  	table_size = sizeof(struct page_cgroup) * nr_pages;
> >  
> 
> this is potentially a very large table.
> 

Yes, I know. Typical big-address-space users will use the SPARSEMEM version.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  8:35                       ` Li Zefan
  2008-10-21  8:36                         ` KAMEZAWA Hiroyuki
@ 2008-10-21  8:57                         ` KAMEZAWA Hiroyuki
  2008-10-21  9:13                           ` Li Zefan
  2008-10-21  9:33                           ` Daisuke Nishimura
  1 sibling, 2 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  8:57 UTC (permalink / raw)
  To: Li Zefan; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 16:35:09 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 15:21:07 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> >> dmesg is attached.
> >>
> > Thanks....I think I caught some. (added Mel Gorman to CC:)
> > 
> > NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> > 
> > So, If there is a hole between zone, node->spanned_pages doesn't mean
> > length of node's memmap....(then, some hole can be skipped.)
> > 
> > OMG....Could you try this ? 
> > 
> 
> No luck, the same bug still exists. :(
> 
This is a slightly fixed version.

please..
-Kame
==
NODE_DATA(nid)->node_spanned_pages doesn't mean the width of the node's
memory.

alloc_node_page_cgroup() misunderstands it. This patch tries to use
the same algorithm as alloc_node_mem_map() for allocating page_cgroup()
for the node.

Changelog:
 - fixed range of initialization loop.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

 mm/page_cgroup.c |   19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

Index: linux-2.6.27/mm/page_cgroup.c
===================================================================
--- linux-2.6.27.orig/mm/page_cgroup.c
+++ linux-2.6.27/mm/page_cgroup.c
@@ -9,6 +9,8 @@
 static void __meminit
 __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
 {
+	if (!pfn_valid(pfn))
+		return;
 	pc->flags = 0;
 	pc->mem_cgroup = NULL;
 	pc->page = pfn_to_page(pfn);
@@ -41,10 +43,18 @@ static int __init alloc_node_page_cgroup
 {
 	struct page_cgroup *base, *pc;
 	unsigned long table_size;
-	unsigned long start_pfn, nr_pages, index;
+	unsigned long start, end, start_pfn, nr_pages, index;
 
+	/*
+	 * Instead of allocating page_cgroup for [start, end)
+	 * We allocate page_cgroup to the same size of mem_map.
+	 * See page_alloc.c::alloc_node_mem_map()
+	 */
 	start_pfn = NODE_DATA(nid)->node_start_pfn;
-	nr_pages = NODE_DATA(nid)->node_spanned_pages;
+	start = start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+	end = start_pfn	+ NODE_DATA(nid)->node_spanned_pages;
+	end = ALIGN(end, MAX_ORDER_NR_PAGES);
+	nr_pages = end - start;
 
 	table_size = sizeof(struct page_cgroup) * nr_pages;
 
@@ -52,11 +62,12 @@ static int __init alloc_node_page_cgroup
 			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 	if (!base)
 		return -ENOMEM;
+
 	for (index = 0; index < nr_pages; index++) {
 		pc = base + index;
-		__init_page_cgroup(pc, start_pfn + index);
+		__init_page_cgroup(pc, start + index);
 	}
-	NODE_DATA(nid)->node_page_cgroup = base;
+	NODE_DATA(nid)->node_page_cgroup = base + start_pfn - start;
 	total_usage += table_size;
 	return 0;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  8:57                         ` KAMEZAWA Hiroyuki
@ 2008-10-21  9:13                           ` Li Zefan
  2008-10-21  9:25                             ` KAMEZAWA Hiroyuki
  2008-10-21  9:33                           ` Daisuke Nishimura
  1 sibling, 1 reply; 60+ messages in thread
From: Li Zefan @ 2008-10-21  9:13 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 16:35:09 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Tue, 21 Oct 2008 15:21:07 +0800
>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>> dmesg is attached.
>>>>
>>> Thanks....I think I caught some. (added Mel Gorman to CC:)
>>>
>>> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
>>>
>>> So, If there is a hole between zone, node->spanned_pages doesn't mean
>>> length of node's memmap....(then, some hole can be skipped.)
>>>
>>> OMG....Could you try this ? 
>>>
>> No luck, the same bug still exists. :(
>>
> This is a little fixed one..
> 

I tried the patch, but it doesn't solve the problem..

> please..
> -Kame
> ==
> NODE_DATA(nid)->node_spanned_pages doesn't means width of node's memory.
> 
> alloc_node_page_cgroup() misunderstand it. This patch tries to use
> the same algorithm as alloc_node_mem_map() for allocating page_cgroup()
> for node.
> 
> Changelog:
>  - fixed range of initialization loop.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
>  mm/page_cgroup.c |   19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
> 
> Index: linux-2.6.27/mm/page_cgroup.c
> ===================================================================
> --- linux-2.6.27.orig/mm/page_cgroup.c
> +++ linux-2.6.27/mm/page_cgroup.c
> @@ -9,6 +9,8 @@
>  static void __meminit
>  __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
>  {
> +	if (!pfn_valid(pfn))
> +		return;
>  	pc->flags = 0;
>  	pc->mem_cgroup = NULL;
>  	pc->page = pfn_to_page(pfn);
> @@ -41,10 +43,18 @@ static int __init alloc_node_page_cgroup
>  {
>  	struct page_cgroup *base, *pc;
>  	unsigned long table_size;
> -	unsigned long start_pfn, nr_pages, index;
> +	unsigned long start, end, start_pfn, nr_pages, index;
>  
> +	/*
> +	 * Instead of allocating page_cgroup for [start, end)
> +	 * We allocate page_cgroup to the same size of mem_map.
> +	 * See page_alloc.c::alloc_node_mem_map()
> +	 */
>  	start_pfn = NODE_DATA(nid)->node_start_pfn;
> -	nr_pages = NODE_DATA(nid)->node_spanned_pages;
> +	start = start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
> +	end = start_pfn	+ NODE_DATA(nid)->node_spanned_pages;
> +	end = ALIGN(end, MAX_ORDER_NR_PAGES);
> +	nr_pages = end - start;
>  
>  	table_size = sizeof(struct page_cgroup) * nr_pages;
>  
> @@ -52,11 +62,12 @@ static int __init alloc_node_page_cgroup
>  			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>  	if (!base)
>  		return -ENOMEM;
> +
>  	for (index = 0; index < nr_pages; index++) {
>  		pc = base + index;
> -		__init_page_cgroup(pc, start_pfn + index);
> +		__init_page_cgroup(pc, start + index);
>  	}
> -	NODE_DATA(nid)->node_page_cgroup = base;
> +	NODE_DATA(nid)->node_page_cgroup = base + start_pfn - start;
>  	total_usage += table_size;
>  	return 0;
>  }
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  9:13                           ` Li Zefan
@ 2008-10-21  9:25                             ` KAMEZAWA Hiroyuki
  2008-10-21  9:54                               ` Li Zefan
  0 siblings, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  9:25 UTC (permalink / raw)
  To: Li Zefan; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 17:13:20 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 16:35:09 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > 
> >> KAMEZAWA Hiroyuki wrote:
> >>> On Tue, 21 Oct 2008 15:21:07 +0800
> >>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> >>>> dmesg is attached.
> >>>>
> >>> Thanks....I think I caught some. (added Mel Gorman to CC:)
> >>>
> >>> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> >>>
> >>> So, If there is a hole between zone, node->spanned_pages doesn't mean
> >>> length of node's memmap....(then, some hole can be skipped.)
> >>>
> >>> OMG....Could you try this ? 
> >>>
> >> No luck, the same bug still exists. :(
> >>
> > This is a little fixed one..
> > 
> 
> I tried the patch, but it doesn't solve the problem..
> 
Hmm.. Can you catch the "pfn" of the troublesome page_cgroup ?
With a patch like this ?

==
Index: linux-2.6.27/mm/memcontrol.c
===================================================================
--- linux-2.6.27.orig/mm/memcontrol.c
+++ linux-2.6.27/mm/memcontrol.c
@@ -544,6 +544,10 @@ static int mem_cgroup_charge_common(stru
 
 		goto done;
 	}
+
+	printk(KERN_DEBUG "pc %p pc->page %p page %p pfn %lx\n",
+			pc, pc->page, page, page_to_pfn(page));
+	BUG_ON(!pc->page);
 	pc->mem_cgroup = mem;
 	/*
 	 * If a page is accounted as a page cache, insert to inactive list.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  8:57                         ` KAMEZAWA Hiroyuki
  2008-10-21  9:13                           ` Li Zefan
@ 2008-10-21  9:33                           ` Daisuke Nishimura
  2008-10-21  9:41                             ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-21  9:33 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: nishimura, Li Zefan, balbir, Paul Menage, linux-mm, mel

[-- Attachment #1: Type: text/plain, Size: 3132 bytes --]

On Tue, 21 Oct 2008 17:57:35 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 21 Oct 2008 16:35:09 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
> > KAMEZAWA Hiroyuki wrote:
> > > On Tue, 21 Oct 2008 15:21:07 +0800
> > > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > >> dmesg is attached.
> > >>
> > > Thanks....I think I caught some. (added Mel Gorman to CC:)
> > > 
> > > NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> > > 
> > > So, If there is a hole between zone, node->spanned_pages doesn't mean
> > > length of node's memmap....(then, some hole can be skipped.)
> > > 
> > > OMG....Could you try this ? 
> > > 
> > 
> > No luck, the same bug still exists. :(
> > 
> This is a little fixed one..
> 
I can reproduce a similar problem (hang on boot) on 2.6.27-git9,
but this patch doesn't help in my environment either...

I attach a console log(I've not seen NULL pointer dereference yet).


Daisuke Nishimura.

> please..
> -Kame
> ==
> NODE_DATA(nid)->node_spanned_pages doesn't means width of node's memory.
> 
> alloc_node_page_cgroup() misunderstand it. This patch tries to use
> the same algorithm as alloc_node_mem_map() for allocating page_cgroup()
> for node.
> 
> Changelog:
>  - fixed range of initialization loop.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
>  mm/page_cgroup.c |   19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
> 
> Index: linux-2.6.27/mm/page_cgroup.c
> ===================================================================
> --- linux-2.6.27.orig/mm/page_cgroup.c
> +++ linux-2.6.27/mm/page_cgroup.c
> @@ -9,6 +9,8 @@
>  static void __meminit
>  __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
>  {
> +	if (!pfn_valid(pfn))
> +		return;
>  	pc->flags = 0;
>  	pc->mem_cgroup = NULL;
>  	pc->page = pfn_to_page(pfn);
> @@ -41,10 +43,18 @@ static int __init alloc_node_page_cgroup
>  {
>  	struct page_cgroup *base, *pc;
>  	unsigned long table_size;
> -	unsigned long start_pfn, nr_pages, index;
> +	unsigned long start, end, start_pfn, nr_pages, index;
>  
> +	/*
> +	 * Instead of allocating page_cgroup for [start, end)
> +	 * We allocate page_cgroup to the same size of mem_map.
> +	 * See page_alloc.c::alloc_node_mem_map()
> +	 */
>  	start_pfn = NODE_DATA(nid)->node_start_pfn;
> -	nr_pages = NODE_DATA(nid)->node_spanned_pages;
> +	start = start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
> +	end = start_pfn	+ NODE_DATA(nid)->node_spanned_pages;
> +	end = ALIGN(end, MAX_ORDER_NR_PAGES);
> +	nr_pages = end - start;
>  
>  	table_size = sizeof(struct page_cgroup) * nr_pages;
>  
> @@ -52,11 +62,12 @@ static int __init alloc_node_page_cgroup
>  			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>  	if (!base)
>  		return -ENOMEM;
> +
>  	for (index = 0; index < nr_pages; index++) {
>  		pc = base + index;
> -		__init_page_cgroup(pc, start_pfn + index);
> +		__init_page_cgroup(pc, start + index);
>  	}
> -	NODE_DATA(nid)->node_page_cgroup = base;
> +	NODE_DATA(nid)->node_page_cgroup = base + start_pfn - start;
>  	total_usage += table_size;
>  	return 0;
>  }
> 


[-- Attachment #2: 2.6.27-git9-kame.log --]
[-- Type: application/octet-stream, Size: 18272 bytes --]

BIOS EBDA/lowmem at: 0009c000/0009c000
Initializing cgroup subsys cpuset
Initializing cgroup subsys cpu
Linux version 2.6.27-git9-kame (nishimura@GibsonE) (gcc version 4.1.2 20070626 (Red Hat 4.1.2-14)) #1 SMP Tue Oct 21 18:01:12 JST 2008
KERNEL supported cpus:
  Intel GenuineIntel
  AMD AuthenticAMD
  NSC Geode by NSC
  Cyrix CyrixInstead
  Centaur CentaurHauls
  Transmeta GenuineTMx86
  Transmeta TransmetaCPU
  UMC UMC UMC UMC
BIOS-provided physical RAM map:
 BIOS-e820: 0000000000000000 - 000000000009c000 (usable)
 BIOS-e820: 000000000009c000 - 00000000000a0000 (reserved)
 BIOS-e820: 00000000000d2000 - 00000000000d4000 (reserved)
 BIOS-e820: 00000000000dc000 - 0000000000100000 (reserved)
 BIOS-e820: 0000000000100000 - 00000000cfaf0000 (usable)
 BIOS-e820: 00000000cfaf0000 - 00000000cfafc000 (ACPI data)
 BIOS-e820: 00000000cfafc000 - 00000000cfb00000 (ACPI NVS)
 BIOS-e820: 00000000cfb00000 - 00000000cfc00000 (usable)
 BIOS-e820: 00000000cfc00000 - 00000000d0000000 (reserved)
 BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved)
 BIOS-e820: 00000000fec00000 - 00000000fec10000 (reserved)
 BIOS-e820: 00000000fee00000 - 00000000fee01000 (reserved)
 BIOS-e820: 00000000ff800000 - 00000000ffc00000 (reserved)
 BIOS-e820: 00000000fff00000 - 0000000100000000 (reserved)
 BIOS-e820: 0000000100000000 - 00000003b0000000 (usable)
DMI 2.3 present.
last_pfn = 0x3b0000 max_arch_pfn = 0x1000000
RAMDISK: 37d9e000 - 37fef778
Allocated new RAMDISK: 00100000 - 00351778
Move RAMDISK from 0000000037d9e000 - 0000000037fef777 to 00100000 - 00351777
ACPI: RSDP 000F6150, 0014 (r0 PTLTD )
ACPI: RSDT CFAF3ED5, 0034 (r1 PTLTD    RSDT    6040000  LTP        0)
ACPI: FACP CFAFBDCC, 0074 (r1 NEC    033D      6040000 NEC         0)
ACPI: DSDT CFAF3F09, 7EC3 (r1 NEC    033D      6040000 MSFT  100000E)
ACPI: FACS CFAFCFC0, 0040
ACPI: APIC CFAFBE40, 015C (r1 PTLTD      APIC    6040000  LTP        0)
ACPI: BOOT CFAFBF9C, 0028 (r1 PTLTD  $SBFTBL$  6040000  LTP        1)
ACPI: MCFG CFAFBFC4, 003C (r1 PTLTD    MCFG    6040000  LTP        0)
14216MB HIGHMEM available.
887MB LOWMEM available.
  mapped low ram: 0 - 377fe000
  low ram: 00000000 - 377fe000
  bootmap 0000a000 - 00010f00
(9 early reservations) ==> bootmem [0000000000 - 00377fe000]
  #0 [0000000000 - 0000001000]   BIOS data page ==> [0000000000 - 0000001000]
  #1 [0000001000 - 0000002000]    EX TRAMPOLINE ==> [0000001000 - 0000002000]
  #2 [0000006000 - 0000007000]       TRAMPOLINE ==> [0000006000 - 0000007000]
  #3 [0000400000 - 0000d96bac]    TEXT DATA BSS ==> [0000400000 - 0000d96bac]
  #4 [0000d97000 - 0000da2000]    INIT_PG_TABLE ==> [0000d97000 - 0000da2000]
  #5 [000009c000 - 0000100000]    BIOS reserved ==> [000009c000 - 0000100000]
  #6 [0000007000 - 000000a000]          PGTABLE ==> [0000007000 - 000000a000]
  #7 [0000100000 - 0000351778]      NEW RAMDISK ==> [0000100000 - 0000351778]
  #8 [000000a000 - 0000011000]          BOOTMAP ==> [000000a000 - 0000011000]
found SMP MP-table at [c00f6180] 000f6180
Reserving 128MB of memory at 16MB for crashkernel (System RAM: 15104MB)
Zone PFN ranges:
  DMA      0x00000000 -> 0x00001000
  Normal   0x00001000 -> 0x000377fe
  HighMem  0x000377fe -> 0x003b0000
Movable zone start PFN for each node
early_node_map[4] active PFN ranges
    0: 0x00000000 -> 0x0000009c
    0: 0x00000100 -> 0x000cfaf0
    0: 0x000cfb00 -> 0x000cfc00
    0: 0x00100000 -> 0x003b0000
Using APIC driver default
ACPI: PM-Timer IO Port: 0x808
ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
ACPI: LAPIC (acpi_id[0x01] lapic_id[0x06] enabled)
ACPI: LAPIC (acpi_id[0x02] lapic_id[0x08] enabled)
ACPI: LAPIC (acpi_id[0x03] lapic_id[0x0e] enabled)
ACPI: LAPIC (acpi_id[0x04] lapic_id[0x02] enabled)
ACPI: LAPIC (acpi_id[0x05] lapic_id[0x04] enabled)
ACPI: LAPIC (acpi_id[0x06] lapic_id[0x0a] enabled)
ACPI: LAPIC (acpi_id[0x07] lapic_id[0x0c] enabled)
ACPI: LAPIC (acpi_id[0x08] lapic_id[0x01] enabled)
ACPI: LAPIC (acpi_id[0x09] lapic_id[0x07] enabled)
ACPI: LAPIC (acpi_id[0x0a] lapic_id[0x09] enabled)
ACPI: LAPIC (acpi_id[0x0b] lapic_id[0x0f] enabled)
ACPI: LAPIC (acpi_id[0x0c] lapic_id[0x03] enabled)
ACPI: LAPIC (acpi_id[0x0d] lapic_id[0x05] enabled)
ACPI: LAPIC (acpi_id[0x0e] lapic_id[0x0b] enabled)
ACPI: LAPIC (acpi_id[0x0f] lapic_id[0x0d] enabled)
ACPI: LAPIC_NMI (acpi_id[0x00] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x01] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x02] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x03] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x04] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x05] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x06] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x07] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x08] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x09] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x0a] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x0b] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x0c] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x0d] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x0e] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x0f] high edge lint[0x1])
ACPI: IOAPIC (id[0x00] address[0xfec00000] gsi_base[0])
IOAPIC[0]: apic_id 0, version 32, address 0xfec00000, GSI 0-23
ACPI: IOAPIC (id[0x01] address[0xfec85000] gsi_base[24])
IOAPIC[1]: apic_id 1, version 32, address 0xfec85000, GSI 24-47
ACPI: IOAPIC (id[0x02] address[0xfec85400] gsi_base[48])
IOAPIC[2]: apic_id 2, version 32, address 0xfec85400, GSI 48-71
ACPI: IOAPIC (id[0x03] address[0xfec86000] gsi_base[72])
IOAPIC[3]: apic_id 3, version 32, address 0xfec86000, GSI 72-95
ACPI: IOAPIC (id[0x04] address[0xfec86400] gsi_base[96])
IOAPIC[4]: apic_id 4, version 32, address 0xfec86400, GSI 96-119
ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 high edge)
ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
Enabling APIC mode:  Flat.  Using 5 I/O APICs
Using ACPI (MADT) for SMP configuration information
SMP: Allowing 16 CPUs, 0 hotplug CPUs
Allocating PCI resources starting at d1000000 (gap: d0000000:10000000)
PERCPU: Allocating 1216512 bytes of per cpu data
Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 3616011
Kernel command line: ro root=LABEL=/ crashkernel=128M@16M console=tty console=ttyS0,115200 nmi_watchdog=0
Enabling fast FPU save and restore... done.
Enabling unmasked SIMD FPU exception support... done.
Initializing CPU#0
PID hash table entries: 4096 (order: 12, 16384 bytes)
TSC: PIT calibration matches PMTIMER. 2 loops
Detected 3000.107 MHz processor.
Console: colour VGA+ 80x25
console [tty0] enabled
console [ttyS0] enabled
Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
... MAX_LOCKDEP_SUBCLASSES:    8
... MAX_LOCK_DEPTH:          48
... MAX_LOCKDEP_KEYS:        8191
... CLASSHASH_SIZE:           4096
... MAX_LOCKDEP_ENTRIES:     8192
... MAX_LOCKDEP_CHAINS:      16384
... CHAINHASH_SIZE:          8192
 memory used by lock dependency info: 2463 kB
 per task-struct memory footprint: 1920 bytes
------------------------
| Locking API testsuite:
----------------------------------------------------------------------------
                                 | spin |wlock |rlock |mutex | wsem | rsem |
  --------------------------------------------------------------------------
                     A-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
                 A-B-B-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
             A-B-B-C-C-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
             A-B-C-A-B-C deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
         A-B-B-C-C-D-D-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
         A-B-C-D-B-D-D-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
         A-B-C-D-B-C-D-A deadlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
                    double unlock:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
                  initialize held:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
                 bad unlock order:  ok  |  ok  |  ok  |  ok  |  ok  |  ok  |
  --------------------------------------------------------------------------
              recursive read-lock:             |  ok  |             |  ok  |
           recursive read-lock #2:             |  ok  |             |  ok  |
            mixed read-write-lock:             |  ok  |             |  ok  |
            mixed write-read-lock:             |  ok  |             |  ok  |
  --------------------------------------------------------------------------
     hard-irqs-on + irq-safe-A/12:  ok  |  ok  |  ok  |
     soft-irqs-on + irq-safe-A/12:  ok  |  ok  |  ok  |
     hard-irqs-on + irq-safe-A/21:  ok  |  ok  |  ok  |
     soft-irqs-on + irq-safe-A/21:  ok  |  ok  |  ok  |
       sirq-safe-A => hirqs-on/12:  ok  |  ok  |  ok  |
       sirq-safe-A => hirqs-on/21:  ok  |  ok  |  ok  |
         hard-safe-A + irqs-on/12:  ok  |  ok  |  ok  |
         soft-safe-A + irqs-on/12:  ok  |  ok  |  ok  |
         hard-safe-A + irqs-on/21:  ok  |  ok  |  ok  |
         soft-safe-A + irqs-on/21:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #1/123:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #1/123:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #1/132:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #1/132:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #1/213:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #1/213:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #1/231:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #1/231:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #1/312:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #1/312:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #1/321:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #1/321:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #2/123:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #2/123:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #2/132:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #2/132:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #2/213:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #2/213:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #2/231:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #2/231:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #2/312:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #2/312:  ok  |  ok  |  ok  |
    hard-safe-A + unsafe-B #2/321:  ok  |  ok  |  ok  |
    soft-safe-A + unsafe-B #2/321:  ok  |  ok  |  ok  |
      hard-irq lock-inversion/123:  ok  |  ok  |  ok  |
      soft-irq lock-inversion/123:  ok  |  ok  |  ok  |
      hard-irq lock-inversion/132:  ok  |  ok  |  ok  |
      soft-irq lock-inversion/132:  ok  |  ok  |  ok  |
      hard-irq lock-inversion/213:  ok  |  ok  |  ok  |
      soft-irq lock-inversion/213:  ok  |  ok  |  ok  |
      hard-irq lock-inversion/231:  ok  |  ok  |  ok  |
      soft-irq lock-inversion/231:  ok  |  ok  |  ok  |
      hard-irq lock-inversion/312:  ok  |  ok  |  ok  |
      soft-irq lock-inversion/312:  ok  |  ok  |  ok  |
      hard-irq lock-inversion/321:  ok  |  ok  |  ok  |
      soft-irq lock-inversion/321:  ok  |  ok  |  ok  |
      hard-irq read-recursion/123:  ok  |
      soft-irq read-recursion/123:  ok  |
      hard-irq read-recursion/132:  ok  |
      soft-irq read-recursion/132:  ok  |
      hard-irq read-recursion/213:  ok  |
      soft-irq read-recursion/213:  ok  |
      hard-irq read-recursion/231:  ok  |
      soft-irq read-recursion/231:  ok  |
      hard-irq read-recursion/312:  ok  |
      soft-irq read-recursion/312:  ok  |
      hard-irq read-recursion/321:  ok  |
      soft-irq read-recursion/321:  ok  |
-------------------------------------------------------
Good, all 218 testcases passed! |
---------------------------------
Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
Memory: 14299736k/15466496k available (2396k kernel code, 374880k reserved, 1558k data, 1476k init, 13766600k highmem)
virtual kernel memory layout:
    fixmap  : 0xffc58000 - 0xfffff000   (3740 kB)
    pkmap   : 0xff800000 - 0xffa00000   (2048 kB)
    vmalloc : 0xf7ffe000 - 0xff7fe000   ( 120 MB)
    lowmem  : 0xc0000000 - 0xf77fe000   ( 887 MB)
      .init : 0xc07e1000 - 0xc0952000   (1476 kB)
      .data : 0xc06571b2 - 0xc07dcafc   (1558 kB)
      .text : 0xc0400000 - 0xc06571b2   (2396 kB)
Checking if this processor honours the WP bit even in supervisor mode...Ok.
SLUB: Genslabs=12, HWalign=128, Order=0-3, MinObjects=0, CPUs=16, Nodes=1
Calibrating delay loop (skipped), value calculated using timer frequency.. 6000.21 BogoMIPS (lpj=3000107)
Security Framework initialized
SELinux:  Initializing.
Mount-cache hash table entries: 512
Initializing cgroup subsys cpuacct
Initializing cgroup subsys memory
allocated 77332480 bytes of page_cgroup
please try cgroup_disable=memory option if you don't want
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 0
CPU: Processor Core ID: 0
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#0.
CPU0: Intel P4/Xeon Extended MCE MSRs (24) available
CPU0: Thermal monitoring enabled
using mwait in idle threads.
Checking 'hlt' instruction... OK.
ACPI: Core revision 20080609
..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
CPU0: Genuine Intel(R) CPU 3.00GHz stepping 08
lockdep: fixing up alternatives.
Booting processor 1 APIC 0x6 ip 0x6000
Initializing CPU#1
Calibrating delay using timer specific routine.. 5999.34 BogoMIPS (lpj=2999672)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 1
CPU: Processor Core ID: 1
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#1.
CPU1: Intel P4/Xeon Extended MCE MSRs (24) available
CPU1: Thermal monitoring enabled
CPU1: Genuine Intel(R) CPU 3.00GHz stepping 08
checking TSC synchronization [CPU#0 -> CPU#1]: passed.
lockdep: fixing up alternatives.
Booting processor 2 APIC 0x8 ip 0x6000
Initializing CPU#2
Calibrating delay using timer specific routine.. 5999.29 BogoMIPS (lpj=2999646)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 2
CPU: Processor Core ID: 0
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#2.
CPU2: Intel P4/Xeon Extended MCE MSRs (24) available
CPU2: Thermal monitoring enabled
CPU2: Genuine Intel(R) CPU 3.00GHz stepping 08
checking TSC synchronization [CPU#0 -> CPU#2]: passed.
lockdep: fixing up alternatives.
Booting processor 3 APIC 0xe ip 0x6000
Initializing CPU#3
Calibrating delay using timer specific routine.. 5999.29 BogoMIPS (lpj=2999645)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 3
CPU: Processor Core ID: 1
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#3.
CPU3: Intel P4/Xeon Extended MCE MSRs (24) available
CPU3: Thermal monitoring enabled
CPU3: Genuine Intel(R) CPU 3.00GHz stepping 08
checking TSC synchronization [CPU#0 -> CPU#3]: passed.
lockdep: fixing up alternatives.
Booting processor 4 APIC 0x2 ip 0x6000
Initializing CPU#4
Calibrating delay using timer specific routine.. 5999.34 BogoMIPS (lpj=2999672)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 0
CPU: Processor Core ID: 1
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#4.
CPU4: Intel P4/Xeon Extended MCE MSRs (24) available
CPU4: Thermal monitoring enabled
CPU4: Genuine Intel(R) CPU 3.00GHz stepping 08
checking TSC synchronization [CPU#0 -> CPU#4]: passed.
lockdep: fixing up alternatives.
Booting processor 5 APIC 0x4 ip 0x6000
Initializing CPU#5
Calibrating delay using timer specific routine.. 5999.35 BogoMIPS (lpj=2999678)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 1
CPU: Processor Core ID: 0
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#5.
CPU5: Intel P4/Xeon Extended MCE MSRs (24) available
CPU5: Thermal monitoring enabled
CPU5: Genuine Intel(R) CPU 3.00GHz stepping 08
checking TSC synchronization [CPU#0 -> CPU#5]: passed.
lockdep: fixing up alternatives.
Booting processor 6 APIC 0xa ip 0x6000
Initializing CPU#6
Calibrating delay using timer specific routine.. 5999.28 BogoMIPS (lpj=2999640)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 2
CPU: Processor Core ID: 1
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#6.
CPU6: Intel P4/Xeon Extended MCE MSRs (24) available
CPU6: Thermal monitoring enabled
CPU6: Genuine Intel(R) CPU 3.00GHz stepping 08
checking TSC synchronization [CPU#0 -> CPU#6]: passed.
lockdep: fixing up alternatives.
Booting processor 7 APIC 0xc ip 0x6000
Initializing CPU#7
Calibrating delay using timer specific routine.. 5999.27 BogoMIPS (lpj=2999637)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 3
CPU: Processor Core ID: 0
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#7.
CPU7: Intel P4/Xeon Extended MCE MSRs (24) available
CPU7: Thermal monitoring enabled
CPU7: Genuine Intel(R) CPU 3.00GHz stepping 08
checking TSC synchronization [CPU#0 -> CPU#7]: passed.
lockdep: fixing up alternatives.
Booting processor 8 APIC 0x1 ip 0x6000
Initializing CPU#8
Calibrating delay using timer specific routine.. 5999.22 BogoMIPS (lpj=2999613)
CPU: Trace cache: 12K uops, L1 D cache: 16K
CPU: L2 cache: 2048K
CPU: Physical Processor ID: 0
CPU: Processor Core ID: 0
Intel machine check architecture supported.
Intel machine check reporting enabled on CPU#8.
CPU8: Intel P4/Xeon Extended MCE MSRs (24) available
CPU8: Thermal monitoring enabled
CPU8: Genuine Intel(R) CPU 3.00GHz stepping 08
checking TSC synchronization [CPU#0 -> CPU#8]: passed.

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  9:33                           ` Daisuke Nishimura
@ 2008-10-21  9:41                             ` KAMEZAWA Hiroyuki
  2008-10-21 10:15                               ` Daisuke Nishimura
  0 siblings, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21  9:41 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: Li Zefan, balbir, Paul Menage, linux-mm, mel

On Tue, 21 Oct 2008 18:33:18 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> On Tue, 21 Oct 2008 17:57:35 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Tue, 21 Oct 2008 16:35:09 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > 
> > > KAMEZAWA Hiroyuki wrote:
> > > > On Tue, 21 Oct 2008 15:21:07 +0800
> > > > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > >> dmesg is attached.
> > > >>
> > > > Thanks....I think I caught some. (added Mel Gorman to CC:)
> > > > 
> > > > NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> > > > 
> > > > So, If there is a hole between zone, node->spanned_pages doesn't mean
> > > > length of node's memmap....(then, some hole can be skipped.)
> > > > 
> > > > OMG....Could you try this ? 
> > > > 
> > > 
> > > No luck, the same bug still exists. :(
> > > 
> > This is a little fixed one..
> > 
> I can reproduce a similar problem(hang on boot) on 2.6.27-git9,
> but this patch doesn't help either on my environment...
> 
> I attach a console log(I've not seen NULL pointer dereference yet).
> 
> 
Thanks.... Does it boot well with cgroup_disable=memory ?

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  9:25                             ` KAMEZAWA Hiroyuki
@ 2008-10-21  9:54                               ` Li Zefan
  2008-10-21 10:14                                 ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 60+ messages in thread
From: Li Zefan @ 2008-10-21  9:54 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 17:13:20 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Tue, 21 Oct 2008 16:35:09 +0800
>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>
>>>> KAMEZAWA Hiroyuki wrote:
>>>>> On Tue, 21 Oct 2008 15:21:07 +0800
>>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>>>> dmesg is attached.
>>>>>>
>>>>> Thanks....I think I caught some. (added Mel Gorman to CC:)
>>>>>
>>>>> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
>>>>>
>>>>> So, If there is a hole between zone, node->spanned_pages doesn't mean
>>>>> length of node's memmap....(then, some hole can be skipped.)
>>>>>
>>>>> OMG....Could you try this ? 
>>>>>
>>>> No luck, the same bug still exists. :(
>>>>
>>> This is a little fixed one..
>>>
>> I tried the patch, but it doesn't solve the problem..
>>
> Hmm.. Can you catch "pfn" of troublesome page_cgroup ?
> By patch like this ?
> 

I got what you want:

pc c1d589dc pc->page 00000000 page c105f67c pfn 1d5b
...
pc c1d589f0 pc->page 00000000 page c105f6b0 pfn 1d5c

> ==
> Index: linux-2.6.27/mm/memcontrol.c
> ===================================================================
> --- linux-2.6.27.orig/mm/memcontrol.c
> +++ linux-2.6.27/mm/memcontrol.c
> @@ -544,6 +544,10 @@ static int mem_cgroup_charge_common(stru
>  
>  		goto done;
>  	}
> +
> +	printk(KERN_DEBUG "pc %p pc->page %p page %p pfn %lx\n",
> +			pc, pc->page, page, page_to_pfn(page));
> +	BUG_ON(!pc->page);
>  	pc->mem_cgroup = mem;
>  	/*
>  	 * If a page is accounted as a page cache, insert to inactive list.
> 
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  9:54                               ` Li Zefan
@ 2008-10-21 10:14                                 ` KAMEZAWA Hiroyuki
  2008-10-21 10:57                                   ` Li Zefan
  2008-10-21 10:58                                   ` [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000 Balbir Singh
  0 siblings, 2 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 10:14 UTC (permalink / raw)
  To: Li Zefan; +Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 17:54:28 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 17:13:20 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > 
> >> KAMEZAWA Hiroyuki wrote:
> >>> On Tue, 21 Oct 2008 16:35:09 +0800
> >>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> >>>
> >>>> KAMEZAWA Hiroyuki wrote:
> >>>>> On Tue, 21 Oct 2008 15:21:07 +0800
> >>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> >>>>>> dmesg is attached.
> >>>>>>
> >>>>> Thanks....I think I caught some. (added Mel Gorman to CC:)
> >>>>>
> >>>>> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> >>>>>
> >>>>> So, If there is a hole between zone, node->spanned_pages doesn't mean
> >>>>> length of node's memmap....(then, some hole can be skipped.)
> >>>>>
> >>>>> OMG....Could you try this ? 
> >>>>>
> >>>> No luck, the same bug still exists. :(
> >>>>
> >>> This is a little fixed one..
> >>>
> >> I tried the patch, but it doesn't solve the problem..
> >>
> > Hmm.. Can you catch "pfn" of troublesome page_cgroup ?
> > By patch like this ?
> > 
> 
> I got what you want:
> 
> pc c1d589dc pc->page 00000000 page c105f67c pfn 1d5b
> ...
> pc c1d589f0 pc->page 00000000 page c105f6b0 pfn 1d5c
>
Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
==
 Normal   0x00001000 -> 0x000373fe
==
This is apparently in the range of page_cgroup initialization.
(if pgdat->node_page_cgroup is initialized correctly...)

I think writes to page_cgroup->page happen only at initialization.
Hmm ? Not an initialization failure but corruption ?

What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?


Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21  9:41                             ` KAMEZAWA Hiroyuki
@ 2008-10-21 10:15                               ` Daisuke Nishimura
  0 siblings, 0 replies; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-21 10:15 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: nishimura, Li Zefan, balbir, Paul Menage, linux-mm, mel

On Tue, 21 Oct 2008 18:41:38 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 21 Oct 2008 18:33:18 +0900
> Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:
> 
> > On Tue, 21 Oct 2008 17:57:35 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > On Tue, 21 Oct 2008 16:35:09 +0800
> > > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > 
> > > > KAMEZAWA Hiroyuki wrote:
> > > > > On Tue, 21 Oct 2008 15:21:07 +0800
> > > > > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > > >> dmesg is attached.
> > > > >>
> > > > > Thanks....I think I caught some. (added Mel Gorman to CC:)
> > > > > 
> > > > > NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> > > > > 
> > > > > So, If there is a hole between zone, node->spanned_pages doesn't mean
> > > > > length of node's memmap....(then, some hole can be skipped.)
> > > > > 
> > > > > OMG....Could you try this ? 
> > > > > 
> > > > 
> > > > No luck, the same bug still exists. :(
> > > > 
> > > This is a little fixed one..
> > > 
> > I can reproduce a similar problem(hang on boot) on 2.6.27-git9,
> > but this patch doesn't help either on my environment...
> > 
> > I attach a console log(I've not seen NULL pointer dereference yet).
> > 
> > 
> Thanks....boots well if cgroup_disable=memory ?
> 
Hum.. "cgroup_disable=memory" doesn't work either in my environment...

Maybe, I'm hitting a different problem.


Daisuke Nishimura.

> Thanks,
> -Kame
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 10:14                                 ` KAMEZAWA Hiroyuki
@ 2008-10-21 10:57                                   ` Li Zefan
  2008-10-21 11:00                                     ` KAMEZAWA Hiroyuki
                                                       ` (2 more replies)
  2008-10-21 10:58                                   ` [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000 Balbir Singh
  1 sibling, 3 replies; 60+ messages in thread
From: Li Zefan @ 2008-10-21 10:57 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel, Ingo Molnar

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 17:54:28 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Tue, 21 Oct 2008 17:13:20 +0800
>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>
>>>> KAMEZAWA Hiroyuki wrote:
>>>>> On Tue, 21 Oct 2008 16:35:09 +0800
>>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>>>
>>>>>> KAMEZAWA Hiroyuki wrote:
>>>>>>> On Tue, 21 Oct 2008 15:21:07 +0800
>>>>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>>>>>> dmesg is attached.
>>>>>>>>
>>>>>>> Thanks....I think I caught some. (added Mel Gorman to CC:)
>>>>>>>
>>>>>>> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
>>>>>>>
>>>>>>> So, If there is a hole between zone, node->spanned_pages doesn't mean
>>>>>>> length of node's memmap....(then, some hole can be skipped.)
>>>>>>>
>>>>>>> OMG....Could you try this ? 
>>>>>>>
>>>>>> No luck, the same bug still exists. :(
>>>>>>
>>>>> This is a little fixed one..
>>>>>
>>>> I tried the patch, but it doesn't solve the problem..
>>>>
>>> Hmm.. Can you catch "pfn" of troublesome page_cgroup ?
>>> By patch like this ?
>>>
>> I got what you want:
>>
>> pc c1d589dc pc->page 00000000 page c105f67c pfn 1d5b
>> ...
>> pc c1d589f0 pc->page 00000000 page c105f6b0 pfn 1d5c
>>
> Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> ==
>  Normal   0x00001000 -> 0x000373fe
> ==
> This is appearently in the range of page_cgroup initialization.
> (if pgdat->node_page_cgroup is initalized correctly...)
> 
> I think write to page_cgroup->page happens only at initialization.
> Hmm ? not initilization failure but curruption ?
> 

Yes, corruption. I didn't find any information about an initialization failure.

> What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> 

So I did this change, and the box booted up without any problem.

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5d86550..82a30b1 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
 
 	table_size = sizeof(struct page_cgroup) * nr_pages;
 
-	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
-			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+	base = vmalloc_node(table_size, nid);
 	if (!base)
 		return -ENOMEM;
 	for (index = 0; index < nr_pages; index++) {

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 10:14                                 ` KAMEZAWA Hiroyuki
  2008-10-21 10:57                                   ` Li Zefan
@ 2008-10-21 10:58                                   ` Balbir Singh
  1 sibling, 0 replies; 60+ messages in thread
From: Balbir Singh @ 2008-10-21 10:58 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Li Zefan, Paul Menage, Daisuke Nishimura, linux-mm, mel

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 17:54:28 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Tue, 21 Oct 2008 17:13:20 +0800
>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>
>>>> KAMEZAWA Hiroyuki wrote:
>>>>> On Tue, 21 Oct 2008 16:35:09 +0800
>>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>>>
>>>>>> KAMEZAWA Hiroyuki wrote:
>>>>>>> On Tue, 21 Oct 2008 15:21:07 +0800
>>>>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>>>>>> dmesg is attached.
>>>>>>>>
>>>>>>> Thanks....I think I caught some. (added Mel Gorman to CC:)
>>>>>>>
>>>>>>> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
>>>>>>>
>>>>>>> So, If there is a hole between zone, node->spanned_pages doesn't mean
>>>>>>> length of node's memmap....(then, some hole can be skipped.)
>>>>>>>
>>>>>>> OMG....Could you try this ? 
>>>>>>>
>>>>>> No luck, the same bug still exists. :(
>>>>>>
>>>>> This is a little fixed one..
>>>>>
>>>> I tried the patch, but it doesn't solve the problem..
>>>>
>>> Hmm.. Can you catch "pfn" of troublesome page_cgroup ?
>>> By patch like this ?
>>>
>> I got what you want:
>>
>> pc c1d589dc pc->page 00000000 page c105f67c pfn 1d5b
>> ...
>> pc c1d589f0 pc->page 00000000 page c105f6b0 pfn 1d5c
>>
> Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> ==
>  Normal   0x00001000 -> 0x000373fe
> ==
> This is appearently in the range of page_cgroup initialization.
> (if pgdat->node_page_cgroup is initalized correctly...)
> 
> I think write to page_cgroup->page happens only at initialization.
> Hmm ? not initilization failure but curruption ?
> 


0x3bff0 = 245744
Looking at dmesg, we used 4914560 for page_cgroup, page_cgroup is 20 bytes, so
the number of page_cgroups we have = 245728. The difference is 16

That would make sense, if we look at

early_node_map[2] active PFN ranges
    0: 0x00000010 -> 0x0000009f
    0: 0x00000100 -> 0x0003bff0

Node 0 starts from pfn 0x10 == 16.

OK, so we were able to allocate the page_cgroup, so either

1) Like Kamezawa suggested, there was corruption (very unlikely)
2) pfn_to_page() returned NULL
3) We did not initialize a certain set of page_cgroups

> What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?


-- 
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 10:57                                   ` Li Zefan
@ 2008-10-21 11:00                                     ` KAMEZAWA Hiroyuki
  2008-10-21 11:09                                       ` KAMEZAWA Hiroyuki
  2008-10-21 11:13                                     ` KAMEZAWA Hiroyuki
  2008-10-21 11:19                                     ` Ingo Molnar
  2 siblings, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 11:00 UTC (permalink / raw)
  To: Li Zefan
  Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel, Ingo Molnar

On Tue, 21 Oct 2008 18:57:08 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 17:54:28 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > 
> >> KAMEZAWA Hiroyuki wrote:
> >>> On Tue, 21 Oct 2008 17:13:20 +0800
> >>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> >>>
> >>>> KAMEZAWA Hiroyuki wrote:
> >>>>> On Tue, 21 Oct 2008 16:35:09 +0800
> >>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> >>>>>
> >>>>>> KAMEZAWA Hiroyuki wrote:
> >>>>>>> On Tue, 21 Oct 2008 15:21:07 +0800
> >>>>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> >>>>>>>> dmesg is attached.
> >>>>>>>>
> >>>>>>> Thanks....I think I caught some. (added Mel Gorman to CC:)
> >>>>>>>
> >>>>>>> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> >>>>>>>
> >>>>>>> So, If there is a hole between zone, node->spanned_pages doesn't mean
> >>>>>>> length of node's memmap....(then, some hole can be skipped.)
> >>>>>>>
> >>>>>>> OMG....Could you try this ? 
> >>>>>>>
> >>>>>> No luck, the same bug still exists. :(
> >>>>>>
> >>>>> This is a little fixed one..
> >>>>>
> >>>> I tried the patch, but it doesn't solve the problem..
> >>>>
> >>> Hmm.. Can you catch "pfn" of troublesome page_cgroup ?
> >>> By patch like this ?
> >>>
> >> I got what you want:
> >>
> >> pc c1d589dc pc->page 00000000 page c105f67c pfn 1d5b
> >> ...
> >> pc c1d589f0 pc->page 00000000 page c105f6b0 pfn 1d5c
> >>
> > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > ==
> >  Normal   0x00001000 -> 0x000373fe
> > ==
> > This is appearently in the range of page_cgroup initialization.
> > (if pgdat->node_page_cgroup is initalized correctly...)
> > 
> > I think write to page_cgroup->page happens only at initialization.
> > Hmm ? not initilization failure but curruption ?
> > 
> 
> Yes, curruption. I didn't find informatation about initialization failure.
> 
> > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > 
> 
> So I did this change, and the box booted up without any problem.
> 
Ok, I wonder if the bootmem we got is freed...or reset to 0.

Hmm..thank you very much!

Thanks,
-Kame

> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 5d86550..82a30b1 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
>  
>  	table_size = sizeof(struct page_cgroup) * nr_pages;
>  
> -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> +	base = vmalloc_node(table_size, nid);
>  	if (!base)
>  		return -ENOMEM;
>  	for (index = 0; index < nr_pages; index++) {
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:00                                     ` KAMEZAWA Hiroyuki
@ 2008-10-21 11:09                                       ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 11:09 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Li Zefan, balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel,
	Ingo Molnar

On Tue, 21 Oct 2008 20:00:53 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Tue, 21 Oct 2008 18:57:08 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
> > KAMEZAWA Hiroyuki wrote:
> > > On Tue, 21 Oct 2008 17:54:28 +0800
> > > Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > 
> > >> KAMEZAWA Hiroyuki wrote:
> > >>> On Tue, 21 Oct 2008 17:13:20 +0800
> > >>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> > >>>
> > >>>> KAMEZAWA Hiroyuki wrote:
> > >>>>> On Tue, 21 Oct 2008 16:35:09 +0800
> > >>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> > >>>>>
> > >>>>>> KAMEZAWA Hiroyuki wrote:
> > >>>>>>> On Tue, 21 Oct 2008 15:21:07 +0800
> > >>>>>>> Li Zefan <lizf@cn.fujitsu.com> wrote:
> > >>>>>>>> dmesg is attached.
> > >>>>>>>>
> > >>>>>>> Thanks....I think I caught some. (added Mel Gorman to CC:)
> > >>>>>>>
> > >>>>>>> NODE_DATA(nid)->spanned_pages just means sum of zone->spanned_pages in node.
> > >>>>>>>
> > >>>>>>> So, If there is a hole between zone, node->spanned_pages doesn't mean
> > >>>>>>> length of node's memmap....(then, some hole can be skipped.)
> > >>>>>>>
> > >>>>>>> OMG....Could you try this ? 
> > >>>>>>>
> > >>>>>> No luck, the same bug still exists. :(
> > >>>>>>
> > >>>>> This is a little fixed one..
> > >>>>>
> > >>>> I tried the patch, but it doesn't solve the problem..
> > >>>>
> > >>> Hmm.. Can you catch "pfn" of troublesome page_cgroup ?
> > >>> By patch like this ?
> > >>>
> > >> I got what you want:
> > >>
> > >> pc c1d589dc pc->page 00000000 page c105f67c pfn 1d5b
> > >> ...
> > >> pc c1d589f0 pc->page 00000000 page c105f6b0 pfn 1d5c
> > >>
> > > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > > ==
> > >  Normal   0x00001000 -> 0x000373fe
> > > ==
> > > This is appearently in the range of page_cgroup initialization.
> > > (if pgdat->node_page_cgroup is initalized correctly...)
> > > 
> > > I think write to page_cgroup->page happens only at initialization.
> > > Hmm ? not initilization failure but curruption ?
> > > 
> > 
> > Yes, curruption. I didn't find informatation about initialization failure.
> > 
> > > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > > 
> > 
> > So I did this change, and the box booted up without any problem.
> > 
> Ok, I wonder bootmem we got is freed...or reset to 0.
> 
> Hmm..thank you very much!
> 
Umm....maybe this is my mistake and I can't use alloc_bootmem() here ?
(meaning I should use alloc_pages() or vmalloc() here.)

?
-Kame

> Thanks,
> -Kame
> 
> > diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> > index 5d86550..82a30b1 100644
> > --- a/mm/page_cgroup.c
> > +++ b/mm/page_cgroup.c
> > @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> >  
> >  	table_size = sizeof(struct page_cgroup) * nr_pages;
> >  
> > -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > +	base = vmalloc_node(table_size, nid);
> >  	if (!base)
> >  		return -ENOMEM;
> >  	for (index = 0; index < nr_pages; index++) {
> > 
> > 
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 10:57                                   ` Li Zefan
  2008-10-21 11:00                                     ` KAMEZAWA Hiroyuki
@ 2008-10-21 11:13                                     ` KAMEZAWA Hiroyuki
  2008-10-21 11:19                                     ` Ingo Molnar
  2 siblings, 0 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 11:13 UTC (permalink / raw)
  To: Li Zefan
  Cc: balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel, Ingo Molnar

On Tue, 21 Oct 2008 18:57:08 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
> So I did this change, and the box booted up without any problem.

Li-san, could you send this patch as a fix ?
It seems alloc_node_page_cgroup() is too late in the init path to call alloc_bootmem().

Thanks,
-kame

> 
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 5d86550..82a30b1 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
>  
>  	table_size = sizeof(struct page_cgroup) * nr_pages;
>  
> -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> +	base = vmalloc_node(table_size, nid);
>  	if (!base)
>  		return -ENOMEM;
>  	for (index = 0; index < nr_pages; index++) {
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 10:57                                   ` Li Zefan
  2008-10-21 11:00                                     ` KAMEZAWA Hiroyuki
  2008-10-21 11:13                                     ` KAMEZAWA Hiroyuki
@ 2008-10-21 11:19                                     ` Ingo Molnar
  2008-10-21 11:23                                       ` KAMEZAWA Hiroyuki
  2 siblings, 1 reply; 60+ messages in thread
From: Ingo Molnar @ 2008-10-21 11:19 UTC (permalink / raw)
  To: Li Zefan
  Cc: KAMEZAWA Hiroyuki, balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

* Li Zefan <lizf@cn.fujitsu.com> wrote:

> > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > ==
> >  Normal   0x00001000 -> 0x000373fe
> > ==
> > This is appearently in the range of page_cgroup initialization.
> > (if pgdat->node_page_cgroup is initalized correctly...)
> > 
> > I think write to page_cgroup->page happens only at initialization.
> > Hmm ? not initilization failure but curruption ?
> > 
> 
> Yes, curruption. I didn't find informatation about initialization failure.
> 
> > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > 
> 
> So I did this change, and the box booted up without any problem.
> 
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 5d86550..82a30b1 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
>  
>  	table_size = sizeof(struct page_cgroup) * nr_pages;
>  
> -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> +	base = vmalloc_node(table_size, nid);
>  	if (!base)
>  		return -ENOMEM;

i have this:

  CONFIG_FAILSLAB=y
  CONFIG_FAIL_PAGE_ALLOC=y
  # CONFIG_FAIL_MAKE_REQUEST is not set
  CONFIG_FAIL_IO_TIMEOUT=y

so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
and this code continued silently? vmalloc_node() probably is more 
aggressive about allocating memory.

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:19                                     ` Ingo Molnar
@ 2008-10-21 11:23                                       ` KAMEZAWA Hiroyuki
  2008-10-21 11:28                                         ` Ingo Molnar
  2008-10-21 11:29                                         ` Balbir Singh
  0 siblings, 2 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 11:23 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 13:19:51 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Li Zefan <lizf@cn.fujitsu.com> wrote:
> 
> > > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > > ==
> > >  Normal   0x00001000 -> 0x000373fe
> > > ==
> > > This is appearently in the range of page_cgroup initialization.
> > > (if pgdat->node_page_cgroup is initalized correctly...)
> > > 
> > > I think write to page_cgroup->page happens only at initialization.
> > > Hmm ? not initilization failure but curruption ?
> > > 
> > 
> > Yes, curruption. I didn't find informatation about initialization failure.
> > 
> > > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > > 
> > 
> > So I did this change, and the box booted up without any problem.
> > 
> > diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> > index 5d86550..82a30b1 100644
> > --- a/mm/page_cgroup.c
> > +++ b/mm/page_cgroup.c
> > @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> >  
> >  	table_size = sizeof(struct page_cgroup) * nr_pages;
> >  
> > -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > +	base = vmalloc_node(table_size, nid);
> >  	if (!base)
> >  		return -ENOMEM;
> 
> i have this:
> 
>   CONFIG_FAILSLAB=y
>   CONFIG_FAIL_PAGE_ALLOC=y
>   # CONFIG_FAIL_MAKE_REQUEST is not set
>   CONFIG_FAIL_IO_TIMEOUT=y
> 
> so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
> and this code continued silently? vmalloc_node() probably is more 
> agressive about allocating memory.
> 
Sorry. I think I cannot use alloc_bootmem() at this point because
it's too late in the init path. (we can use the usual page allocator)
So, just replacing alloc_bootmem() with vmalloc_node() is a fix....

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:23                                       ` KAMEZAWA Hiroyuki
@ 2008-10-21 11:28                                         ` Ingo Molnar
  2008-10-21 11:32                                           ` KAMEZAWA Hiroyuki
  2008-10-21 11:29                                         ` Balbir Singh
  1 sibling, 1 reply; 60+ messages in thread
From: Ingo Molnar @ 2008-10-21 11:28 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Li Zefan, balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Tue, 21 Oct 2008 13:19:51 +0200
> Ingo Molnar <mingo@elte.hu> wrote:
> 
> > 
> > * Li Zefan <lizf@cn.fujitsu.com> wrote:
> > 
> > > > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > > > ==
> > > >  Normal   0x00001000 -> 0x000373fe
> > > > ==
> > > > This is appearently in the range of page_cgroup initialization.
> > > > (if pgdat->node_page_cgroup is initalized correctly...)
> > > > 
> > > > I think write to page_cgroup->page happens only at initialization.
> > > > Hmm ? not initilization failure but curruption ?
> > > > 
> > > 
> > > Yes, curruption. I didn't find informatation about initialization failure.
> > > 
> > > > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > > > 
> > > 
> > > So I did this change, and the box booted up without any problem.
> > > 
> > > diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> > > index 5d86550..82a30b1 100644
> > > --- a/mm/page_cgroup.c
> > > +++ b/mm/page_cgroup.c
> > > @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> > >  
> > >  	table_size = sizeof(struct page_cgroup) * nr_pages;
> > >  
> > > -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > > -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > > +	base = vmalloc_node(table_size, nid);
> > >  	if (!base)
> > >  		return -ENOMEM;
> > 
> > i have this:
> > 
> >   CONFIG_FAILSLAB=y
> >   CONFIG_FAIL_PAGE_ALLOC=y
> >   # CONFIG_FAIL_MAKE_REQUEST is not set
> >   CONFIG_FAIL_IO_TIMEOUT=y
> > 
> > so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
> > and this code continued silently? vmalloc_node() probably is more 
> > agressive about allocating memory.
> > 
> Sorry. I think I cannot use alloc_bootmem() at this point because
> it's too late in init-path. (we can use usual page allocator)
> So, just replacing alloc_bootmem() with vmalloc_node() is a fix....

okay. So what is needed for the crash is:

 CONFIG_CGROUPS=y
 CONFIG_CGROUP_MEM_RES_CTLR=y

right?

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:23                                       ` KAMEZAWA Hiroyuki
  2008-10-21 11:28                                         ` Ingo Molnar
@ 2008-10-21 11:29                                         ` Balbir Singh
  2008-10-21 11:34                                           ` KAMEZAWA Hiroyuki
  2008-10-21 12:00                                           ` KAMEZAWA Hiroyuki
  1 sibling, 2 replies; 60+ messages in thread
From: Balbir Singh @ 2008-10-21 11:29 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Ingo Molnar, Li Zefan, Paul Menage, Daisuke Nishimura, linux-mm, mel

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 13:19:51 +0200
> Ingo Molnar <mingo@elte.hu> wrote:
> 
>> * Li Zefan <lizf@cn.fujitsu.com> wrote:
>>
>>>> Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
>>>> ==
>>>>  Normal   0x00001000 -> 0x000373fe
>>>> ==
>>>> This is appearently in the range of page_cgroup initialization.
>>>> (if pgdat->node_page_cgroup is initalized correctly...)
>>>>
>>>> I think write to page_cgroup->page happens only at initialization.
>>>> Hmm ? not initilization failure but curruption ?
>>>>
>>> Yes, curruption. I didn't find informatation about initialization failure.
>>>
>>>> What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
>>>>
>>> So I did this change, and the box booted up without any problem.
>>>
>>> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
>>> index 5d86550..82a30b1 100644
>>> --- a/mm/page_cgroup.c
>>> +++ b/mm/page_cgroup.c
>>> @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
>>>  
>>>  	table_size = sizeof(struct page_cgroup) * nr_pages;
>>>  
>>> -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
>>> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>>> +	base = vmalloc_node(table_size, nid);
>>>  	if (!base)
>>>  		return -ENOMEM;
>> i have this:
>>
>>   CONFIG_FAILSLAB=y
>>   CONFIG_FAIL_PAGE_ALLOC=y
>>   # CONFIG_FAIL_MAKE_REQUEST is not set
>>   CONFIG_FAIL_IO_TIMEOUT=y
>>
>> so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
>> and this code continued silently? vmalloc_node() probably is more 
>> agressive about allocating memory.
>>
> Sorry. I think I cannot use alloc_bootmem() at this point because
> it's too late in init-path. (we can use usual page allocator)
> So, just replacing alloc_bootmem() with vmalloc_node() is a fix....

Kamezawa-San,

I would prefer to use alloc_bootmem() instead of vmalloc_node(). Maybe we can
shift cgroups, so that we use early_init for allocating page_cgroups.  What do
you think?

-- 
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:28                                         ` Ingo Molnar
@ 2008-10-21 11:32                                           ` KAMEZAWA Hiroyuki
  2008-10-21 11:38                                             ` Ingo Molnar
  2008-10-22  2:13                                             ` Daisuke Nishimura
  0 siblings, 2 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 11:32 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Li Zefan, balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 13:28:43 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> 
> > On Tue, 21 Oct 2008 13:19:51 +0200
> > Ingo Molnar <mingo@elte.hu> wrote:
> > 
> > > 
> > > * Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > 
> > > > > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > > > > ==
> > > > >  Normal   0x00001000 -> 0x000373fe
> > > > > ==
> > > > > This is appearently in the range of page_cgroup initialization.
> > > > > (if pgdat->node_page_cgroup is initalized correctly...)
> > > > > 
> > > > > I think write to page_cgroup->page happens only at initialization.
> > > > > Hmm ? not initilization failure but curruption ?
> > > > > 
> > > > 
> > > > Yes, curruption. I didn't find informatation about initialization failure.
> > > > 
> > > > > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > > > > 
> > > > 
> > > > So I did this change, and the box booted up without any problem.
> > > > 
> > > > diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> > > > index 5d86550..82a30b1 100644
> > > > --- a/mm/page_cgroup.c
> > > > +++ b/mm/page_cgroup.c
> > > > @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> > > >  
> > > >  	table_size = sizeof(struct page_cgroup) * nr_pages;
> > > >  
> > > > -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > > > -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > > > +	base = vmalloc_node(table_size, nid);
> > > >  	if (!base)
> > > >  		return -ENOMEM;
> > > 
> > > i have this:
> > > 
> > >   CONFIG_FAILSLAB=y
> > >   CONFIG_FAIL_PAGE_ALLOC=y
> > >   # CONFIG_FAIL_MAKE_REQUEST is not set
> > >   CONFIG_FAIL_IO_TIMEOUT=y
> > > 
> > > so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
> > > and this code continued silently? vmalloc_node() probably is more 
> > > agressive about allocating memory.
> > > 
> > Sorry. I think I cannot use alloc_bootmem() at this point because
> > it's too late in init-path. (we can use usual page allocator)
> > So, just replacing alloc_bootmem() with vmalloc_node() is a fix....
> 
> okay. So what is needed for the crash is:
> 
>  CONFIG_CGROUPS=y
>  CONFIG_CGROUP_MEM_RES_CTLR=y
> 
Yes, maybe.  I think you can avoid the crash with the cgroup_disable=memory boot
option. But Nishimura reports he cannot boot with cgroup_disable=memory, so I wonder
whether there is some other problem as well. Either way, calling alloc_bootmem()
here should be avoided.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:29                                         ` Balbir Singh
@ 2008-10-21 11:34                                           ` KAMEZAWA Hiroyuki
  2008-10-21 12:00                                           ` KAMEZAWA Hiroyuki
  1 sibling, 0 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 11:34 UTC (permalink / raw)
  To: balbir
  Cc: Ingo Molnar, Li Zefan, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 16:59:28 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 13:19:51 +0200
> > Ingo Molnar <mingo@elte.hu> wrote:
> > 
> >> * Li Zefan <lizf@cn.fujitsu.com> wrote:
> >>
> >>>> Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> >>>> ==
> >>>>  Normal   0x00001000 -> 0x000373fe
> >>>> ==
> >>>> This is appearently in the range of page_cgroup initialization.
> >>>> (if pgdat->node_page_cgroup is initalized correctly...)
> >>>>
> >>>> I think write to page_cgroup->page happens only at initialization.
> >>>> Hmm ? not initilization failure but curruption ?
> >>>>
> >>> Yes, curruption. I didn't find informatation about initialization failure.
> >>>
> >>>> What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> >>>>
> >>> So I did this change, and the box booted up without any problem.
> >>>
> >>> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> >>> index 5d86550..82a30b1 100644
> >>> --- a/mm/page_cgroup.c
> >>> +++ b/mm/page_cgroup.c
> >>> @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> >>>  
> >>>  	table_size = sizeof(struct page_cgroup) * nr_pages;
> >>>  
> >>> -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> >>> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> >>> +	base = vmalloc_node(table_size, nid);
> >>>  	if (!base)
> >>>  		return -ENOMEM;
> >> i have this:
> >>
> >>   CONFIG_FAILSLAB=y
> >>   CONFIG_FAIL_PAGE_ALLOC=y
> >>   # CONFIG_FAIL_MAKE_REQUEST is not set
> >>   CONFIG_FAIL_IO_TIMEOUT=y
> >>
> >> so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
> >> and this code continued silently? vmalloc_node() probably is more 
> >> agressive about allocating memory.
> >>
> > Sorry. I think I cannot use alloc_bootmem() at this point because
> > it's too late in init-path. (we can use usual page allocator)
> > So, just replacing alloc_bootmem() with vmalloc_node() is a fix....
> 
> Kamezawa-San,
> 
> I would prefer to use alloc_bootmem() instead of vmalloc_node(). May be we can
> shift cgroups, so that we use early_init for allocating page_cgroups.  What do
> you think?
> 
Yes, I think so. I think we'll have a chance to modify this later, once we have
a stable kernel.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:32                                           ` KAMEZAWA Hiroyuki
@ 2008-10-21 11:38                                             ` Ingo Molnar
  2008-10-22  2:13                                             ` Daisuke Nishimura
  1 sibling, 0 replies; 60+ messages in thread
From: Ingo Molnar @ 2008-10-21 11:38 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Li Zefan, balbir, Paul Menage, Daisuke Nishimura, linux-mm, mel

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Tue, 21 Oct 2008 13:28:43 +0200
> Ingo Molnar <mingo@elte.hu> wrote:
> 
> > 
> > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > 
> > > On Tue, 21 Oct 2008 13:19:51 +0200
> > > Ingo Molnar <mingo@elte.hu> wrote:
> > > 
> > > > 
> > > > * Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > > 
> > > > > > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > > > > > ==
> > > > > >  Normal   0x00001000 -> 0x000373fe
> > > > > > ==
> > > > > > This is appearently in the range of page_cgroup initialization.
> > > > > > (if pgdat->node_page_cgroup is initalized correctly...)
> > > > > > 
> > > > > > I think write to page_cgroup->page happens only at initialization.
> > > > > > Hmm ? not initilization failure but curruption ?
> > > > > > 
> > > > > 
> > > > > Yes, curruption. I didn't find informatation about initialization failure.
> > > > > 
> > > > > > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > > > > > 
> > > > > 
> > > > > So I did this change, and the box booted up without any problem.
> > > > > 
> > > > > diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> > > > > index 5d86550..82a30b1 100644
> > > > > --- a/mm/page_cgroup.c
> > > > > +++ b/mm/page_cgroup.c
> > > > > @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> > > > >  
> > > > >  	table_size = sizeof(struct page_cgroup) * nr_pages;
> > > > >  
> > > > > -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > > > > -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > > > > +	base = vmalloc_node(table_size, nid);
> > > > >  	if (!base)
> > > > >  		return -ENOMEM;
> > > > 
> > > > i have this:
> > > > 
> > > >   CONFIG_FAILSLAB=y
> > > >   CONFIG_FAIL_PAGE_ALLOC=y
> > > >   # CONFIG_FAIL_MAKE_REQUEST is not set
> > > >   CONFIG_FAIL_IO_TIMEOUT=y
> > > > 
> > > > so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
> > > > and this code continued silently? vmalloc_node() probably is more 
> > > > agressive about allocating memory.
> > > > 
> > > Sorry. I think I cannot use alloc_bootmem() at this point because
> > > it's too late in init-path. (we can use usual page allocator)
> > > So, just replacing alloc_bootmem() with vmalloc_node() is a fix....
> > 
> > okay. So what is needed for the crash is:
> > 
> >  CONFIG_CGROUPS=y
> >  CONFIG_CGROUP_MEM_RES_CTLR=y
> > 
>
> yes. maybe.  I think you can avoid crash by cgroup_disable=memory boot 
> option. But Nishimura reports he cannot boot with 
> cgroup_disable=memory. So I wonder there may be something other. But 
> calling alloc_bootmem() here should be avoided.

the patch did the trick for my test-setup - no more crash during bootup.

Tested-by: Ingo Molnar <mingo@elte.hu>

(but i cannot vouch for the completeness of the fix, other than observe 
that it boots and works fine.)

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:29                                         ` Balbir Singh
  2008-10-21 11:34                                           ` KAMEZAWA Hiroyuki
@ 2008-10-21 12:00                                           ` KAMEZAWA Hiroyuki
  2008-10-21 12:14                                             ` Balbir Singh
  1 sibling, 1 reply; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 12:00 UTC (permalink / raw)
  To: balbir
  Cc: Ingo Molnar, Li Zefan, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 16:59:28 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Tue, 21 Oct 2008 13:19:51 +0200
> > Ingo Molnar <mingo@elte.hu> wrote:
> > 
> >> * Li Zefan <lizf@cn.fujitsu.com> wrote:
> >>
> >>>> Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> >>>> ==
> >>>>  Normal   0x00001000 -> 0x000373fe
> >>>> ==
> >>>> This is appearently in the range of page_cgroup initialization.
> >>>> (if pgdat->node_page_cgroup is initalized correctly...)
> >>>>
> >>>> I think write to page_cgroup->page happens only at initialization.
> >>>> Hmm ? not initilization failure but curruption ?
> >>>>
> >>> Yes, curruption. I didn't find informatation about initialization failure.
> >>>
> >>>> What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> >>>>
> >>> So I did this change, and the box booted up without any problem.
> >>>
> >>> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> >>> index 5d86550..82a30b1 100644
> >>> --- a/mm/page_cgroup.c
> >>> +++ b/mm/page_cgroup.c
> >>> @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> >>>  
> >>>  	table_size = sizeof(struct page_cgroup) * nr_pages;
> >>>  
> >>> -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> >>> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> >>> +	base = vmalloc_node(table_size, nid);
> >>>  	if (!base)
> >>>  		return -ENOMEM;
> >> i have this:
> >>
> >>   CONFIG_FAILSLAB=y
> >>   CONFIG_FAIL_PAGE_ALLOC=y
> >>   # CONFIG_FAIL_MAKE_REQUEST is not set
> >>   CONFIG_FAIL_IO_TIMEOUT=y
> >>
> >> so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
> >> and this code continued silently? vmalloc_node() probably is more 
> >> agressive about allocating memory.
> >>
> > Sorry. I think I cannot use alloc_bootmem() at this point because
> > it's too late in init-path. (we can use usual page allocator)
> > So, just replacing alloc_bootmem() with vmalloc_node() is a fix....
> 
> Kamezawa-San,
> 
> I would prefer to use alloc_bootmem() instead of vmalloc_node(). May be we can
> shift cgroups, so that we use early_init for allocating page_cgroups.  What do
> you think?
> 
I have an idea and can maybe send a patch soon. I'm now looking for an x86-32 box.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 12:00                                           ` KAMEZAWA Hiroyuki
@ 2008-10-21 12:14                                             ` Balbir Singh
  2008-10-21 13:09                                               ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 60+ messages in thread
From: Balbir Singh @ 2008-10-21 12:14 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Ingo Molnar, Li Zefan, Paul Menage, Daisuke Nishimura, linux-mm, mel

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 16:59:28 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Tue, 21 Oct 2008 13:19:51 +0200
>>> Ingo Molnar <mingo@elte.hu> wrote:
>>>
>>>> * Li Zefan <lizf@cn.fujitsu.com> wrote:
>>>>
>>>>>> Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
>>>>>> ==
>>>>>>  Normal   0x00001000 -> 0x000373fe
>>>>>> ==
>>>>>> This is appearently in the range of page_cgroup initialization.
>>>>>> (if pgdat->node_page_cgroup is initalized correctly...)
>>>>>>
>>>>>> I think write to page_cgroup->page happens only at initialization.
>>>>>> Hmm ? not initilization failure but curruption ?
>>>>>>
>>>>> Yes, curruption. I didn't find informatation about initialization failure.
>>>>>
>>>>>> What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
>>>>>>
>>>>> So I did this change, and the box booted up without any problem.
>>>>>
>>>>> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
>>>>> index 5d86550..82a30b1 100644
>>>>> --- a/mm/page_cgroup.c
>>>>> +++ b/mm/page_cgroup.c
>>>>> @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
>>>>>  
>>>>>  	table_size = sizeof(struct page_cgroup) * nr_pages;
>>>>>  
>>>>> -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
>>>>> -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>>>>> +	base = vmalloc_node(table_size, nid);
>>>>>  	if (!base)
>>>>>  		return -ENOMEM;
>>>> i have this:
>>>>
>>>>   CONFIG_FAILSLAB=y
>>>>   CONFIG_FAIL_PAGE_ALLOC=y
>>>>   # CONFIG_FAIL_MAKE_REQUEST is not set
>>>>   CONFIG_FAIL_IO_TIMEOUT=y
>>>>
>>>> so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
>>>> and this code continued silently? vmalloc_node() probably is more 
>>>> agressive about allocating memory.
>>>>
>>> Sorry. I think I cannot use alloc_bootmem() at this point because
>>> it's too late in init-path. (we can use usual page allocator)
>>> So, just replacing alloc_bootmem() with vmalloc_node() is a fix....
>> Kamezawa-San,
>>
>> I would prefer to use alloc_bootmem() instead of vmalloc_node(). May be we can
>> shift cgroups, so that we use early_init for allocating page_cgroups.  What do
>> you think?
>>
> I got an idea and maybe can send a patch soon. I'm now finding x86-32 box..

Please send it to me, I am able to reproduce the problem with my kvm setup on my
32 bit system. I can do a quick test/verification for you.

-- 
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 12:14                                             ` Balbir Singh
@ 2008-10-21 13:09                                               ` KAMEZAWA Hiroyuki
  2008-10-21 13:25                                                 ` Balbir Singh
  2008-10-21 13:34                                                 ` Balbir Singh
  0 siblings, 2 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-21 13:09 UTC (permalink / raw)
  To: balbir
  Cc: Ingo Molnar, Li Zefan, Paul Menage, Daisuke Nishimura, linux-mm, mel

On Tue, 21 Oct 2008 17:44:40 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > I got an idea and maybe can send a patch soon. I'm now finding x86-32 box..
> 
> Please send it to me, I am able to reproduce the problem with my kvm setup on my
> 32 bit system. I can do a quick test/verification for you.
> 
Thanks. How about this? Testing on x86-64 is done.
-Kame
==



page_cgroup_init() is called from mem_cgroup_create() (when the root
cgroup is created). But at this point, we cannot call alloc_bootmem()
(and this caused a panic at boot).

This patch moves page_cgroup_init() to init/main.c.

The timetable is as follows:
==
  parse_args(). # we can trust mem_cgroup_subsys.disabled bit after this.
  ....
  cgroup_init_early()  # "early" init of cgroup.
  ....
  setup_arch()         # memmap is allocated.
  ...
  page_cgroup_init();
  mem_init();   # we cannot call alloc_bootmem after this.
  ....
  cgroup_init() # mem_cgroup is initialized.
==

Before page_cgroup_init(), mem_map must be initialized. So, 
I added page_cgroup_init() to init/main.c directly.

(*) Maybe this is not very clean, but cgroup_init_early() is too early,
    and in cgroup_init() we would have to use vmalloc() instead of
    alloc_bootmem(). The vmalloc area on x86-32 is a limited resource, so
    we should avoid vmalloc() on x86-32. So, we want to call alloc_bootmem()
    from a suitable place.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

 include/linux/page_cgroup.h |    1 +
 init/main.c                 |    2 ++
 mm/memcontrol.c             |    1 -
 mm/page_cgroup.c            |   35 ++++++++++++++++++++++++++++-------
 4 files changed, 31 insertions(+), 8 deletions(-)

Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -62,6 +62,7 @@
 #include <linux/signal.h>
 #include <linux/idr.h>
 #include <linux/ftrace.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -647,6 +648,7 @@ asmlinkage void __init start_kernel(void
 	vmalloc_init();
 	vfs_caches_init_early();
 	cpuset_init_early();
+	page_cgroup_init();
 	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
Index: linux-2.6/mm/memcontrol.c
===================================================================
--- linux-2.6.orig/mm/memcontrol.c
+++ linux-2.6/mm/memcontrol.c
@@ -1088,7 +1088,6 @@ mem_cgroup_create(struct cgroup_subsys *
 	int node;
 
 	if (unlikely((cont->parent) == NULL)) {
-		page_cgroup_init();
 		mem = &init_mem_cgroup;
 	} else {
 		mem = mem_cgroup_alloc();
Index: linux-2.6/include/linux/page_cgroup.h
===================================================================
--- linux-2.6.orig/include/linux/page_cgroup.h
+++ linux-2.6/include/linux/page_cgroup.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 #include <linux/bit_spinlock.h>
+
 /*
  * Page Cgroup can be considered as an extended mem_map.
  * A page_cgroup page is associated with every page descriptor. The
Index: linux-2.6/mm/page_cgroup.c
===================================================================
--- linux-2.6.orig/mm/page_cgroup.c
+++ linux-2.6/mm/page_cgroup.c
@@ -4,7 +4,12 @@
 #include <linux/bit_spinlock.h>
 #include <linux/page_cgroup.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/memory.h>
+#include <linux/cgroup.h>
+
+extern struct cgroup_subsys	mem_cgroup_subsys;
+
 
 static void __meminit
 __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -66,6 +71,9 @@ void __init page_cgroup_init(void)
 
 	int nid, fail;
 
+	if (mem_cgroup_subsys.disabled)
+		return;
+
 	for_each_online_node(nid)  {
 		fail = alloc_node_page_cgroup(nid);
 		if (fail)
@@ -106,9 +114,14 @@ int __meminit init_section_page_cgroup(u
 	nid = page_to_nid(pfn_to_page(pfn));
 
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-	base = kmalloc_node(table_size, GFP_KERNEL, nid);
-	if (!base)
-		base = vmalloc_node(table_size, nid);
+	if (slab_is_available()) {
+		base = kmalloc_node(table_size, GFP_KERNEL, nid);
+		if (!base)
+			base = vmalloc_node(table_size, nid);
+	} else {
+		base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
+				PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+	}
 
 	if (!base) {
 		printk(KERN_ERR "page cgroup allocation failure\n");
@@ -135,11 +148,16 @@ void __free_page_cgroup(unsigned long pf
 	if (!ms || !ms->page_cgroup)
 		return;
 	base = ms->page_cgroup + pfn;
-	ms->page_cgroup = NULL;
-	if (is_vmalloc_addr(base))
+	if (is_vmalloc_addr(base)) {
 		vfree(base);
-	else
-		kfree(base);
+		ms->page_cgroup = NULL;
+	} else {
+		struct page *page = virt_to_page(base);
+		if (!PageReserved(page)) { /* Is bootmem ? */
+			kfree(base);
+			ms->page_cgroup = NULL;
+		}
+	}
 }
 
 int online_page_cgroup(unsigned long start_pfn,
@@ -213,6 +231,9 @@ void __init page_cgroup_init(void)
 	unsigned long pfn;
 	int fail = 0;
 
+	if (mem_cgroup_subsys.disabled)
+		return;
+
 	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
 		if (!pfn_present(pfn))
 			continue;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 13:09                                               ` KAMEZAWA Hiroyuki
@ 2008-10-21 13:25                                                 ` Balbir Singh
  2008-10-21 13:34                                                 ` Balbir Singh
  1 sibling, 0 replies; 60+ messages in thread
From: Balbir Singh @ 2008-10-21 13:25 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Ingo Molnar, Li Zefan, Paul Menage, Daisuke Nishimura, linux-mm, mel

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 17:44:40 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>> I got an idea and maybe can send a patch soon. I'm now finding x86-32 box..
>> Please send it to me, I am able to reproduce the problem with my kvm setup on my
>> 32 bit system. I can do a quick test/verification for you.
>>
> Thanks. how about this ? test on x86-64 is done.
> -Kame
> ==

OK, I'll test it. Believe it or not, I was trying a similar patch, although not
as comprehensive.

-- 
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 13:09                                               ` KAMEZAWA Hiroyuki
  2008-10-21 13:25                                                 ` Balbir Singh
@ 2008-10-21 13:34                                                 ` Balbir Singh
  2008-10-21 13:44                                                   ` [memcg BUG] unable to handle kernel NULL pointer derefence at00000000 亀澤　寛之
  1 sibling, 1 reply; 60+ messages in thread
From: Balbir Singh @ 2008-10-21 13:34 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Ingo Molnar, Li Zefan, Paul Menage, Daisuke Nishimura, linux-mm, mel

KAMEZAWA Hiroyuki wrote:
> On Tue, 21 Oct 2008 17:44:40 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>> I got an idea and maybe can send a patch soon. I'm now finding x86-32 box..
>> Please send it to me, I am able to reproduce the problem with my kvm setup on my
>> 32 bit system. I can do a quick test/verification for you.
>>
> Thanks. how about this ? test on x86-64 is done.
> -Kame
> ==
> 
> 
> 
> page_cgroup_init() is called from mem_cgroup_init(). But at this
> point, we cannot call alloc_bootmem().
> (and this caused panic at boot.)
> 
> This patch moves page_cgroup_init() to init/main.c.
> 
> Time table is following:
> ==
>   parse_args(). # we can trust mem_cgroup_subsys.disabled bit after this.
>   ....
>   cgroup_init_early()  # "early" init of cgroup.
>   ....
>   setup_arch()         # memmap is allocated.
>   ...
>   page_cgroup_init();
>   mem_init();   # we cannot call alloc_bootmem after this.
>   ....
>   cgroup_init() # mem_cgroup is initialized.
> ==
> 
> Before page_cgroup_init(), mem_map must be initialized. So, 
> I added page_cgroup_init() to init/main.c directly.
> 
> (*) maybe this is not very clean but cgroup_init_early() is too early
>     and we have to use vmalloc instead of alloc_bootmem() in cgroup_init().
>     usage of vmalloc area in x86-32 is important and we should avoid
>     vmalloc() in x86-32. So, we want to use alloc_bootmem() from
>     sutaible place.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
>  include/linux/page_cgroup.h |    1 +
>  init/main.c                 |    2 ++
>  mm/memcontrol.c             |    1 -
>  mm/page_cgroup.c            |   35 ++++++++++++++++++++++++++++-------
>  4 files changed, 31 insertions(+), 8 deletions(-)
> 
> Index: linux-2.6/init/main.c
> ===================================================================
> --- linux-2.6.orig/init/main.c
> +++ linux-2.6/init/main.c
> @@ -62,6 +62,7 @@
>  #include <linux/signal.h>
>  #include <linux/idr.h>
>  #include <linux/ftrace.h>
> +#include <linux/page_cgroup.h>
> 
>  #include <asm/io.h>
>  #include <asm/bugs.h>
> @@ -647,6 +648,7 @@ asmlinkage void __init start_kernel(void
>  	vmalloc_init();
>  	vfs_caches_init_early();
>  	cpuset_init_early();
> +	page_cgroup_init();
>  	mem_init();
>  	enable_debug_pagealloc();
>  	cpu_hotplug_init();
> Index: linux-2.6/mm/memcontrol.c
> ===================================================================
> --- linux-2.6.orig/mm/memcontrol.c
> +++ linux-2.6/mm/memcontrol.c
> @@ -1088,7 +1088,6 @@ mem_cgroup_create(struct cgroup_subsys *
>  	int node;
> 
>  	if (unlikely((cont->parent) == NULL)) {
> -		page_cgroup_init();
>  		mem = &init_mem_cgroup;
>  	} else {
>  		mem = mem_cgroup_alloc();
> Index: linux-2.6/include/linux/page_cgroup.h
> ===================================================================
> --- linux-2.6.orig/include/linux/page_cgroup.h
> +++ linux-2.6/include/linux/page_cgroup.h
> @@ -3,6 +3,7 @@
> 
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>  #include <linux/bit_spinlock.h>
> +
>  /*
>   * Page Cgroup can be considered as an extended mem_map.
>   * A page_cgroup page is associated with every page descriptor. The
> Index: linux-2.6/mm/page_cgroup.c
> ===================================================================
> --- linux-2.6.orig/mm/page_cgroup.c
> +++ linux-2.6/mm/page_cgroup.c
> @@ -4,7 +4,12 @@
>  #include <linux/bit_spinlock.h>
>  #include <linux/page_cgroup.h>
>  #include <linux/hash.h>
> +#include <linux/slab.h>
>  #include <linux/memory.h>
> +#include <linux/cgroup.h>
> +
> +extern struct cgroup_subsys	mem_cgroup_subsys;
> +
> 
>  static void __meminit
>  __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
> @@ -66,6 +71,9 @@ void __init page_cgroup_init(void)
> 
>  	int nid, fail;
> 
> +	if (mem_cgroup_subsys.disabled)
> +		return;
> +
>  	for_each_online_node(nid)  {
>  		fail = alloc_node_page_cgroup(nid);
>  		if (fail)
> @@ -106,9 +114,14 @@ int __meminit init_section_page_cgroup(u
>  	nid = page_to_nid(pfn_to_page(pfn));
> 
>  	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
> -	base = kmalloc_node(table_size, GFP_KERNEL, nid);
> -	if (!base)
> -		base = vmalloc_node(table_size, nid);
> +	if (slab_is_available()) {
> +		base = kmalloc_node(table_size, GFP_KERNEL, nid);
> +		if (!base)
> +			base = vmalloc_node(table_size, nid);
> +	} else {
> +		base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
> +				PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> +	}
> 
>  	if (!base) {
>  		printk(KERN_ERR "page cgroup allocation failure\n");
> @@ -135,11 +148,16 @@ void __free_page_cgroup(unsigned long pf
>  	if (!ms || !ms->page_cgroup)
>  		return;
>  	base = ms->page_cgroup + pfn;
> -	ms->page_cgroup = NULL;
> -	if (is_vmalloc_addr(base))
> +	if (is_vmalloc_addr(base)) {
>  		vfree(base);
> -	else
> -		kfree(base);
> +		ms->page_cgroup = NULL;
> +	} else {
> +		struct page *page = virt_to_page(base);
> +		if (!PageReserved(page)) { /* Is bootmem ? */
> +			kfree(base);
> +			ms->page_cgroup = NULL;
> +		}
> +	}
>  }
> 
>  int online_page_cgroup(unsigned long start_pfn,
> @@ -213,6 +231,9 @@ void __init page_cgroup_init(void)
>  	unsigned long pfn;
>  	int fail = 0;
> 
> +	if (mem_cgroup_subsys.disabled)
> +		return;
> +
>  	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
>  		if (!pfn_present(pfn))
>  			continue;

Booted on x86_32 for me

Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>

-- 
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at00000000
  2008-10-21 13:34                                                 ` Balbir Singh
@ 2008-10-21 13:44                                                   ` 亀澤　寛之
  0 siblings, 0 replies; 60+ messages in thread
From: 亀澤　寛之 @ 2008-10-21 13:44 UTC (permalink / raw)
  To: balbir
  Cc: KAMEZAWA Hiroyuki, Ingo Molnar, Li Zefan, Paul Menage,
	Daisuke Nishimura, linux-mm, mel

> KAMEZAWA Hiroyuki wrote:
>> On Tue, 21 Oct 2008 17:44:40 +0530
>> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>>> I got an idea and maybe can send a patch soon. I'm now finding x86-32
>>>> box..
>>> Please send it to me, I am able to reproduce the problem with my kvm
>>> setup on my
>>> 32 bit system. I can do a quick test/verification for you.
>>>
>> Thanks. how about this ? test on x86-64 is done.
>> -Kame
>> ==
>>
>>
>>
>> page_cgroup_init() is called from mem_cgroup_init(). But at this
>> point, we cannot call alloc_bootmem().
>> (and this caused panic at boot.)
>>
>> This patch moves page_cgroup_init() to init/main.c.
>>
>> Time table is following:
>> ==
>>   parse_args(). # we can trust mem_cgroup_subsys.disabled bit after
>> this.
>>   ....
>>   cgroup_init_early()  # "early" init of cgroup.
>>   ....
>>   setup_arch()         # memmap is allocated.
>>   ...
>>   page_cgroup_init();
>>   mem_init();   # we cannot call alloc_bootmem after this.
>>   ....
>>   cgroup_init() # mem_cgroup is initialized.
>> ==
>>
>> Before page_cgroup_init(), mem_map must be initialized. So,
>> I added page_cgroup_init() to init/main.c directly.
>>
>> (*) maybe this is not very clean but cgroup_init_early() is too early
>>     and we have to use vmalloc instead of alloc_bootmem() in
>> cgroup_init().
>>     usage of vmalloc area in x86-32 is important and we should avoid
>>     vmalloc() in x86-32. So, we want to use alloc_bootmem() from
>>     a suitable place.
>>
>> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>>
>>  include/linux/page_cgroup.h |    1 +
>>  init/main.c                 |    2 ++
>>  mm/memcontrol.c             |    1 -
>>  mm/page_cgroup.c            |   35 ++++++++++++++++++++++++++++-------
>>  4 files changed, 31 insertions(+), 8 deletions(-)
>>
>> Index: linux-2.6/init/main.c
>> ===================================================================
>> --- linux-2.6.orig/init/main.c
>> +++ linux-2.6/init/main.c
>> @@ -62,6 +62,7 @@
>>  #include <linux/signal.h>
>>  #include <linux/idr.h>
>>  #include <linux/ftrace.h>
>> +#include <linux/page_cgroup.h>
>>
>>  #include <asm/io.h>
>>  #include <asm/bugs.h>
>> @@ -647,6 +648,7 @@ asmlinkage void __init start_kernel(void
>>  	vmalloc_init();
>>  	vfs_caches_init_early();
>>  	cpuset_init_early();
>> +	page_cgroup_init();
>>  	mem_init();
>>  	enable_debug_pagealloc();
>>  	cpu_hotplug_init();
>> Index: linux-2.6/mm/memcontrol.c
>> ===================================================================
>> --- linux-2.6.orig/mm/memcontrol.c
>> +++ linux-2.6/mm/memcontrol.c
>> @@ -1088,7 +1088,6 @@ mem_cgroup_create(struct cgroup_subsys *
>>  	int node;
>>
>>  	if (unlikely((cont->parent) == NULL)) {
>> -		page_cgroup_init();
>>  		mem = &init_mem_cgroup;
>>  	} else {
>>  		mem = mem_cgroup_alloc();
>> Index: linux-2.6/include/linux/page_cgroup.h
>> ===================================================================
>> --- linux-2.6.orig/include/linux/page_cgroup.h
>> +++ linux-2.6/include/linux/page_cgroup.h
>> @@ -3,6 +3,7 @@
>>
>>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
>>  #include <linux/bit_spinlock.h>
>> +
>>  /*
>>   * Page Cgroup can be considered as an extended mem_map.
>>   * A page_cgroup page is associated with every page descriptor. The
>> Index: linux-2.6/mm/page_cgroup.c
>> ===================================================================
>> --- linux-2.6.orig/mm/page_cgroup.c
>> +++ linux-2.6/mm/page_cgroup.c
>> @@ -4,7 +4,12 @@
>>  #include <linux/bit_spinlock.h>
>>  #include <linux/page_cgroup.h>
>>  #include <linux/hash.h>
>> +#include <linux/slab.h>
>>  #include <linux/memory.h>
>> +#include <linux/cgroup.h>
>> +
>> +extern struct cgroup_subsys	mem_cgroup_subsys;
>> +
>>
>>  static void __meminit
>>  __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
>> @@ -66,6 +71,9 @@ void __init page_cgroup_init(void)
>>
>>  	int nid, fail;
>>
>> +	if (mem_cgroup_subsys.disabled)
>> +		return;
>> +
>>  	for_each_online_node(nid)  {
>>  		fail = alloc_node_page_cgroup(nid);
>>  		if (fail)
>> @@ -106,9 +114,14 @@ int __meminit init_section_page_cgroup(u
>>  	nid = page_to_nid(pfn_to_page(pfn));
>>
>>  	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
>> -	base = kmalloc_node(table_size, GFP_KERNEL, nid);
>> -	if (!base)
>> -		base = vmalloc_node(table_size, nid);
>> +	if (slab_is_available()) {
>> +		base = kmalloc_node(table_size, GFP_KERNEL, nid);
>> +		if (!base)
>> +			base = vmalloc_node(table_size, nid);
>> +	} else {
>> +		base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
>> +				PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>> +	}
>>
>>  	if (!base) {
>>  		printk(KERN_ERR "page cgroup allocation failure\n");
>> @@ -135,11 +148,16 @@ void __free_page_cgroup(unsigned long pf
>>  	if (!ms || !ms->page_cgroup)
>>  		return;
>>  	base = ms->page_cgroup + pfn;
>> -	ms->page_cgroup = NULL;
>> -	if (is_vmalloc_addr(base))
>> +	if (is_vmalloc_addr(base)) {
>>  		vfree(base);
>> -	else
>> -		kfree(base);
>> +		ms->page_cgroup = NULL;
>> +	} else {
>> +		struct page *page = virt_to_page(base);
>> +		if (!PageReserved(page)) { /* Is bootmem ? */
>> +			kfree(base);
>> +			ms->page_cgroup = NULL;
>> +		}
>> +	}
>>  }
>>
>>  int online_page_cgroup(unsigned long start_pfn,
>> @@ -213,6 +231,9 @@ void __init page_cgroup_init(void)
>>  	unsigned long pfn;
>>  	int fail = 0;
>>
>> +	if (mem_cgroup_subsys.disabled)
>> +		return;
>> +
>>  	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
>>  		if (!pfn_present(pfn))
>>  			continue;
>
> Booted on x86_32 for me
>
> Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>
>

Thank you ! (I'll resend later if necessary.)

-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-21 11:32                                           ` KAMEZAWA Hiroyuki
  2008-10-21 11:38                                             ` Ingo Molnar
@ 2008-10-22  2:13                                             ` Daisuke Nishimura
  2008-10-22  2:31                                               ` KAMEZAWA Hiroyuki
  1 sibling, 1 reply; 60+ messages in thread
From: Daisuke Nishimura @ 2008-10-22  2:13 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: nishimura, Ingo Molnar, Li Zefan, balbir, Paul Menage, linux-mm, mel

On Tue, 21 Oct 2008 20:32:15 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> On Tue, 21 Oct 2008 13:28:43 +0200
> Ingo Molnar <mingo@elte.hu> wrote:
> 
> > 
> > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > 
> > > On Tue, 21 Oct 2008 13:19:51 +0200
> > > Ingo Molnar <mingo@elte.hu> wrote:
> > > 
> > > > 
> > > > * Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > > 
> > > > > > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > > > > > ==
> > > > > >  Normal   0x00001000 -> 0x000373fe
> > > > > > ==
> > > > > > This is appearently in the range of page_cgroup initialization.
> > > > > > (if pgdat->node_page_cgroup is initalized correctly...)
> > > > > > 
> > > > > > I think write to page_cgroup->page happens only at initialization.
> > > > > > Hmm ? not initilization failure but curruption ?
> > > > > > 
> > > > > 
> > > > > Yes, curruption. I didn't find informatation about initialization failure.
> > > > > 
> > > > > > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > > > > > 
> > > > > 
> > > > > So I did this change, and the box booted up without any problem.
> > > > > 
> > > > > diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> > > > > index 5d86550..82a30b1 100644
> > > > > --- a/mm/page_cgroup.c
> > > > > +++ b/mm/page_cgroup.c
> > > > > @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> > > > >  
> > > > >  	table_size = sizeof(struct page_cgroup) * nr_pages;
> > > > >  
> > > > > -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > > > > -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > > > > +	base = vmalloc_node(table_size, nid);
> > > > >  	if (!base)
> > > > >  		return -ENOMEM;
> > > > 
> > > > i have this:
> > > > 
> > > >   CONFIG_FAILSLAB=y
> > > >   CONFIG_FAIL_PAGE_ALLOC=y
> > > >   # CONFIG_FAIL_MAKE_REQUEST is not set
> > > >   CONFIG_FAIL_IO_TIMEOUT=y
> > > > 
> > > > so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
> > > > and this code continued silently? vmalloc_node() probably is more 
> > > > agressive about allocating memory.
> > > > 
> > > Sorry. I think I cannot use alloc_bootmem() at this point because
> > > it's too late in init-path. (we can use usual page allocator)
> > > So, just replacing alloc_bootmem() with vmalloc_node() is a fix....
> > 
> > okay. So what is needed for the crash is:
> > 
> >  CONFIG_CGROUPS=y
> >  CONFIG_CGROUP_MEM_RES_CTLR=y
> > 
> yes. maybe.  I think you can avoid crash by cgroup_disable=memory boot option.
> But Nishimura reports he cannot boot with cgroup_disable=memory. So I wonder
> there may be something other. But calling alloc_bootmem() here should be avoided.
> 
The "hang on boot" problem I reported also happens on a !CONFIG_CGROUP kernel,
and it seems to be H/W dependent.

I'll dig more when I have time.


Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

* Re: [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000
  2008-10-22  2:13                                             ` Daisuke Nishimura
@ 2008-10-22  2:31                                               ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 60+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-10-22  2:31 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: Ingo Molnar, Li Zefan, balbir, Paul Menage, linux-mm, mel

On Wed, 22 Oct 2008 11:13:31 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> On Tue, 21 Oct 2008 20:32:15 +0900, KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > On Tue, 21 Oct 2008 13:28:43 +0200
> > Ingo Molnar <mingo@elte.hu> wrote:
> > 
> > > 
> > > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > 
> > > > On Tue, 21 Oct 2008 13:19:51 +0200
> > > > Ingo Molnar <mingo@elte.hu> wrote:
> > > > 
> > > > > 
> > > > > * Li Zefan <lizf@cn.fujitsu.com> wrote:
> > > > > 
> > > > > > > Oh! thanks...but it seems pc->page is NULL in the middle of ZONE_NORMAL..
> > > > > > > ==
> > > > > > >  Normal   0x00001000 -> 0x000373fe
> > > > > > > ==
> > > > > > > This is appearently in the range of page_cgroup initialization.
> > > > > > > (if pgdat->node_page_cgroup is initalized correctly...)
> > > > > > > 
> > > > > > > I think write to page_cgroup->page happens only at initialization.
> > > > > > > Hmm ? not initilization failure but curruption ?
> > > > > > > 
> > > > > > 
> > > > > > Yes, curruption. I didn't find informatation about initialization failure.
> > > > > > 
> > > > > > > What happens if replacing __alloc_bootmem() with vmalloc() in page_cgroup.c init ?
> > > > > > > 
> > > > > > 
> > > > > > So I did this change, and the box booted up without any problem.
> > > > > > 
> > > > > > diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> > > > > > index 5d86550..82a30b1 100644
> > > > > > --- a/mm/page_cgroup.c
> > > > > > +++ b/mm/page_cgroup.c
> > > > > > @@ -48,8 +48,7 @@ static int __init alloc_node_page_cgroup(int nid)
> > > > > >  
> > > > > >  	table_size = sizeof(struct page_cgroup) * nr_pages;
> > > > > >  
> > > > > > -	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > > > > > -			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > > > > > +	base = vmalloc_node(table_size, nid);
> > > > > >  	if (!base)
> > > > > >  		return -ENOMEM;
> > > > > 
> > > > > i have this:
> > > > > 
> > > > >   CONFIG_FAILSLAB=y
> > > > >   CONFIG_FAIL_PAGE_ALLOC=y
> > > > >   # CONFIG_FAIL_MAKE_REQUEST is not set
> > > > >   CONFIG_FAIL_IO_TIMEOUT=y
> > > > > 
> > > > > so the bug was perhaps that the __alloc_bootmem_node_nopanic() failed 
> > > > > and this code continued silently? vmalloc_node() probably is more 
> > > > > agressive about allocating memory.
> > > > > 
> > > > Sorry. I think I cannot use alloc_bootmem() at this point because
> > > > it's too late in init-path. (we can use usual page allocator)
> > > > So, just replacing alloc_bootmem() with vmalloc_node() is a fix....
> > > 
> > > okay. So what is needed for the crash is:
> > > 
> > >  CONFIG_CGROUPS=y
> > >  CONFIG_CGROUP_MEM_RES_CTLR=y
> > > 
> > yes. maybe.  I think you can avoid crash by cgroup_disable=memory boot option.
> > But Nishimura reports he cannot boot with cgroup_disable=memory. So I wonder
> > there may be something other. But calling alloc_bootmem() here should be avoided.
> > 
> The "hang on boot" problem I reported hapens also on !CONFIG_CGROUP kernel too,
> and it seems to be H/W dependent.
> 
> I'll dig more when I have time.
> 
Oh, thank you for reporting. (My own small box hangs too, but I haven't found out why yet.)

Thanks,
-Kame


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 60+ messages in thread

end of thread, other threads:[~2008-10-22  2:31 UTC | newest]

Thread overview: 60+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-10-17 10:48 [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) Daisuke Nishimura
2008-10-17 10:56 ` [PATCH -mm 1/5] memcg: replace res_counter Daisuke Nishimura
2008-10-20 19:53   ` Paul Menage
2008-10-21  1:14     ` KAMEZAWA Hiroyuki
2008-10-21  1:29       ` Paul Menage
2008-10-21  1:49         ` KAMEZAWA Hiroyuki
2008-10-21  2:15           ` Paul Menage
2008-10-21  2:50             ` KAMEZAWA Hiroyuki
2008-10-21  2:20           ` Paul Menage
2008-10-21  3:03             ` KAMEZAWA Hiroyuki
2008-10-21  6:30               ` Paul Menage
2008-10-21  5:30       ` Balbir Singh
2008-10-21  5:39         ` KAMEZAWA Hiroyuki
2008-10-21  6:20           ` [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000 Li Zefan
2008-10-21  6:25             ` KAMEZAWA Hiroyuki
2008-10-21  6:28               ` Li Zefan
2008-10-21  6:38                 ` Daisuke Nishimura
2008-10-21  6:54             ` KAMEZAWA Hiroyuki
2008-10-21  7:04               ` Li Zefan
2008-10-21  7:16                 ` KAMEZAWA Hiroyuki
2008-10-21  7:21                   ` Li Zefan
2008-10-21  8:18                     ` KAMEZAWA Hiroyuki
2008-10-21  8:34                       ` Mel Gorman
2008-10-21  8:38                         ` KAMEZAWA Hiroyuki
2008-10-21  8:35                       ` Li Zefan
2008-10-21  8:36                         ` KAMEZAWA Hiroyuki
2008-10-21  8:57                         ` KAMEZAWA Hiroyuki
2008-10-21  9:13                           ` Li Zefan
2008-10-21  9:25                             ` KAMEZAWA Hiroyuki
2008-10-21  9:54                               ` Li Zefan
2008-10-21 10:14                                 ` KAMEZAWA Hiroyuki
2008-10-21 10:57                                   ` Li Zefan
2008-10-21 11:00                                     ` KAMEZAWA Hiroyuki
2008-10-21 11:09                                       ` KAMEZAWA Hiroyuki
2008-10-21 11:13                                     ` KAMEZAWA Hiroyuki
2008-10-21 11:19                                     ` Ingo Molnar
2008-10-21 11:23                                       ` KAMEZAWA Hiroyuki
2008-10-21 11:28                                         ` Ingo Molnar
2008-10-21 11:32                                           ` KAMEZAWA Hiroyuki
2008-10-21 11:38                                             ` Ingo Molnar
2008-10-22  2:13                                             ` Daisuke Nishimura
2008-10-22  2:31                                               ` KAMEZAWA Hiroyuki
2008-10-21 11:29                                         ` Balbir Singh
2008-10-21 11:34                                           ` KAMEZAWA Hiroyuki
2008-10-21 12:00                                           ` KAMEZAWA Hiroyuki
2008-10-21 12:14                                             ` Balbir Singh
2008-10-21 13:09                                               ` KAMEZAWA Hiroyuki
2008-10-21 13:25                                                 ` Balbir Singh
2008-10-21 13:34                                                 ` Balbir Singh
2008-10-21 13:44                                                   ` [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000 亀澤　寛之
2008-10-21 10:58                                   ` [memcg BUG] unable to handle kernel NULL pointer derefence at 00000000 Balbir Singh
2008-10-21  9:33                           ` Daisuke Nishimura
2008-10-21  9:41                             ` KAMEZAWA Hiroyuki
2008-10-21 10:15                               ` Daisuke Nishimura
2008-10-17 10:59 ` [PATCH -mm 2/5] memcg: mem_cgroup private ID Daisuke Nishimura
2008-10-17 11:01 ` [PATCH -mm 3/5] memcg: mem+swap controller Kconfig Daisuke Nishimura, KAMEZAWA Hiroyuki
2008-10-17 11:04 ` [PATCH -mm 4/5] memcg: mem+swap counter Daisuke Nishimura
2008-10-17 11:06 ` [PATCH -mm 5/5] memcg: mem+swap accounting Daisuke Nishimura
2008-10-20  0:24 ` [RFC][PATCH -mm 0/5] mem+swap resource controller(trial patch) KAMEZAWA Hiroyuki
2008-10-20  2:53   ` Daisuke Nishimura

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox