linux-mm.kvack.org archive mirror
* [PATCH 0/5] memcg: towards I/O aware memcg v7.
@ 2010-09-01  6:39 KAMEZAWA Hiroyuki
  2010-09-01  6:41 ` [PATCH 1/5] cgroup: change allocation of css ID placement KAMEZAWA Hiroyuki
                   ` (5 more replies)
  0 siblings, 6 replies; 10+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01  6:39 UTC (permalink / raw)
  To: linux-mm
  Cc: nishimura, akpm, balbir, linux-kernel, gthelen, Munehiro Ikeda,
	menage, lizf


Major changes from v6 are:
 a) added documentation about CSS ID.
 b) fixed typos and bugs.
 c) refreshed some comments.

based on mmotm-2010-08-27

Patch brief view:
 1. changes css ID allocation in kernel/cgroup.c
 2. use ID-array in memcg.
 3. record ID to page_cgroup rather than pointer.
 4. make update_file_mapped an RCU-aware routine instead of a spinlocked one.
 5. make update_file_mapped a general-purpose function.


Thanks,
-Kame


* [PATCH 1/5] cgroup: change allocation of css ID placement
  2010-09-01  6:39 [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
@ 2010-09-01  6:41 ` KAMEZAWA Hiroyuki
  2010-09-09 16:32   ` Greg Thelen
  2010-09-01  6:42 ` [PATCH 2/5] memcg: more use of css ID in memcg KAMEZAWA Hiroyuki
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01  6:41 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, nishimura, akpm, balbir, linux-kernel, gthelen,
	Munehiro Ikeda, menage, lizf

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, a css's ID is allocated after ->create() is called. But to make use of
the ID in ->create(), it must be available before ->create().

Put another way, since the ID is tightly coupled with the "css", it should be
allocated when the "css" is allocated. This patch moves alloc_css_id() into
the css allocation routine. Currently only two subsystems, memory and blkio,
use IDs (to support complicated hierarchy walks).

The ID will be used in the memory cgroup's ->create() in a later patch.

This patch also adds css ID documentation, which was previously missing.

Note:
If anyone changes the rules of css allocation, the ID allocation must be
changed as well.
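
As an illustration only (not part of the patch), here is a minimal sketch of
a ->create() callback under the new convention; "foo" is a hypothetical
subsystem, and the real call sites are the memcontrol.c and blk-cgroup.c
hunks below:

struct foo_cgroup {			/* hypothetical subsystem state */
	struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *
foo_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct foo_cgroup *foo = kzalloc(sizeof(*foo), GFP_KERNEL);
	int ret;

	if (!foo)
		return ERR_PTR(-ENOMEM);
	/* allocate the css ID ourselves, before anyone can see the css */
	ret = alloc_css_id(ss, cgrp, &foo->css);
	if (ret) {
		kfree(foo);
		return ERR_PTR(ret);
	}
	/* css_id(&foo->css) is usable from here on */
	return &foo->css;
}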

Changelog: 2010/09/01
 - modified cgroups.txt

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/cgroups/cgroups.txt |   48 ++++++++++++++++++++++++++++++++++++
 block/blk-cgroup.c                |    9 ++++++
 include/linux/cgroup.h            |   16 ++++++------
 kernel/cgroup.c                   |   50 +++++++++++---------------------------
 mm/memcontrol.c                   |    5 +++
 5 files changed, 86 insertions(+), 42 deletions(-)

Index: mmotm-0827/kernel/cgroup.c
===================================================================
--- mmotm-0827.orig/kernel/cgroup.c
+++ mmotm-0827/kernel/cgroup.c
@@ -289,9 +289,6 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
-static int cgroup_init_idr(struct cgroup_subsys *ss,
-			   struct cgroup_subsys_state *css);
-
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set.  Nests outside task->alloc_lock
  * due to cgroup_iter_start() */
@@ -770,9 +767,6 @@ static struct backing_dev_info cgroup_ba
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
-static int alloc_css_id(struct cgroup_subsys *ss,
-			struct cgroup *parent, struct cgroup *child);
-
 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -3258,7 +3252,8 @@ static void init_cgroup_css(struct cgrou
 	css->cgroup = cgrp;
 	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
-	css->id = NULL;
+	if (!ss->use_id)
+		css->id = NULL;
 	if (cgrp == dummytop)
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -3343,12 +3338,6 @@ static long cgroup_create(struct cgroup 
 			goto err_destroy;
 		}
 		init_cgroup_css(css, ss, cgrp);
-		if (ss->use_id) {
-			err = alloc_css_id(ss, parent, cgrp);
-			if (err)
-				goto err_destroy;
-		}
-		/* At error, ->destroy() callback has to free assigned ID. */
 	}
 
 	cgroup_lock_hierarchy(root);
@@ -3710,17 +3699,6 @@ int __init_or_module cgroup_load_subsys(
 
 	/* our new subsystem will be attached to the dummy hierarchy. */
 	init_cgroup_css(css, ss, dummytop);
-	/* init_idr must be after init_cgroup_css because it sets css->id. */
-	if (ss->use_id) {
-		int ret = cgroup_init_idr(ss, css);
-		if (ret) {
-			dummytop->subsys[ss->subsys_id] = NULL;
-			ss->destroy(ss, dummytop);
-			subsys[i] = NULL;
-			mutex_unlock(&cgroup_mutex);
-			return ret;
-		}
-	}
 
 	/*
 	 * Now we need to entangle the css into the existing css_sets. unlike
@@ -3889,8 +3867,6 @@ int __init cgroup_init(void)
 		struct cgroup_subsys *ss = subsys[i];
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
-		if (ss->use_id)
-			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
 	}
 
 	/* Add init_css_set to the hash table */
@@ -4604,8 +4580,8 @@ err_out:
 
 }
 
-static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
-					    struct cgroup_subsys_state *rootcss)
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+			    struct cgroup_subsys_state *rootcss)
 {
 	struct css_id *newid;
 
@@ -4617,21 +4593,25 @@ static int __init_or_module cgroup_init_
 		return PTR_ERR(newid);
 
 	newid->stack[0] = newid->id;
-	newid->css = rootcss;
-	rootcss->id = newid;
+	rcu_assign_pointer(newid->css, rootcss);
+	rcu_assign_pointer(rootcss->id, newid);
 	return 0;
 }
 
-static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
-			struct cgroup *child)
+int alloc_css_id(struct cgroup_subsys *ss,
+	struct cgroup *cgrp, struct cgroup_subsys_state *css)
 {
 	int subsys_id, i, depth = 0;
-	struct cgroup_subsys_state *parent_css, *child_css;
+	struct cgroup_subsys_state *parent_css;
+	struct cgroup *parent;
 	struct css_id *child_id, *parent_id;
 
+	if (cgrp == dummytop)
+		return cgroup_init_idr(ss, css);
+
+	parent = cgrp->parent;
 	subsys_id = ss->subsys_id;
 	parent_css = parent->subsys[subsys_id];
-	child_css = child->subsys[subsys_id];
 	parent_id = parent_css->id;
 	depth = parent_id->depth + 1;
 
@@ -4646,7 +4626,7 @@ static int alloc_css_id(struct cgroup_su
 	 * child_id->css pointer will be set after this cgroup is available
 	 * see cgroup_populate_dir()
 	 */
-	rcu_assign_pointer(child_css->id, child_id);
+	rcu_assign_pointer(css->id, child_id);
 
 	return 0;
 }
Index: mmotm-0827/include/linux/cgroup.h
===================================================================
--- mmotm-0827.orig/include/linux/cgroup.h
+++ mmotm-0827/include/linux/cgroup.h
@@ -588,9 +588,11 @@ static inline int cgroup_attach_task_cur
 /*
  * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
  * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
- * CSS ID is assigned at cgroup allocation (create) automatically
- * and removed when subsys calls free_css_id() function. This is because
- * the lifetime of cgroup_subsys_state is subsys's matter.
+ * CSS ID must be assigned by subsys itself at cgroup creation and deleted
+ * when subsys calls free_css_id() function. This is because the life time of
+ * of cgroup_subsys_state is subsys's matter.
+ *
+ * ID->css look up is available after cgroup's directory is populated.
  *
  * Looking up and scanning function should be called under rcu_read_lock().
  * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls.
@@ -598,10 +600,10 @@ static inline int cgroup_attach_task_cur
  * destroyed". The caller should check css and cgroup's status.
  */
 
-/*
- * Typically Called at ->destroy(), or somewhere the subsys frees
- * cgroup_subsys_state.
- */
+/* Should be called in ->create() by subsys itself */
+int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *newgr,
+		struct cgroup_subsys_state *css);
+/* Typically Called at ->destroy(), or somewhere the subsys frees css */
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
 
 /* Find a cgroup_subsys_state which has given ID */
Index: mmotm-0827/mm/memcontrol.c
===================================================================
--- mmotm-0827.orig/mm/memcontrol.c
+++ mmotm-0827/mm/memcontrol.c
@@ -4141,6 +4141,11 @@ mem_cgroup_create(struct cgroup_subsys *
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
 			goto free_out;
 
+	error = alloc_css_id(ss, cont, &mem->css);
+	if (error)
+		goto free_out;
+	/* Here, css_id(&mem->css) works. but css_lookup(id)->mem doesn't */
+
 	/* root ? */
 	if (cont->parent == NULL) {
 		int cpu;
Index: mmotm-0827/block/blk-cgroup.c
===================================================================
--- mmotm-0827.orig/block/blk-cgroup.c
+++ mmotm-0827/block/blk-cgroup.c
@@ -958,9 +958,13 @@ blkiocg_create(struct cgroup_subsys *sub
 {
 	struct blkio_cgroup *blkcg;
 	struct cgroup *parent = cgroup->parent;
+	int ret;
 
 	if (!parent) {
 		blkcg = &blkio_root_cgroup;
+		ret = alloc_css_id(subsys, cgroup, &blkcg->css);
+		if (ret)
+			return ERR_PTR(ret);
 		goto done;
 	}
 
@@ -971,6 +975,11 @@ blkiocg_create(struct cgroup_subsys *sub
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 	if (!blkcg)
 		return ERR_PTR(-ENOMEM);
+	ret = alloc_css_id(subsys, cgroup, &blkcg->css);
+	if (ret) {
+		kfree(blkcg);
+		return ERR_PTR(ret);
+	}
 
 	blkcg->weight = BLKIO_WEIGHT_DEFAULT;
 done:
Index: mmotm-0827/Documentation/cgroups/cgroups.txt
===================================================================
--- mmotm-0827.orig/Documentation/cgroups/cgroups.txt
+++ mmotm-0827/Documentation/cgroups/cgroups.txt
@@ -621,6 +621,54 @@ and root cgroup. Currently this will onl
 the default hierarchy (which never has sub-cgroups) and a hierarchy
 that is being created/destroyed (and hence has no sub-cgroups).
 
+3.4 cgroup subsys state IDs.
+------------
+When subsystem sets use_id == true, an ID per [cgroup, subsys] is added
+and it will be tied to cgroup_subsys_state object.
+
+When use_id==true can use following interfaces. But please note that
+allocation/free an ID is subsystem's job because cgroup_subsys_state
+object's lifetime is subsystem's matter.
+
+unsigned short css_id(struct cgroup_subsys_state *css)
+
+Returns ID of cgroup_subsys_state
+
+unsigend short css_depth(struct cgroup_subsys_state *css)
+
+Returns the level which "css" is exisiting under hierarchy tree.
+The root cgroup's depth 0, its children are 1, children's children are
+2....
+
+int alloc_css_id(struct struct cgroup_subsys *ss, struct cgroup *newgr,
+                struct cgroup_subsys_state *css);
+
+Attach an new ID to given css under subsystem ([ss, cgroup])
+should be called in ->create() callback.
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
+
+Free ID attached to "css" under subsystem. Should be called before
+"css" is freed.
+
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
+
+Look up cgroup_subsys_state via ID. Should be called under rcu_read_lock().
+
+struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
+                struct cgroup_subsys_state *root, int *foundid);
+
+Returns ID which is under "root" i.e. under sub-directory of "root"
+cgroup's directory at considering cgroup hierarchy. The order of IDs
+returned by this function is not sorted. Please be careful.
+
+bool css_is_ancestor(struct cgroup_subsys_state *cg,
+                     const struct cgroup_subsys_state *root);
+
+Returns true if "root" and "cs" is under the same hierarchy and
+"root" can be found when you see all ->parent from "cs" until
+the root cgroup.
+
 4. Questions
 ============
 


* [PATCH 2/5] memcg: more use of css ID in memcg.
  2010-09-01  6:39 [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
  2010-09-01  6:41 ` [PATCH 1/5] cgroup: change allocation of css ID placement KAMEZAWA Hiroyuki
@ 2010-09-01  6:42 ` KAMEZAWA Hiroyuki
  2010-09-01  7:52   ` Daisuke Nishimura
  2010-09-01  6:43 ` [PATCH 3/5] memcg: use ID in page cgroup KAMEZAWA Hiroyuki
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01  6:42 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, nishimura, akpm, balbir, linux-kernel, gthelen,
	Munehiro Ikeda, menage, lizf

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, the memory cgroup has an ID per cgroup and makes use of it for
 - hierarchy walks,
 - swap recording.

This patch makes more use of it. The final purpose is to replace
page_cgroup->mem_cgroup's pointer with an unsigned short.

This patch caches memcg pointers in an array indexed by css ID. With this,
we don't have to call css_lookup(), which requires a radix-tree walk; this
reduces the cost of looking up a memcg via its ID.
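
With the array in place, looking up a memcg by ID on the reader side becomes
an RCU-protected dereference plus a refcount attempt. A minimal sketch of the
pattern, matching the try_get_mem_cgroup_from_page() hunk below:

	struct mem_cgroup *mem;

	rcu_read_lock();
	mem = id_to_memcg(id);			/* NULL if the ID is unused */
	if (mem && !css_tryget(&mem->css))	/* the group may be dying */
		mem = NULL;
	rcu_read_unlock();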

Changelog: 20100901
 - added unregister_memcg_id() and did some cleanup.
 - removed ->valid.
 - fixed mem_cgroup_num counter handling.

Changelog: 20100825
 - applied comments.

Changelog: 20100811
 - adjusted onto mmotm-2010-08-11
 - fixed RCU related parts.
 - use attach_id() callback.

Changelog: 20100804
 - fixed description in init/Kconfig

Changelog: 20100730
 - fixed rcu_read_unlock() placement.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 init/Kconfig    |   10 +++++++
 mm/memcontrol.c |   71 ++++++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 61 insertions(+), 20 deletions(-)

Index: mmotm-0827/mm/memcontrol.c
===================================================================
--- mmotm-0827.orig/mm/memcontrol.c
+++ mmotm-0827/mm/memcontrol.c
@@ -294,6 +294,33 @@ static bool move_file(void)
 					&mc.to->move_charge_at_immigrate);
 }
 
+/* 0 is unused */
+static atomic_t mem_cgroup_num;
+#define NR_MEMCG_GROUPS (CONFIG_MEM_CGROUP_MAX_GROUPS + 1)
+static struct mem_cgroup *mem_cgroups[NR_MEMCG_GROUPS] __read_mostly;
+
+/* Must be called under rcu_read_lock */
+static struct mem_cgroup *id_to_memcg(unsigned short id)
+{
+	struct mem_cgroup *mem;
+	mem = rcu_dereference_check(mem_cgroups[id], rcu_read_lock_held());
+	return mem;
+}
+
+static void register_memcg_id(struct mem_cgroup *mem)
+{
+	int id = css_id(&mem->css);
+	rcu_assign_pointer(mem_cgroups[id], mem);
+}
+
+static void unregister_memcg_id(struct mem_cgroup *mem)
+{
+	int id = css_id(&mem->css);
+	rcu_assign_pointer(mem_cgroups[id], NULL);
+	/* Wait until all references are gone. */
+	synchronize_rcu();
+}
+
 /*
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
@@ -1847,18 +1874,7 @@ static void mem_cgroup_cancel_charge(str
  * it's concern. (dropping refcnt from swap can be called against removed
  * memcg.)
  */
-static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
-{
-	struct cgroup_subsys_state *css;
 
-	/* ID 0 is unused ID */
-	if (!id)
-		return NULL;
-	css = css_lookup(&mem_cgroup_subsys, id);
-	if (!css)
-		return NULL;
-	return container_of(css, struct mem_cgroup, css);
-}
 
 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
@@ -1879,7 +1895,7 @@ struct mem_cgroup *try_get_mem_cgroup_fr
 		ent.val = page_private(page);
 		id = lookup_swap_cgroup(ent);
 		rcu_read_lock();
-		mem = mem_cgroup_lookup(id);
+		mem = id_to_memcg(id);
 		if (mem && !css_tryget(&mem->css))
 			mem = NULL;
 		rcu_read_unlock();
@@ -2231,7 +2247,7 @@ __mem_cgroup_commit_charge_swapin(struct
 
 		id = swap_cgroup_record(ent, 0);
 		rcu_read_lock();
-		memcg = mem_cgroup_lookup(id);
+		memcg = id_to_memcg(id);
 		if (memcg) {
 			/*
 			 * This recorded memcg can be obsolete one. So, avoid
@@ -2240,9 +2256,10 @@ __mem_cgroup_commit_charge_swapin(struct
 			if (!mem_cgroup_is_root(memcg))
 				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 			mem_cgroup_swap_statistics(memcg, false);
+			rcu_read_unlock();
 			mem_cgroup_put(memcg);
-		}
-		rcu_read_unlock();
+		} else
+			rcu_read_unlock();
 	}
 	/*
 	 * At swapin, we may charge account against cgroup which has no tasks.
@@ -2495,7 +2512,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
 
 	id = swap_cgroup_record(ent, 0);
 	rcu_read_lock();
-	memcg = mem_cgroup_lookup(id);
+	memcg = id_to_memcg(id);
 	if (memcg) {
 		/*
 		 * We uncharge this because swap is freed.
@@ -2504,9 +2521,10 @@ void mem_cgroup_uncharge_swap(swp_entry_
 		if (!mem_cgroup_is_root(memcg))
 			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
+		rcu_read_unlock();
 		mem_cgroup_put(memcg);
-	}
-	rcu_read_unlock();
+	} else
+		rcu_read_unlock();
 }
 
 /**
@@ -4010,6 +4028,9 @@ static struct mem_cgroup *mem_cgroup_all
 	struct mem_cgroup *mem;
 	int size = sizeof(struct mem_cgroup);
 
+	if (atomic_read(&mem_cgroup_num) == NR_MEMCG_GROUPS)
+		return NULL;
+
 	/* Can be very big if MAX_NUMNODES is very big */
 	if (size < PAGE_SIZE)
 		mem = kmalloc(size, GFP_KERNEL);
@@ -4028,6 +4049,8 @@ static struct mem_cgroup *mem_cgroup_all
 			vfree(mem);
 		mem = NULL;
 	}
+	if (mem)
+		atomic_inc(&mem_cgroup_num);
 	return mem;
 }
 
@@ -4049,6 +4072,7 @@ static void __mem_cgroup_free(struct mem
 	mem_cgroup_remove_from_trees(mem);
 	free_css_id(&mem_cgroup_subsys, &mem->css);
 
+	atomic_dec(&mem_cgroup_num);
 	for_each_node_state(node, N_POSSIBLE)
 		free_mem_cgroup_per_zone_info(mem, node);
 
@@ -4059,6 +4083,12 @@ static void __mem_cgroup_free(struct mem
 		vfree(mem);
 }
 
+static void mem_cgroup_free(struct mem_cgroup *mem)
+{
+	unregister_memcg_id(mem);
+	__mem_cgroup_free(mem);
+}
+
 static void mem_cgroup_get(struct mem_cgroup *mem)
 {
 	atomic_inc(&mem->refcnt);
@@ -4068,7 +4098,7 @@ static void __mem_cgroup_put(struct mem_
 {
 	if (atomic_sub_and_test(count, &mem->refcnt)) {
 		struct mem_cgroup *parent = parent_mem_cgroup(mem);
-		__mem_cgroup_free(mem);
+		mem_cgroup_free(mem);
 		if (parent)
 			mem_cgroup_put(parent);
 	}
@@ -4189,9 +4219,10 @@ mem_cgroup_create(struct cgroup_subsys *
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
+	register_memcg_id(mem);
 	return &mem->css;
 free_out:
-	__mem_cgroup_free(mem);
+	mem_cgroup_free(mem);
 	root_mem_cgroup = NULL;
 	return ERR_PTR(error);
 }
Index: mmotm-0827/init/Kconfig
===================================================================
--- mmotm-0827.orig/init/Kconfig
+++ mmotm-0827/init/Kconfig
@@ -612,6 +612,16 @@ config CGROUP_MEM_RES_CTLR_SWAP
 	  Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
 	  size is 4096bytes, 512k per 1Gbytes of swap.
 
+config MEM_CGROUP_MAX_GROUPS
+	int "Maximum number of memory cgroups on a system"
+	range 1 65535
+	default 8192 if 64BIT
+	default 2048 if 32BIT
+	help
+	  The memory cgroup has a limit on the number of groups that can be
+	  created. Please select a value that suits your system. The more
+	  groups you allow, the more memory (a pointer per group) is consumed.
+
 menuconfig CGROUP_SCHED
 	bool "Group CPU scheduler"
 	depends on EXPERIMENTAL && CGROUPS


* [PATCH 3/5] memcg: use ID in page cgroup
  2010-09-01  6:39 [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
  2010-09-01  6:41 ` [PATCH 1/5] cgroup: change allocation of css ID placement KAMEZAWA Hiroyuki
  2010-09-01  6:42 ` [PATCH 2/5] memcg: more use of css ID in memcg KAMEZAWA Hiroyuki
@ 2010-09-01  6:43 ` KAMEZAWA Hiroyuki
  2010-09-01  6:45 ` [PATCH 4/5] memcg: light weight file mapped update lock system KAMEZAWA Hiroyuki
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01  6:43 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, nishimura, akpm, balbir, linux-kernel, gthelen,
	Munehiro Ikeda, menage, lizf

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, the address of a memory cgroup can be computed from its ID without any
complex lookup. This patch replaces pc->mem_cgroup, turning it from a pointer
into an unsigned short. On 64-bit architectures, this frees 6 more bytes per
page_cgroup: 2 bytes are used for blkio-cgroup's page tracking, and the
remaining 4 bytes will be used for some light-weight concurrent access.

We may be able to move this ID into the flags field, but let's go step by step.
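
In short, charge paths store the 2-byte ID and readers translate it back
through the lookup array introduced by the previous patch; in sketch form:

	/* at charge time, under lock_page_cgroup() */
	pc->mem_cgroup = css_id(&mem->css);

	/* at read time; "true" says the caller holds a lock that pins pc */
	mem = id_to_memcg(pc->mem_cgroup, true);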

Changelog: 20100824
 - fixed comments, and typo.
Changelog: 20100811
 - using new rcu APIs, as rcu_dereference_check() etc.
Changelog: 20100804
 - added comments to page_cgroup.h
Changelog: 20100730
 - fixed some garbage added by debug code in early stage

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/page_cgroup.h |    6 ++++
 mm/memcontrol.c             |   53 ++++++++++++++++++++++++++++----------------
 mm/page_cgroup.c            |    2 -
 3 files changed, 40 insertions(+), 21 deletions(-)

Index: mmotm-0827/include/linux/page_cgroup.h
===================================================================
--- mmotm-0827.orig/include/linux/page_cgroup.h
+++ mmotm-0827/include/linux/page_cgroup.h
@@ -9,10 +9,14 @@
  * page_cgroup helps us identify information about the cgroup
  * All page cgroups are allocated at boot or memory hotplug event,
  * then the page cgroup for pfn always exists.
+ *
+ * TODO: It seems the cgroup ID could be packed into "flags", but there would
+ * be a race between assigning the ID and setting/clearing flags. Be careful.
  */
 struct page_cgroup {
 	unsigned long flags;
-	struct mem_cgroup *mem_cgroup;
+	unsigned short mem_cgroup;	/* ID of assigned memory cgroup */
+	unsigned short blk_cgroup;	/* Not Used..but will be. */
 	struct page *page;
 	struct list_head lru;		/* per cgroup LRU list */
 };
Index: mmotm-0827/mm/page_cgroup.c
===================================================================
--- mmotm-0827.orig/mm/page_cgroup.c
+++ mmotm-0827/mm/page_cgroup.c
@@ -15,7 +15,7 @@ static void __meminit
 __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
 {
 	pc->flags = 0;
-	pc->mem_cgroup = NULL;
+	pc->mem_cgroup = 0;
 	pc->page = pfn_to_page(pfn);
 	INIT_LIST_HEAD(&pc->lru);
 }
Index: mmotm-0827/mm/memcontrol.c
===================================================================
--- mmotm-0827.orig/mm/memcontrol.c
+++ mmotm-0827/mm/memcontrol.c
@@ -299,11 +299,15 @@ static atomic_t mem_cgroup_num;
 #define NR_MEMCG_GROUPS (CONFIG_MEM_CGROUP_MAX_GROUPS + 1)
 static struct mem_cgroup *mem_cgroups[NR_MEMCG_GROUPS] __read_mostly;
 
-/* Must be called under rcu_read_lock */
-static struct mem_cgroup *id_to_memcg(unsigned short id)
+/*
+ * Must be called under rcu_read_lock(). Set safe==true if you're sure
+ * you're in a safe context, e.g. under lock_page_cgroup().
+ */
+static struct mem_cgroup *id_to_memcg(unsigned short id, bool safe)
 {
 	struct mem_cgroup *mem;
-	mem = rcu_dereference_check(mem_cgroups[id], rcu_read_lock_held());
+	mem = rcu_dereference_check(mem_cgroups[id],
+				rcu_read_lock_held() || safe);
 	return mem;
 }
 
@@ -384,7 +388,12 @@ struct cgroup_subsys_state *mem_cgroup_c
 static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct page_cgroup *pc)
 {
-	struct mem_cgroup *mem = pc->mem_cgroup;
+	/*
+	 * The caller must guarantee this "pc" is under lock. In the typical
+	 * case, this function is called by an LRU function with zone->lru_lock
+	 * held, so the access is safe.
+	 */
+	struct mem_cgroup *mem = id_to_memcg(pc->mem_cgroup, true);
 	int nid = page_cgroup_nid(pc);
 	int zid = page_cgroup_zid(pc);
 
@@ -726,6 +735,11 @@ static inline bool mem_cgroup_is_root(st
 	return (mem == root_mem_cgroup);
 }
 
+static inline bool mem_cgroup_is_rootid(unsigned short id)
+{
+	return (id == 1);
+}
+
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
@@ -758,7 +772,7 @@ void mem_cgroup_del_lru_list(struct page
 	 */
 	mz = page_cgroup_zoneinfo(pc);
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
-	if (mem_cgroup_is_root(pc->mem_cgroup))
+	if (mem_cgroup_is_rootid(pc->mem_cgroup))
 		return;
 	VM_BUG_ON(list_empty(&pc->lru));
 	list_del_init(&pc->lru);
@@ -785,7 +799,7 @@ void mem_cgroup_rotate_lru_list(struct p
 	 */
 	smp_rmb();
 	/* unused or root page is not rotated. */
-	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+	if (!PageCgroupUsed(pc) || mem_cgroup_is_rootid(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -811,7 +825,7 @@ void mem_cgroup_add_lru_list(struct page
 	mz = page_cgroup_zoneinfo(pc);
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
 	SetPageCgroupAcctLRU(pc);
-	if (mem_cgroup_is_root(pc->mem_cgroup))
+	if (mem_cgroup_is_rootid(pc->mem_cgroup))
 		return;
 	list_add(&pc->lru, &mz->lists[lru]);
 }
@@ -1500,7 +1514,7 @@ void mem_cgroup_update_file_mapped(struc
 		return;
 
 	lock_page_cgroup(pc);
-	mem = pc->mem_cgroup;
+	mem = id_to_memcg(pc->mem_cgroup, true);
 	if (!mem || !PageCgroupUsed(pc))
 		goto done;
 
@@ -1888,14 +1902,14 @@ struct mem_cgroup *try_get_mem_cgroup_fr
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
-		mem = pc->mem_cgroup;
+		mem = id_to_memcg(pc->mem_cgroup, true);
 		if (mem && !css_tryget(&mem->css))
 			mem = NULL;
 	} else if (PageSwapCache(page)) {
 		ent.val = page_private(page);
 		id = lookup_swap_cgroup(ent);
 		rcu_read_lock();
-		mem = id_to_memcg(id);
+		mem = id_to_memcg(id, false);
 		if (mem && !css_tryget(&mem->css))
 			mem = NULL;
 		rcu_read_unlock();
@@ -1924,7 +1938,7 @@ static void __mem_cgroup_commit_charge(s
 		return;
 	}
 
-	pc->mem_cgroup = mem;
+	pc->mem_cgroup = css_id(&mem->css);
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
 	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -1982,7 +1996,7 @@ static void __mem_cgroup_move_account(st
 	VM_BUG_ON(PageLRU(pc->page));
 	VM_BUG_ON(!PageCgroupLocked(pc));
 	VM_BUG_ON(!PageCgroupUsed(pc));
-	VM_BUG_ON(pc->mem_cgroup != from);
+	VM_BUG_ON(id_to_memcg(pc->mem_cgroup, true) != from);
 
 	if (PageCgroupFileMapped(pc)) {
 		/* Update mapped_file data for mem_cgroup */
@@ -1997,7 +2011,7 @@ static void __mem_cgroup_move_account(st
 		mem_cgroup_cancel_charge(from);
 
 	/* caller should have done css_get */
-	pc->mem_cgroup = to;
+	pc->mem_cgroup = css_id(&to->css);
 	mem_cgroup_charge_statistics(to, pc, true);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
@@ -2017,7 +2031,7 @@ static int mem_cgroup_move_account(struc
 {
 	int ret = -EINVAL;
 	lock_page_cgroup(pc);
-	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+	if (PageCgroupUsed(pc) && id_to_memcg(pc->mem_cgroup, true) == from) {
 		__mem_cgroup_move_account(pc, from, to, uncharge);
 		ret = 0;
 	}
@@ -2247,7 +2261,7 @@ __mem_cgroup_commit_charge_swapin(struct
 
 		id = swap_cgroup_record(ent, 0);
 		rcu_read_lock();
-		memcg = id_to_memcg(id);
+		memcg = id_to_memcg(id, false);
 		if (memcg) {
 			/*
 			 * This recorded memcg can be obsolete one. So, avoid
@@ -2357,7 +2371,7 @@ __mem_cgroup_uncharge_common(struct page
 
 	lock_page_cgroup(pc);
 
-	mem = pc->mem_cgroup;
+	mem = id_to_memcg(pc->mem_cgroup, true);
 
 	if (!PageCgroupUsed(pc))
 		goto unlock_out;
@@ -2512,7 +2526,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
 
 	id = swap_cgroup_record(ent, 0);
 	rcu_read_lock();
-	memcg = id_to_memcg(id);
+	memcg = id_to_memcg(id, false);
 	if (memcg) {
 		/*
 		 * We uncharge this because swap is freed.
@@ -2603,7 +2617,7 @@ int mem_cgroup_prepare_migration(struct 
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
-		mem = pc->mem_cgroup;
+		mem = id_to_memcg(pc->mem_cgroup, true);
 		css_get(&mem->css);
 		/*
 		 * At migrating an anonymous page, its mapcount goes down
@@ -4436,7 +4450,8 @@ static int is_target_pte_for_mc(struct v
 		 * mem_cgroup_move_account() checks the pc is valid or not under
 		 * the lock.
 		 */
-		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+		if (PageCgroupUsed(pc) &&
+			id_to_memcg(pc->mem_cgroup, true) == mc.from) {
 			ret = MC_TARGET_PAGE;
 			if (target)
 				target->page = page;


* [PATCH 4/5] memcg: light weight file mapped update lock system
  2010-09-01  6:39 [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
                   ` (2 preceding siblings ...)
  2010-09-01  6:43 ` [PATCH 3/5] memcg: use ID in page cgroup KAMEZAWA Hiroyuki
@ 2010-09-01  6:45 ` KAMEZAWA Hiroyuki
  2010-09-01  6:45 ` [PATCH 5/5] memcg: generic file status accounting KAMEZAWA Hiroyuki
  2010-09-09  4:44 ` [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
  5 siblings, 0 replies; 10+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01  6:45 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, nishimura, akpm, balbir, linux-kernel, gthelen,
	Munehiro Ikeda, menage, lizf

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

When accounting file events per memory cgroup, we need to find the memory
cgroup via page_cgroup->mem_cgroup. Now, we use lock_page_cgroup() for that.

But considering the contexts in which the page_cgroup of a file page is
accessed, we can use an alternative light-weight mutual exclusion in most
cases.

When handling file caches, the only race we have to take care of is a
"moving" account, IOW, overwriting page_cgroup->mem_cgroup.
(See the comment in the patch.)

Unlike charge/uncharge, a "move" happens infrequently: only at rmdir() and at
task moving (with special settings). This patch adds a race checker for
file-cache-status accounting vs. account moving: a new per-cpu, per-memcg
counter, MEM_CGROUP_ON_MOVE. The routine for an account move
  1. increments it before starting the move,
  2. calls synchronize_rcu(),
  3. decrements it after the move ends.
With this, the file-status-counting routine can check whether it needs to
call lock_page_cgroup(); in most cases it doesn't, as the sketch below shows.
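
A condensed sketch of the resulting fast path, taken from the
mem_cgroup_update_file_mapped() hunk below:

	bool need_lock = false;

	rcu_read_lock();
	mem = id_to_memcg(pc->mem_cgroup, false);
	if (mem && mem_cgroup_is_moved(mem)) {
		/* slow path: an account move may be in flight */
		lock_page_cgroup(pc);
		need_lock = true;
		mem = id_to_memcg(pc->mem_cgroup, true);
	}
	/* ... update the per-cpu file statistics here ... */
	if (need_lock)
		unlock_page_cgroup(pc);
	rcu_read_unlock();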

Changelog: 20100901
 - changed id_to_memcg(pc, true) to id_to_memcg(pc, false)
   in update_file_mapped()
 - updated comments on the locking rules of update_file_mapped()
Changelog: 20100825
 - added a comment about mc.lock
 - fixed bad lock.
Changelog: 20100804
 - added a comment for possible optimization hint.
Changelog: 20100730
 - some cleanup.
Changelog: 20100729
 - replaced __this_cpu_xxx() with this_cpu_xxx()
   (because we no longer hold a spinlock)
 - added VM_BUG_ON().

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 mm/memcontrol.c |   98 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 85 insertions(+), 13 deletions(-)

Index: mmotm-0827/mm/memcontrol.c
===================================================================
--- mmotm-0827.orig/mm/memcontrol.c
+++ mmotm-0827/mm/memcontrol.c
@@ -90,6 +90,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
+	MEM_CGROUP_ON_MOVE,   /* A check for locking move account/status */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -1092,7 +1093,52 @@ static unsigned int get_swappiness(struc
 	return swappiness;
 }
 
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+	int cpu;
+	/*
+	 * reuse mc.lock.
+	 */
+	spin_lock(&mc.lock);
+	/* TODO: Can we optimize this by for_each_online_cpu() ? */
+	for_each_possible_cpu(cpu)
+		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+	spin_unlock(&mc.lock);
+
+	synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+	int cpu;
+
+	if (!mem)
+		return;
+	/* for fast checking in mem_cgroup_update_file_stat() etc..*/
+	spin_lock(&mc.lock);
+	for_each_possible_cpu(cpu)
+		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+	spin_unlock(&mc.lock);
+}
+
+/*
+ * mem_cgroup_is_moved -- checks whether a cgroup is the mc.from target;
+ *                          used for avoiding races.
+ * mem_cgroup_under_move -- checks whether a cgroup is mc.from or mc.to, or
+ *			    under their hierarchy; used for waiting at
+ *			    memory pressure.
+ * Result of is_moved can be trusted until the end of rcu_read_unlock().
+ * The caller must do
+ *	rcu_read_lock();
+ *	result = mem_cgroup_is_moved();
+ *	.....make use of result here....
+ *	rcu_read_unlock();
+ */
+static bool mem_cgroup_is_moved(struct mem_cgroup *mem)
+{
+	VM_BUG_ON(!rcu_read_lock_held());
+	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
 
 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 {
@@ -1503,34 +1549,56 @@ bool mem_cgroup_handle_oom(struct mem_cg
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
+ *
+ * Notes: How to consider race conditions.
+ *
+ * A lock against "charge" is not required: all operations happen after the
+ * page is attached to address_space->mapping, so the charge must be finished.
+ *
+ * The lock required against "move" is taken if necessary, and the account-move
+ * code does a proper move of the statistics data.
+ *
+ * A lock against "uncharge" is not required. Because uncharge() doesn't clear
+ * pc->mem_cgroup, we can catch the proper memcg even under race with truncation.
+ * From another viewpoint: while we call this, we hold a page_count reference,
+ * so the page will never be reused for another purpose and page_cgroup->mem_cgroup
+ * will contain the last data it had.
  */
+
 void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
+	bool need_lock = false;
 
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
-
-	lock_page_cgroup(pc);
-	mem = id_to_memcg(pc->mem_cgroup, true);
-	if (!mem || !PageCgroupUsed(pc))
+	rcu_read_lock();
+	mem = id_to_memcg(pc->mem_cgroup, false);
+	if (likely(mem)) {
+		if (mem_cgroup_is_moved(mem)) {
+			/* need to serialize with move_account */
+			lock_page_cgroup(pc);
+			need_lock = true;
+			mem = id_to_memcg(pc->mem_cgroup, true);
+			if (unlikely(!mem))
+				goto done;
+		}
+	}
+	if (unlikely(!PageCgroupUsed(pc)))
 		goto done;
-
-	/*
-	 * Preemption is already disabled. We can use __this_cpu_xxx
-	 */
 	if (val > 0) {
-		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		SetPageCgroupFileMapped(pc);
 	} else {
-		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		ClearPageCgroupFileMapped(pc);
 	}
-
 done:
-	unlock_page_cgroup(pc);
+	if (need_lock)
+		unlock_page_cgroup(pc);
+	rcu_read_unlock();
 }
 
 /*
@@ -3070,6 +3138,7 @@ move_account:
 		lru_add_drain_all();
 		drain_all_stock_sync();
 		ret = 0;
+		mem_cgroup_start_move(mem);
 		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 				enum lru_list l;
@@ -3083,6 +3152,7 @@ move_account:
 			if (ret)
 				break;
 		}
+		mem_cgroup_end_move(mem);
 		memcg_oom_recover(mem);
 		/* it seems parent cgroup doesn't have enough mem */
 		if (ret == -ENOMEM)
@@ -4560,6 +4630,7 @@ static void mem_cgroup_clear_mc(void)
 	mc.to = NULL;
 	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mem_cgroup_end_move(from);
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -4590,6 +4661,7 @@ static int mem_cgroup_can_attach(struct 
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;


* [PATCH 5/5] memcg: generic file status accounting
  2010-09-01  6:39 [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
                   ` (3 preceding siblings ...)
  2010-09-01  6:45 ` [PATCH 4/5] memcg: light weight file mapped update lock system KAMEZAWA Hiroyuki
@ 2010-09-01  6:45 ` KAMEZAWA Hiroyuki
  2010-09-09  4:44 ` [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
  5 siblings, 0 replies; 10+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-01  6:45 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, nishimura, akpm, balbir, linux-kernel, gthelen,
	Munehiro Ikeda, menage, lizf

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Prepare for adding new status around file caches (dirty, writeback, etc.)
by using a unified macro and more generic names. All counters will follow
the same update rule.
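
With the unified rule, a single helper updates both the per-cpu counter and
the matching page_cgroup flag bit. A sketch of the FILE_MAPPED case under
this patch (pc and mem as in mem_cgroup_update_file_stat() below):

	/* the stat index and the flag bit share the same FSTAT offset */
	this_cpu_inc(mem->stat->count[MEM_CGROUP_FSTAT_FILE_MAPPED]);
	set_bit(fflag_idx(MEMCG_FSTAT_IDX(MEM_CGROUP_FSTAT_FILE_MAPPED)),
		&pc->flags);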

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/memcontrol.h  |   24 ++++++++++++++++++++
 include/linux/page_cgroup.h |   20 ++++++++++++-----
 mm/memcontrol.c             |   51 ++++++++++++++++++--------------------------
 3 files changed, 59 insertions(+), 36 deletions(-)

Index: mmotm-0827/include/linux/memcontrol.h
===================================================================
--- mmotm-0827.orig/include/linux/memcontrol.h
+++ mmotm-0827/include/linux/memcontrol.h
@@ -25,6 +25,30 @@ struct page_cgroup;
 struct page;
 struct mm_struct;
 
+/*
+ * Per-cpu Statistics for memory cgroup.
+ */
+enum mem_cgroup_stat_index {
+	/*
+	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+	 */
+	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
+	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
+	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
+	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
+	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
+	MEM_CGROUP_ON_MOVE,   /* A check for locking move account/status */
+	/* When you add a new file-stat member, please update page_cgroup.h */
+	MEM_CGROUP_FSTAT_BASE,
+	MEM_CGROUP_FSTAT_FILE_MAPPED = MEM_CGROUP_FSTAT_BASE,
+	MEM_CGROUP_FSTAT_END,
+	MEM_CGROUP_STAT_NSTATS = MEM_CGROUP_FSTAT_END,
+};
+
+#define MEMCG_FSTAT_IDX(idx)	((idx) - MEM_CGROUP_FSTAT_BASE)
+#define NR_FILE_FLAGS_MEMCG ((MEM_CGROUP_FSTAT_END - MEM_CGROUP_FSTAT_BASE))
+
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
Index: mmotm-0827/mm/memcontrol.c
===================================================================
--- mmotm-0827.orig/mm/memcontrol.c
+++ mmotm-0827/mm/memcontrol.c
@@ -76,24 +76,6 @@ static int really_do_swap_account __init
 #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
 #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
 
-/*
- * Statistics for memory cgroup.
- */
-enum mem_cgroup_stat_index {
-	/*
-	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
-	 */
-	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
-	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
-	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
-	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
-	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
-	MEM_CGROUP_ON_MOVE,   /* A check for locking move account/status */
-
-	MEM_CGROUP_STAT_NSTATS,
-};
 
 struct mem_cgroup_stat_cpu {
 	s64 count[MEM_CGROUP_STAT_NSTATS];
@@ -1565,7 +1547,7 @@ bool mem_cgroup_handle_oom(struct mem_cg
  * will contain the last data it had.
  */
 
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
@@ -1589,11 +1571,11 @@ void mem_cgroup_update_file_mapped(struc
 	if (unlikely(!PageCgroupUsed(pc)))
 		goto done;
 	if (val > 0) {
-		this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		SetPageCgroupFileMapped(pc);
+		this_cpu_inc(mem->stat->count[idx]);
+		set_bit(fflag_idx(MEMCG_FSTAT_IDX(idx)), &pc->flags);
 	} else {
-		this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		ClearPageCgroupFileMapped(pc);
+		this_cpu_dec(mem->stat->count[idx]);
+		clear_bit(fflag_idx(MEMCG_FSTAT_IDX(idx)), &pc->flags);
 	}
 done:
 	if (need_lock)
@@ -1601,6 +1583,12 @@ done:
 	rcu_read_unlock();
 }
 
+void mem_cgroup_update_file_mapped(struct page *page, int val)
+{
+	return mem_cgroup_update_file_stat(page,
+		MEM_CGROUP_FSTAT_FILE_MAPPED, val);
+}
+
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
  * TODO: maybe necessary to use big numbers in big irons.
@@ -2060,17 +2048,20 @@ static void __mem_cgroup_commit_charge(s
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
+	int i;
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
 	VM_BUG_ON(!PageCgroupLocked(pc));
 	VM_BUG_ON(!PageCgroupUsed(pc));
 	VM_BUG_ON(id_to_memcg(pc->mem_cgroup, true) != from);
 
-	if (PageCgroupFileMapped(pc)) {
-		/* Update mapped_file data for mem_cgroup */
+	for (i = MEM_CGROUP_FSTAT_BASE; i < MEM_CGROUP_FSTAT_END; ++i) {
+		if (!test_bit(fflag_idx(MEMCG_FSTAT_IDX(i)), &pc->flags))
+			continue;
+		/* Update file statistics for mem_cgroup */
 		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+		__this_cpu_dec(from->stat->count[i]);
+		__this_cpu_inc(to->stat->count[i]);
 		preempt_enable();
 	}
 	mem_cgroup_charge_statistics(from, pc, false);
@@ -3501,7 +3492,7 @@ static int mem_cgroup_get_local_stat(str
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
+	val = mem_cgroup_read_stat(mem, MEM_CGROUP_FSTAT_FILE_MAPPED);
 	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
Index: mmotm-0827/include/linux/page_cgroup.h
===================================================================
--- mmotm-0827.orig/include/linux/page_cgroup.h
+++ mmotm-0827/include/linux/page_cgroup.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 #include <linux/bit_spinlock.h>
+#include <linux/memcontrol.h> /* for flags */
 /*
  * Page Cgroup can be considered as an extended mem_map.
  * A page_cgroup page is associated with every page descriptor. The
@@ -43,10 +44,22 @@ enum {
 	PCG_CACHE, /* charged as cache */
 	PCG_USED, /* this object is in use. */
 	PCG_ACCT_LRU, /* page has been accounted for */
-	PCG_FILE_MAPPED, /* page is accounted as "mapped" */
 	PCG_MIGRATION, /* under page migration */
+	PCG_FILE_FLAGS_MEMCG, /* see memcontrol.h */
+	PCG_FILE_FLAGS_MEMCG_END
+		= PCG_FILE_FLAGS_MEMCG + NR_FILE_FLAGS_MEMCG - 1,
 };
 
+/*
+ * file-stat flags are defined according to memcg's stat information.
+ * Here we just define a helper for indexing.
+ */
+static inline int fflag_idx(int idx)
+{
+	VM_BUG_ON((idx) >= NR_FILE_FLAGS_MEMCG);
+	return (idx) + PCG_FILE_FLAGS_MEMCG;
+}
+
 #define TESTPCGFLAG(uname, lname)			\
 static inline int PageCgroup##uname(struct page_cgroup *pc)	\
 	{ return test_bit(PCG_##lname, &pc->flags); }
@@ -79,11 +92,6 @@ CLEARPCGFLAG(AcctLRU, ACCT_LRU)
 TESTPCGFLAG(AcctLRU, ACCT_LRU)
 TESTCLEARPCGFLAG(AcctLRU, ACCT_LRU)
 
-
-SETPCGFLAG(FileMapped, FILE_MAPPED)
-CLEARPCGFLAG(FileMapped, FILE_MAPPED)
-TESTPCGFLAG(FileMapped, FILE_MAPPED)
-
 SETPCGFLAG(Migration, MIGRATION)
 CLEARPCGFLAG(Migration, MIGRATION)
 TESTPCGFLAG(Migration, MIGRATION)


* Re: [PATCH 2/5] memcg: more use of css ID in memcg.
  2010-09-01  6:42 ` [PATCH 2/5] memcg: more use of css ID in memcg KAMEZAWA Hiroyuki
@ 2010-09-01  7:52   ` Daisuke Nishimura
  0 siblings, 0 replies; 10+ messages in thread
From: Daisuke Nishimura @ 2010-09-01  7:52 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, akpm, balbir, linux-kernel, gthelen, Munehiro Ikeda,
	menage, lizf, Daisuke Nishimura

On Wed, 1 Sep 2010 15:42:59 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
> Now, the memory cgroup has an ID per cgroup and makes use of it for
>  - hierarchy walks,
>  - swap recording.
> 
> This patch makes more use of it. The final purpose is to replace
> page_cgroup->mem_cgroup's pointer with an unsigned short.
> 
> This patch caches memcg pointers in an array indexed by css ID. With this,
> we don't have to call css_lookup(), which requires a radix-tree walk; this
> reduces the cost of looking up a memcg via its ID.
> 
> Changelog: 20100901
>  - added unregister_memcg_id() and did some cleanup.
>  - removed ->valid.
>  - fixed mem_cgroup_num counter handling.
> 
> Changelog: 20100825
>  - applied comments.
> 
> Changelog: 20100811
>  - adjusted onto mmotm-2010-08-11
>  - fixed RCU related parts.
>  - use attach_id() callback.
> 
> Changelog: 20100804
>  - fixed description in init/Kconfig
> 
> Changelog: 20100730
>  - fixed rcu_read_unlock() placement.
> 
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>


* Re: [PATCH 0/5] memcg: towards I/O aware memcg v7.
  2010-09-01  6:39 [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
                   ` (4 preceding siblings ...)
  2010-09-01  6:45 ` [PATCH 5/5] memcg: generic file status accounting KAMEZAWA Hiroyuki
@ 2010-09-09  4:44 ` KAMEZAWA Hiroyuki
  5 siblings, 0 replies; 10+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-09  4:44 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, nishimura, akpm, balbir, linux-kernel, gthelen,
	Munehiro Ikeda, menage, lizf

On Wed, 1 Sep 2010 15:39:51 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> 
> Major changes from v6 are:
>  a) added documentation about CSS ID.
>  b) fixed typos and bugs.
>  c) refreshed some comments.
> 
> based on mmotm-2010-08-27
> 
> Patch brief view:
>  1. changes css ID allocation in kernel/cgroup.c
>  2. use ID-array in memcg.
>  3. record ID to page_cgroup rather than pointer.
>  4. make update_file_mapped an RCU-aware routine instead of a spinlocked one.
>  5. make update_file_mapped a general-purpose function.
> 
> 

It seems it will be better to re-order this series into two series:

 A-part: lockless update of stats.
 B-part: ID management.

(That should make each part easier to ack.) Thank you for all the help.
BTW, what's the problem with the ID patches?

Thanks,
-Kame


* Re: [PATCH 1/5] cgroup: change allocation of css ID placement
  2010-09-01  6:41 ` [PATCH 1/5] cgroup: change allocation of css ID placement KAMEZAWA Hiroyuki
@ 2010-09-09 16:32   ` Greg Thelen
  2010-09-09 23:46     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 10+ messages in thread
From: Greg Thelen @ 2010-09-09 16:32 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm, nishimura, akpm, balbir, linux-kernel, Munehiro Ikeda,
	menage, lizf

On Tue, Aug 31, 2010 at 11:41 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, a css's ID is allocated after ->create() is called. But to make use of
> the ID in ->create(), it must be available before ->create().
>
> Put another way, since the ID is tightly coupled with the "css", it should be
> allocated when the "css" is allocated. This patch moves alloc_css_id() into
> the css allocation routine. Currently only two subsystems, memory and blkio,
> use IDs (to support complicated hierarchy walks).
>
> The ID will be used in the memory cgroup's ->create() in a later patch.
>
> This patch also adds css ID documentation, which was previously missing.
>
> Note:
> If anyone changes the rules of css allocation, the ID allocation must be
> changed as well.
>
> Changelog: 2010/09/01
>  - modified cgroups.txt
>
> Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  Documentation/cgroups/cgroups.txt |   48 ++++++++++++++++++++++++++++++++++++
>  block/blk-cgroup.c                |    9 ++++++
>  include/linux/cgroup.h            |   16 ++++++------
>  kernel/cgroup.c                   |   50 +++++++++++---------------------------
>  mm/memcontrol.c                   |    5 +++
>  5 files changed, 86 insertions(+), 42 deletions(-)
>
> Index: mmotm-0827/kernel/cgroup.c
> ===================================================================
> --- mmotm-0827.orig/kernel/cgroup.c
> +++ mmotm-0827/kernel/cgroup.c
> @@ -289,9 +289,6 @@ struct cg_cgroup_link {
>  static struct css_set init_css_set;
>  static struct cg_cgroup_link init_css_set_link;
>
> -static int cgroup_init_idr(struct cgroup_subsys *ss,
> -                          struct cgroup_subsys_state *css);
> -
>  /* css_set_lock protects the list of css_set objects, and the
>  * chain of tasks off each css_set.  Nests outside task->alloc_lock
>  * due to cgroup_iter_start() */
> @@ -770,9 +767,6 @@ static struct backing_dev_info cgroup_ba
>        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
>  };
>
> -static int alloc_css_id(struct cgroup_subsys *ss,
> -                       struct cgroup *parent, struct cgroup *child);
> -
>  static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
>  {
>        struct inode *inode = new_inode(sb);
> @@ -3258,7 +3252,8 @@ static void init_cgroup_css(struct cgrou
>        css->cgroup = cgrp;
>        atomic_set(&css->refcnt, 1);
>        css->flags = 0;
> -       css->id = NULL;
> +       if (!ss->use_id)
> +               css->id = NULL;
>        if (cgrp == dummytop)
>                set_bit(CSS_ROOT, &css->flags);
>        BUG_ON(cgrp->subsys[ss->subsys_id]);
> @@ -3343,12 +3338,6 @@ static long cgroup_create(struct cgroup
>                        goto err_destroy;
>                }
>                init_cgroup_css(css, ss, cgrp);
> -               if (ss->use_id) {
> -                       err = alloc_css_id(ss, parent, cgrp);
> -                       if (err)
> -                               goto err_destroy;
> -               }
> -               /* At error, ->destroy() callback has to free assigned ID. */
>        }
>
>        cgroup_lock_hierarchy(root);
> @@ -3710,17 +3699,6 @@ int __init_or_module cgroup_load_subsys(
>
>        /* our new subsystem will be attached to the dummy hierarchy. */
>        init_cgroup_css(css, ss, dummytop);
> -       /* init_idr must be after init_cgroup_css because it sets css->id. */
> -       if (ss->use_id) {
> -               int ret = cgroup_init_idr(ss, css);
> -               if (ret) {
> -                       dummytop->subsys[ss->subsys_id] = NULL;
> -                       ss->destroy(ss, dummytop);
> -                       subsys[i] = NULL;
> -                       mutex_unlock(&cgroup_mutex);
> -                       return ret;
> -               }
> -       }
>
>        /*
>         * Now we need to entangle the css into the existing css_sets. unlike
> @@ -3889,8 +3867,6 @@ int __init cgroup_init(void)
>                struct cgroup_subsys *ss = subsys[i];
>                if (!ss->early_init)
>                        cgroup_init_subsys(ss);
> -               if (ss->use_id)
> -                       cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
>        }
>
>        /* Add init_css_set to the hash table */
> @@ -4604,8 +4580,8 @@ err_out:
>
>  }
>
> -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
> -                                           struct cgroup_subsys_state *rootcss)
> +static int cgroup_init_idr(struct cgroup_subsys *ss,
> +                           struct cgroup_subsys_state *rootcss)
>  {
>        struct css_id *newid;
>
> @@ -4617,21 +4593,25 @@ static int __init_or_module cgroup_init_
>                return PTR_ERR(newid);
>
>        newid->stack[0] = newid->id;
> -       newid->css = rootcss;
> -       rootcss->id = newid;
> +       rcu_assign_pointer(newid->css, rootcss);
> +       rcu_assign_pointer(rootcss->id, newid);
>        return 0;
>  }
>
> -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
> -                       struct cgroup *child)
> +int alloc_css_id(struct cgroup_subsys *ss,
> +       struct cgroup *cgrp, struct cgroup_subsys_state *css)
Must also add EXPORT_SYMBOL_GPL(alloc_css_id) to support CONFIG_BLK_CGROUP=m.
>  {
>        int subsys_id, i, depth = 0;
> -       struct cgroup_subsys_state *parent_css, *child_css;
> +       struct cgroup_subsys_state *parent_css;
> +       struct cgroup *parent;
>        struct css_id *child_id, *parent_id;
>
> +       if (cgrp == dummytop)
> +               return cgroup_init_idr(ss, css);
> +
> +       parent = cgrp->parent;
>        subsys_id = ss->subsys_id;
>        parent_css = parent->subsys[subsys_id];
> -       child_css = child->subsys[subsys_id];
>        parent_id = parent_css->id;
>        depth = parent_id->depth + 1;
>
> @@ -4646,7 +4626,7 @@ static int alloc_css_id(struct cgroup_su
>         * child_id->css pointer will be set after this cgroup is available
>         * see cgroup_populate_dir()
>         */
> -       rcu_assign_pointer(child_css->id, child_id);
> +       rcu_assign_pointer(css->id, child_id);
>
>        return 0;
>  }
> Index: mmotm-0827/include/linux/cgroup.h
> ===================================================================
> --- mmotm-0827.orig/include/linux/cgroup.h
> +++ mmotm-0827/include/linux/cgroup.h
> @@ -588,9 +588,11 @@ static inline int cgroup_attach_task_cur
>  /*
>  * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
>  * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
> - * CSS ID is assigned at cgroup allocation (create) automatically
> - * and removed when subsys calls free_css_id() function. This is because
> - * the lifetime of cgroup_subsys_state is subsys's matter.
> + * CSS ID must be assigned by subsys itself at cgroup creation and deleted
> + * when subsys calls free_css_id() function. This is because the life time of
To be consistent with document: s/life time/lifetime/
> + * of cgroup_subsys_state is subsys's matter.
> + *
> + * ID->css look up is available after cgroup's directory is populated.
>  *
>  * Looking up and scanning function should be called under rcu_read_lock().
>  * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls.
> @@ -598,10 +600,10 @@ static inline int cgroup_attach_task_cur
>  * destroyed". The caller should check css and cgroup's status.
>  */
>
> -/*
> - * Typically Called at ->destroy(), or somewhere the subsys frees
> - * cgroup_subsys_state.
> - */
> +/* Should be called in ->create() by subsys itself */
> +int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *newgr,
> +               struct cgroup_subsys_state *css);
> +/* Typically Called at ->destroy(), or somewhere the subsys frees css */
s/Called/called/
>  void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
>
>  /* Find a cgroup_subsys_state which has given ID */
> Index: mmotm-0827/mm/memcontrol.c
> ===================================================================
> --- mmotm-0827.orig/mm/memcontrol.c
> +++ mmotm-0827/mm/memcontrol.c
> @@ -4141,6 +4141,11 @@ mem_cgroup_create(struct cgroup_subsys *
>                if (alloc_mem_cgroup_per_zone_info(mem, node))
>                        goto free_out;
>
> +       error = alloc_css_id(ss, cont, &mem->css);
> +       if (error)
> +               goto free_out;
> +       /* Here, css_id(&mem->css) works. but css_lookup(id)->mem doesn't */
> +
>        /* root ? */
>        if (cont->parent == NULL) {
>                int cpu;
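
Worth spelling out: with this change the subsys owns both ends of the ID
lifetime, so the allocation above needs a matching free_css_id() on the
teardown path. A sketch of that pairing, assuming memcg's existing free
path (mem_cgroup_free_id() is a hypothetical helper name):

	/* Drop the css ID before the css itself is freed. */
	static void mem_cgroup_free_id(struct mem_cgroup *mem)
	{
		free_css_id(&mem_cgroup_subsys, &mem->css);
		/* ... then free per-zone info and the mem_cgroup ... */
	}
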
> Index: mmotm-0827/block/blk-cgroup.c
> ===================================================================
> --- mmotm-0827.orig/block/blk-cgroup.c
> +++ mmotm-0827/block/blk-cgroup.c
> @@ -958,9 +958,13 @@ blkiocg_create(struct cgroup_subsys *sub
>  {
>        struct blkio_cgroup *blkcg;
>        struct cgroup *parent = cgroup->parent;
> +       int ret;
>
>        if (!parent) {
>                blkcg = &blkio_root_cgroup;
> +               ret = alloc_css_id(subsys, cgroup, &blkcg->css);
> +               if (ret)
> +                       return ERR_PTR(ret);
>                goto done;
>        }
>
> @@ -971,6 +975,11 @@ blkiocg_create(struct cgroup_subsys *sub
>        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>        if (!blkcg)
>                return ERR_PTR(-ENOMEM);
> +       ret = alloc_css_id(subsys, cgroup, &blkcg->css);
> +       if (ret) {
> +               kfree(blkcg);
> +               return ERR_PTR(ret);
> +       }
>
>        blkcg->weight = BLKIO_WEIGHT_DEFAULT;
>  done:
> Index: mmotm-0827/Documentation/cgroups/cgroups.txt
> ===================================================================
> --- mmotm-0827.orig/Documentation/cgroups/cgroups.txt
> +++ mmotm-0827/Documentation/cgroups/cgroups.txt
> @@ -621,6 +621,54 @@ and root cgroup. Currently this will onl
>  the default hierarchy (which never has sub-cgroups) and a hierarchy
>  that is being created/destroyed (and hence has no sub-cgroups).
>
> +3.4 cgroup subsys state IDs.
> +------------
> +When subsystem sets use_id == true, an ID per [cgroup, subsys] is added
> +and it will be tied to cgroup_subsys_state object.
> +
> +When use_id==true can use following interfaces. But please note that
> +allocation/free an ID is subsystem's job because cgroup_subsys_state
> +object's lifetime is subsystem's matter.
> +
> +unsigned short css_id(struct cgroup_subsys_state *css)
> +
> +Returns ID of cgroup_subsys_state
Please add trailing '.' (period character).

> +
> +unsigend short css_depth(struct cgroup_subsys_state *css)
Typo: s/unsigend/unsigned/
> +
> +Returns the level which "css" is existing under hierarchy tree.
> +The root cgroup's depth 0, its children are 1, children's children are
> +2....
> +
> +int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *newgr,
> +                struct cgroup_subsys_state *css);
> +
> +Attach a new ID to given css under subsystem ([ss, cgroup]).
> +Should be called in ->create() callback.
> +
> +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
> +
> +Free ID attached to "css" under subsystem. Should be called before
> +"css" is freed.
> +
> +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
> +
> +Look up cgroup_subsys_state via ID. Should be called under rcu_read_lock().
> +
> +struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
> +                struct cgroup_subsys_state *root, int *foundid);
> +
> +Returns ID which is under "root" i.e. under sub-directory of "root"
> +cgroup's directory at considering cgroup hierarchy. The order of IDs
> +returned by this function is not sorted. Please be careful.
> +
> +bool css_is_ancestor(struct cgroup_subsys_state *cg,
> +                     const struct cgroup_subsys_state *root);

To match the code: s/cg/child/

> +
> +Returns true if "root" and "cs" is under the same hierarchy and
> +"root" can be found when you see all ->parent from "cs" until
This may be clearer: s/see all/walk all/

> +the root cgroup.
As above: s/cs/child/

> +
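
To make the scanning rule above concrete: a minimal sketch of a full
descendant walk, assuming only the interfaces documented here plus
css_tryget()/css_put() (scan_descendants() and visit() are hypothetical):

	/* Visit every descendant css of "root", in unsorted ID order. */
	static void scan_descendants(struct cgroup_subsys *ss,
				     struct cgroup_subsys_state *root,
				     void (*visit)(struct cgroup_subsys_state *))
	{
		struct cgroup_subsys_state *css;
		int id = 1, found;

		rcu_read_lock();
		while ((css = css_get_next(ss, id, root, &found)) != NULL) {
			if (css_tryget(css)) {	/* skip dying cgroups */
				visit(css);
				css_put(css);
			}
			id = found + 1;
		}
		rcu_read_unlock();
	}
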
>  4. Questions
>  ============
>
>
>


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/5] cgroup: change allocation of css ID placement
  2010-09-09 16:32   ` Greg Thelen
@ 2010-09-09 23:46     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 10+ messages in thread
From: KAMEZAWA Hiroyuki @ 2010-09-09 23:46 UTC (permalink / raw)
  To: Greg Thelen
  Cc: linux-mm, nishimura, akpm, balbir, linux-kernel, Munehiro Ikeda,
	menage, lizf


Thank you for the review.

On Thu, 9 Sep 2010 09:32:32 -0700
Greg Thelen <gthelen@google.com> wrote:

> On Tue, Aug 31, 2010 at 11:41 PM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> >
> > Now, css'id is allocated after ->create() is called. But to make use of ID
> > in ->create(), it should be available before ->create().
> >
> > [snip: patch description, changelog, diffstat, and the kernel/cgroup.c
> > hunks already quoted in full above]
> >
> > -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
> > -                       struct cgroup *child)
> > +int alloc_css_id(struct cgroup_subsys *ss,
> > +       struct cgroup *cgrp, struct cgroup_subsys_state *css)
> Must also add EXPORT_SYMBOL_GPL(alloc_css_id) to support CONFIG_BLK_CGROUP=m.

Ah, yes. 

> > [snip: the rest of alloc_css_id() and the include/linux/cgroup.h hunk
> > leading up to the CSS ID comment]
> >
> > - * CSS ID is assigned at cgroup allocation (create) automatically
> > - * and removed when subsys calls free_css_id() function. This is because
> > - * the lifetime of cgroup_subsys_state is subsys's matter.
> > + * CSS ID must be assigned by subsys itself at cgroup creation and deleted
> > + * when subsys calls free_css_id() function. This is because the life time of
> To be consistent with the document: s/life time/lifetime/
> > + * of cgroup_subsys_state is subsys's matter.
> >
> > [snip]
> >
> > +/* Should be called in ->create() by subsys itself */
> > +int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *newgr,
> > +               struct cgroup_subsys_state *css);
> > +/* Typically Called at ->destroy(), or somewhere the subsys frees css */
> s/Called/called/

will fix.


> > [snip: mm/memcontrol.c and block/blk-cgroup.c hunks, and the start of
> > the cgroups.txt documentation, all quoted in full above]
> >
> > +unsigned short css_id(struct cgroup_subsys_state *css)
> > +
> > +Returns ID of cgroup_subsys_state
> Please add trailing '.' (period character).
> 

will fix.

> > +
> > +unsigend short css_depth(struct cgroup_subsys_state *css)
> Typo: s/unsigend/unsigned/
> > +
> > [snip: css_depth, alloc_css_id, free_css_id, css_lookup, and
> > css_get_next descriptions, quoted in full above]
> > +
> > +bool css_is_ancestor(struct cgroup_subsys_state *cg,
> > +                     const struct cgroup_subsys_state *root);
> 
> To match the code: s/cg/child/
> 
will fix.

> > +
> > +Returns true if "root" and "cs" is under the same hierarchy and
> > +"root" can be found when you see all ->parent from "cs" until
> This may be clearer: s/see all/walk all/
> 
> > +the root cgroup.
> As above: s/cs/child/
> 

will fix.

I'll reorder the patches and post the file-stat ones first.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-09-09 23:51 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-01  6:39 [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki
2010-09-01  6:41 ` [PATCH 1/5] cgroup: change allocation of css ID placement KAMEZAWA Hiroyuki
2010-09-09 16:32   ` Greg Thelen
2010-09-09 23:46     ` KAMEZAWA Hiroyuki
2010-09-01  6:42 ` [PATCH 2/5] memcg: more use of css ID in memcg KAMEZAWA Hiroyuki
2010-09-01  7:52   ` Daisuke Nishimura
2010-09-01  6:43 ` [PATCH 3/5] memcg: use ID in page cgroup KAMEZAWA Hiroyuki
2010-09-01  6:45 ` [PATCH 4/5] memcg: light weight file mapped update lock system KAMEZAWA Hiroyuki
2010-09-01  6:45 ` [PATCH 5/5] memcg: generic file status accounting KAMEZAWA Hiroyuki
2010-09-09  4:44 ` [PATCH 0/5] memcg: towards I/O aware memcg v7 KAMEZAWA Hiroyuki

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox