* [-mm] Add an owner to the mm_struct (v2)
@ 2008-03-28 8:23 Balbir Singh
2008-03-28 9:41 ` Jiri Slaby
` (3 more replies)
0 siblings, 4 replies; 23+ messages in thread
From: Balbir Singh @ 2008-03-28 8:23 UTC (permalink / raw)
To: Paul Menage, Pavel Emelianov
Cc: Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi, lizf, linux-kernel,
taka, linux-mm, David Rientjes, Balbir Singh, Andrew Morton,
KAMEZAWA Hiroyuki
This patch removes the mem_cgroup member from mm_struct and instead adds
an owner. This approach was suggested by Paul Menage. The advantage of
this approach is that, once the mm->owner is known, using the subsystem
id, the cgroup can be determined. It also allows several control groups
that are virtually grouped by mm_struct to exist independently of the memory
controller, i.e., without adding a mem_cgroup-style pointer for each controller
to mm_struct.
The code initially assigns mm->owner to the task; then, once the thread
group leader is identified at the end of copy_process, mm->owner is changed
to the thread group leader of the task.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller now depends on this config option.
NOTE: This patch was developed on top of 2.6.25-rc5-mm1 and is applied on top
of the memory-controller-move-to-own-slab patch (which is already present
in the Andrew's patchset).
These patches have been tested on a powerpc 64 bit box and on x86_64 box with
several microbenchmarks and some simple memory controller testing.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
include/linux/memcontrol.h | 14 ++++++++-
include/linux/mm_types.h | 6 ++--
include/linux/sched.h | 19 ++++++++++++
init/Kconfig | 13 ++++++++
kernel/exit.c | 66 +++++++++++++++++++++++++++++++++++++++++++++
kernel/fork.c | 26 +++++++++++++++++
mm/memcontrol.c | 19 +++++++-----
7 files changed, 151 insertions(+), 12 deletions(-)
diff -puN include/linux/mm_types.h~memory-controller-add-mm-owner include/linux/mm_types.h
--- linux-2.6.25-rc5/include/linux/mm_types.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
+++ linux-2.6.25-rc5-balbir/include/linux/mm_types.h 2008-03-28 12:26:59.000000000 +0530
@@ -227,8 +227,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
- struct mem_cgroup *mem_cgroup;
+#ifdef CONFIG_MM_OWNER
+ spinlock_t owner_lock;
+ struct task_struct *owner; /* The thread group leader that */
+ /* owns the mm_struct. */
#endif
#ifdef CONFIG_PROC_FS
diff -puN kernel/fork.c~memory-controller-add-mm-owner kernel/fork.c
--- linux-2.6.25-rc5/kernel/fork.c~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
+++ linux-2.6.25-rc5-balbir/kernel/fork.c 2008-03-28 12:33:12.000000000 +0530
@@ -359,6 +359,7 @@ static struct mm_struct * mm_init(struct
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
mm_init_cgroup(mm, p);
+ mm_init_owner(mm, p);
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -995,6 +996,27 @@ static void rt_mutex_init_task(struct ta
#endif
}
+#ifdef CONFIG_MM_OWNER
+/* Initialize mm ownership at mm_init() time: p is the creating task. */
+void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+ spin_lock_init(&mm->owner_lock);
+ mm->owner = p;
+}
+
+/*
+ * Called at the end of copy_process() to hand ownership of mm over to
+ * the thread group leader of p.
+ */
+void mm_fork_init_owner(struct task_struct *p)
+{
+ struct mm_struct *mm = get_task_mm(p);
+ if (!mm)
+ return;
+
+ /*
+ * Take owner_lock (the spinlock), not mm->owner (a task_struct
+ * pointer) -- exit.c's mm_update_next_owner() locks the same way.
+ */
+ spin_lock(&mm->owner_lock);
+ if (mm->owner != p)
+ rcu_assign_pointer(mm->owner, p->group_leader);
+ spin_unlock(&mm->owner_lock);
+ mmput(mm);
+}
+#endif /* CONFIG_MM_OWNER */
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1357,6 +1379,10 @@ static struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+
+ if (!(clone_flags & CLONE_VM) && (p != p->group_leader))
+ mm_fork_init_owner(p);
+
return p;
bad_fork_free_pid:
diff -puN include/linux/memcontrol.h~memory-controller-add-mm-owner include/linux/memcontrol.h
--- linux-2.6.25-rc5/include/linux/memcontrol.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
+++ linux-2.6.25-rc5-balbir/include/linux/memcontrol.h 2008-03-28 09:30:47.000000000 +0530
@@ -29,6 +29,7 @@ struct mm_struct;
extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
extern void mm_free_cgroup(struct mm_struct *mm);
+extern void mem_cgroup_fork_init(struct task_struct *p);
#define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0)
@@ -49,7 +50,7 @@ extern void mem_cgroup_out_of_memory(str
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
#define mm_match_cgroup(mm, cgroup) \
- ((cgroup) == rcu_dereference((mm)->mem_cgroup))
+ ((cgroup) == mem_cgroup_from_task((mm)->owner))
extern int mem_cgroup_prepare_migration(struct page *page);
extern void mem_cgroup_end_migration(struct page *page);
@@ -72,6 +73,8 @@ extern long mem_cgroup_calc_reclaim_acti
extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
struct zone *zone, int priority);
+extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+
#else /* CONFIG_CGROUP_MEM_RES_CTLR */
static inline void mm_init_cgroup(struct mm_struct *mm,
struct task_struct *p)
@@ -82,6 +85,10 @@ static inline void mm_free_cgroup(struct
{
}
+static inline void mem_cgroup_fork_init(struct task_struct *p)
+{
+}
+
static inline void page_reset_bad_cgroup(struct page *page)
{
}
@@ -172,6 +179,11 @@ static inline long mem_cgroup_calc_recla
{
return 0;
}
+
+/* !CONFIG_CGROUP_MEM_RES_CTLR stub; must be static inline in a header to
+ * avoid unused-function warnings in every translation unit, matching the
+ * other stubs in this #else branch. */
+static inline void mm_free_fork_cgroup(struct task_struct *p)
+{
+}
+
#endif /* CONFIG_CGROUP_MEM_CONT */
#endif /* _LINUX_MEMCONTROL_H */
diff -puN mm/memcontrol.c~memory-controller-add-mm-owner mm/memcontrol.c
--- linux-2.6.25-rc5/mm/memcontrol.c~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
+++ linux-2.6.25-rc5-balbir/mm/memcontrol.c 2008-03-28 10:15:32.000000000 +0530
@@ -238,7 +238,7 @@ static struct mem_cgroup *mem_cgroup_fro
css);
}
-static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
@@ -250,12 +250,17 @@ void mm_init_cgroup(struct mm_struct *mm
mem = mem_cgroup_from_task(p);
css_get(&mem->css);
- mm->mem_cgroup = mem;
}
void mm_free_cgroup(struct mm_struct *mm)
{
- css_put(&mm->mem_cgroup->css);
+ struct mem_cgroup *mem;
+
+ /*
+ * TODO: Should we assign mm->owner to NULL here?
+ */
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ css_put(&mem->css);
}
static inline int page_cgroup_locked(struct page *page)
@@ -478,6 +483,7 @@ unsigned long mem_cgroup_isolate_pages(u
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
+ BUG_ON(!mem_cont);
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
if (active)
src = &mz->active_list;
@@ -575,13 +581,11 @@ retry:
if (!mm)
mm = &init_mm;
- rcu_read_lock();
- mem = rcu_dereference(mm->mem_cgroup);
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
/*
* For every charge from the cgroup, increment reference count
*/
css_get(&mem->css);
- rcu_read_unlock();
while (res_counter_charge(&mem->res, PAGE_SIZE)) {
if (!(gfp_mask & __GFP_WAIT))
@@ -990,8 +994,8 @@ mem_cgroup_create(struct cgroup_subsys *
if (unlikely((cont->parent) == NULL)) {
mem = &init_mem_cgroup;
- init_mm.mem_cgroup = mem;
page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
+ init_mm.owner = &init_task;
} else
mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
@@ -1072,7 +1076,6 @@ static void mem_cgroup_move_task(struct
goto out;
css_get(&mem->css);
- rcu_assign_pointer(mm->mem_cgroup, mem);
css_put(&old_mem->css);
out:
diff -puN include/linux/sched.h~memory-controller-add-mm-owner include/linux/sched.h
--- linux-2.6.25-rc5/include/linux/sched.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
+++ linux-2.6.25-rc5-balbir/include/linux/sched.h 2008-03-28 10:50:14.000000000 +0530
@@ -2130,6 +2130,25 @@ static inline void migration_init(void)
#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+#ifdef CONFIG_MM_OWNER
+extern void mm_update_next_owner(struct mm_struct *mm, struct task_struct *p);
+extern void mm_fork_init_owner(struct task_struct *p);
+extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
+#else
+static inline void
+mm_update_next_owner(struct mm_struct *mm, struct task_struct *p)
+{
+}
+
+static inline void mm_fork_init_owner(struct task_struct *p)
+{
+}
+
+static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+}
+#endif /* CONFIG_MM_OWNER */
+
#endif /* __KERNEL__ */
#endif
diff -puN kernel/exit.c~memory-controller-add-mm-owner kernel/exit.c
--- linux-2.6.25-rc5/kernel/exit.c~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
+++ linux-2.6.25-rc5-balbir/kernel/exit.c 2008-03-28 12:35:39.000000000 +0530
@@ -579,6 +579,71 @@ void exit_fs(struct task_struct *tsk)
EXPORT_SYMBOL_GPL(exit_fs);
+#ifdef CONFIG_MM_OWNER
+/*
+ * Task p is exiting and it owned mm, so let's find a new owner for mm
+ */
+static inline int
+mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = (mm && (rcu_dereference(mm->owner) == p) &&
+ (atomic_read(&mm->mm_users) > 1));
+ rcu_read_unlock();
+ return ret;
+}
+
+void mm_update_next_owner(struct mm_struct *mm, struct task_struct *p)
+{
+ struct task_struct *c, *g;
+
+ /*
+ * This should not be called for init_task
+ */
+ BUG_ON(p == p->parent);
+
+ if (!mm_need_new_owner(mm, p))
+ return;
+
+ /*
+ * Search in the children
+ */
+ list_for_each_entry(c, &p->children, sibling) {
+ if (c->mm == p->mm)
+ goto assign_new_owner;
+ }
+
+ /*
+ * Search in the siblings
+ */
+ list_for_each_entry(c, &p->parent->children, sibling) {
+ if (c->mm == p->mm)
+ goto assign_new_owner;
+ }
+
+ /*
+ * Search through everything else. We should not get
+ * here often
+ */
+ for_each_process(c) {
+ g = c;
+ do {
+ if (c->mm && (c->mm == p->mm))
+ goto assign_new_owner;
+ } while ((c = next_thread(c)) != g);
+ }
+
+ BUG();
+
+assign_new_owner:
+ spin_lock(&mm->owner_lock);
+ rcu_assign_pointer(mm->owner, c);
+ spin_unlock(&mm->owner_lock);
+}
+#endif /* CONFIG_MM_OWNER */
+
/*
* Turn us into a lazy TLB process if we
* aren't already..
@@ -618,6 +683,7 @@ static void exit_mm(struct task_struct *
/* We don't want this task to be frozen prematurely */
clear_freeze_flag(tsk);
task_unlock(tsk);
+ mm_update_next_owner(mm, current);
mmput(mm);
}
diff -puN init/Kconfig~memory-controller-add-mm-owner init/Kconfig
--- linux-2.6.25-rc5/init/Kconfig~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
+++ linux-2.6.25-rc5-balbir/init/Kconfig 2008-03-28 10:08:07.000000000 +0530
@@ -364,9 +364,20 @@ config RESOURCE_COUNTERS
infrastructure that works with cgroups
depends on CGROUPS
+config MM_OWNER
+ bool "Enable ownership of mm structure"
+ help
+ This option enables mm_structs to have an owner. The advantage
+ of this approach is that it allows for several independent memory
+ based cgroup controllers to co-exist independently without too
+ much space overhead.
+
+ This feature adds fork/exit overhead. So enable this only if
+ you need resource controllers.
+
config CGROUP_MEM_RES_CTLR
bool "Memory Resource Controller for Control Groups"
- depends on CGROUPS && RESOURCE_COUNTERS
+ depends on CGROUPS && RESOURCE_COUNTERS && MM_OWNER
help
Provides a memory resource controller that manages both page cache and
RSS memory.
_
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 8:23 [-mm] Add an owner to the mm_struct (v2) Balbir Singh
@ 2008-03-28 9:41 ` Jiri Slaby
2008-03-28 9:43 ` Jiri Slaby
2008-03-28 10:48 ` KAMEZAWA Hiroyuki
` (2 subsequent siblings)
3 siblings, 1 reply; 23+ messages in thread
From: Jiri Slaby @ 2008-03-28 9:41 UTC (permalink / raw)
To: Balbir Singh
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton, KAMEZAWA Hiroyuki
On 03/28/2008 09:23 AM, Balbir Singh wrote:
> This patch removes the mem_cgroup member from mm_struct and instead adds
> an owner. This approach was suggested by Paul Menage. The advantage of
> this approach is that, once the mm->owner is known, using the subsystem
> id, the cgroup can be determined. It also allows several control groups
> that are virtually grouped by mm_struct, to exist independent of the memory
> controller i.e., without adding mem_cgroup's for each controller,
> to mm_struct.
>
> The code initially assigns mm->owner to the task and then after the
> thread group leader is identified. The mm->owner is changed to the thread
> group leader of the task later at the end of copy_process.
>
> A new config option CONFIG_MM_OWNER is added and the memory resource
> controller now depends on this config option.
>
> NOTE: This patch was developed on top of 2.6.25-rc5-mm1 and is applied on top
> of the memory-controller-move-to-own-slab patch (which is already present
> in the Andrew's patchset).
>
> These patches have been tested on a powerpc 64 bit box and on x86_64 box with
> several microbenchmarks and some simple memory controller testing.
>
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> ---
>
> include/linux/memcontrol.h | 14 ++++++++-
> include/linux/mm_types.h | 6 ++--
> include/linux/sched.h | 19 ++++++++++++
> init/Kconfig | 13 ++++++++
> kernel/exit.c | 66 +++++++++++++++++++++++++++++++++++++++++++++
> kernel/fork.c | 26 +++++++++++++++++
> mm/memcontrol.c | 19 +++++++-----
> 7 files changed, 151 insertions(+), 12 deletions(-)
>
> diff -puN include/linux/mm_types.h~memory-controller-add-mm-owner include/linux/mm_types.h
> --- linux-2.6.25-rc5/include/linux/mm_types.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
> +++ linux-2.6.25-rc5-balbir/include/linux/mm_types.h 2008-03-28 12:26:59.000000000 +0530
> @@ -227,8 +227,10 @@ struct mm_struct {
> /* aio bits */
> rwlock_t ioctx_list_lock;
> struct kioctx *ioctx_list;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> - struct mem_cgroup *mem_cgroup;
> +#ifdef CONFIG_MM_OWNER
> + spinlock_t owner_lock;
> + struct task_struct *owner; /* The thread group leader that */
Doesn't make sense to switch them (spinlock is unsigned int on x86, what's
sizeof between and after?)?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 9:41 ` Jiri Slaby
@ 2008-03-28 9:43 ` Jiri Slaby
2008-03-28 10:11 ` Balbir Singh
0 siblings, 1 reply; 23+ messages in thread
From: Jiri Slaby @ 2008-03-28 9:43 UTC (permalink / raw)
To: Balbir Singh
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton, KAMEZAWA Hiroyuki
On 03/28/2008 10:41 AM, Jiri Slaby wrote:
>> linux-2.6.25-rc5/include/linux/mm_types.h~memory-controller-add-mm-owner
>> 2008-03-28 09:30:47.000000000 +0530
>> +++ linux-2.6.25-rc5-balbir/include/linux/mm_types.h 2008-03-28
>> 12:26:59.000000000 +0530
>> @@ -227,8 +227,10 @@ struct mm_struct {
>> /* aio bits */
>> rwlock_t ioctx_list_lock;
>> struct kioctx *ioctx_list;
>> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
>> - struct mem_cgroup *mem_cgroup;
>> +#ifdef CONFIG_MM_OWNER
>> + spinlock_t owner_lock;
>> + struct task_struct *owner; /* The thread group leader that */
>
> Doesn't make sense to switch them (spinlock is unsigned int on x86,
> what's sizeof between and after?)?
Hmm, doesn't matter, there is another pointer after it, ignore me.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 9:43 ` Jiri Slaby
@ 2008-03-28 10:11 ` Balbir Singh
0 siblings, 0 replies; 23+ messages in thread
From: Balbir Singh @ 2008-03-28 10:11 UTC (permalink / raw)
To: Jiri Slaby
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton, KAMEZAWA Hiroyuki
Jiri Slaby wrote:
> On 03/28/2008 10:41 AM, Jiri Slaby wrote:
>>> linux-2.6.25-rc5/include/linux/mm_types.h~memory-controller-add-mm-owner
>>> 2008-03-28 09:30:47.000000000 +0530
>>> +++ linux-2.6.25-rc5-balbir/include/linux/mm_types.h 2008-03-28
>>> 12:26:59.000000000 +0530
>>> @@ -227,8 +227,10 @@ struct mm_struct {
>>> /* aio bits */
>>> rwlock_t ioctx_list_lock;
>>> struct kioctx *ioctx_list;
>>> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
>>> - struct mem_cgroup *mem_cgroup;
>>> +#ifdef CONFIG_MM_OWNER
>>> + spinlock_t owner_lock;
>>> + struct task_struct *owner; /* The thread group leader that */
>>
>> Doesn't make sense to switch them (spinlock is unsigned int on x86,
>> what's sizeof between and after?)?
>
> Hmm, doesn't matter, there is another pointer after it, ignore me.
OK :)
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 8:23 [-mm] Add an owner to the mm_struct (v2) Balbir Singh
2008-03-28 9:41 ` Jiri Slaby
@ 2008-03-28 10:48 ` KAMEZAWA Hiroyuki
2008-03-28 10:51 ` Balbir Singh
2008-03-28 10:55 ` KAMEZAWA Hiroyuki
2008-03-28 11:01 ` Paul Menage
3 siblings, 1 reply; 23+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-28 10:48 UTC (permalink / raw)
To: Balbir Singh
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton
On Fri, 28 Mar 2008 13:53:16 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
>
> This patch removes the mem_cgroup member from mm_struct and instead adds
> an owner. This approach was suggested by Paul Menage. The advantage of
> this approach is that, once the mm->owner is known, using the subsystem
> id, the cgroup can be determined. It also allows several control groups
> that are virtually grouped by mm_struct, to exist independent of the memory
> controller i.e., without adding mem_cgroup's for each controller,
> to mm_struct.
>
> The code initially assigns mm->owner to the task and then after the
> thread group leader is identified. The mm->owner is changed to the thread
> group leader of the task later at the end of copy_process.
>
Hmm, I like this approach.
-a bit off topic-
BTW, could you move mem_cgroup_from_task() to include/linux/memcontrol.h ?
Then, I'll add an interface like
mem_cgroup_charge_xxx(struct page *page, struct mem_cgroup *mem, gfp_mask mask)
This can be called in following way:
mem_cgroup_charge_xxx(page, mem_cgroup_from_task(current), GFP_XXX);
and I don't have to access mm_struct's member in this case.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 10:48 ` KAMEZAWA Hiroyuki
@ 2008-03-28 10:51 ` Balbir Singh
2008-03-28 11:06 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 23+ messages in thread
From: Balbir Singh @ 2008-03-28 10:51 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton
KAMEZAWA Hiroyuki wrote:
> On Fri, 28 Mar 2008 13:53:16 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
>>
>> This patch removes the mem_cgroup member from mm_struct and instead adds
>> an owner. This approach was suggested by Paul Menage. The advantage of
>> this approach is that, once the mm->owner is known, using the subsystem
>> id, the cgroup can be determined. It also allows several control groups
>> that are virtually grouped by mm_struct, to exist independent of the memory
>> controller i.e., without adding mem_cgroup's for each controller,
>> to mm_struct.
>>
>> The code initially assigns mm->owner to the task and then after the
>> thread group leader is identified. The mm->owner is changed to the thread
>> group leader of the task later at the end of copy_process.
>>
> Hmm, I like this approach.
>
Thanks,
> -a bit off topic-
> BTW, could you move mem_cgroup_from_task() to include/linux/memcontrol.h ?
>
Yes, that can be done
> Then, I'll add an interface like
> mem_cgroup_charge_xxx(struct page *page, struct mem_cgroup *mem, gfp_mask mask)
>
> This can be called in following way:
> mem_cgroup_charge_xxx(page, mem_cgroup_from_task(current), GFP_XXX);
> and I don't have to access mm_struct's member in this case.
>
OK. Will do. Can that wait until Andrew picks up these patches. Then I'll put
that as an add-on?
Thanks for the review
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 10:51 ` Balbir Singh
@ 2008-03-28 11:06 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 23+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-28 11:06 UTC (permalink / raw)
To: balbir
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton
On Fri, 28 Mar 2008 16:21:16 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > -a bit off topic-
> > BTW, could you move mem_cgroup_from_task() to include/linux/memcontrol.h ?
> >
>
> Yes, that can be done
>
> > Then, I'll add an interface like
> > mem_cgroup_charge_xxx(struct page *page, struct mem_cgroup *mem, gfp_mask mask)
> >
> > This can be called in following way:
> > mem_cgroup_charge_xxx(page, mem_cgroup_from_task(current), GFP_XXX);
> > and I don't have to access mm_struct's member in this case.
> >
>
> OK. Will do. Can that wait until Andrew picks up these patches. Then I'll put
> that as an add-on?
>
Of course, I can wait.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 8:23 [-mm] Add an owner to the mm_struct (v2) Balbir Singh
2008-03-28 9:41 ` Jiri Slaby
2008-03-28 10:48 ` KAMEZAWA Hiroyuki
@ 2008-03-28 10:55 ` KAMEZAWA Hiroyuki
2008-03-28 10:52 ` Balbir Singh
2008-03-28 11:01 ` Paul Menage
3 siblings, 1 reply; 23+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-28 10:55 UTC (permalink / raw)
To: Balbir Singh
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton
On Fri, 28 Mar 2008 13:53:16 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> -static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
> +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
> {
> return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
> struct mem_cgroup, css);
> @@ -250,12 +250,17 @@ void mm_init_cgroup(struct mm_struct *mm
>
> mem = mem_cgroup_from_task(p);
> css_get(&mem->css);
> - mm->mem_cgroup = mem;
> }
>
> void mm_free_cgroup(struct mm_struct *mm)
> {
> - css_put(&mm->mem_cgroup->css);
> + struct mem_cgroup *mem;
> +
> + /*
> + * TODO: Should we assign mm->owner to NULL here?
> + */
> + mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
> + css_put(&mem->css);
> }
>
How about changing this css_get()/css_put() from accounting against mm_struct
to accounting against task_struct?
It seems simpler way after this mm->owner change.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 10:55 ` KAMEZAWA Hiroyuki
@ 2008-03-28 10:52 ` Balbir Singh
2008-03-28 11:04 ` Paul Menage
2008-03-28 11:15 ` KAMEZAWA Hiroyuki
0 siblings, 2 replies; 23+ messages in thread
From: Balbir Singh @ 2008-03-28 10:52 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton
KAMEZAWA Hiroyuki wrote:
> On Fri, 28 Mar 2008 13:53:16 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>> -static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
>> +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
>> {
>> return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
>> struct mem_cgroup, css);
>> @@ -250,12 +250,17 @@ void mm_init_cgroup(struct mm_struct *mm
>>
>> mem = mem_cgroup_from_task(p);
>> css_get(&mem->css);
>> - mm->mem_cgroup = mem;
>> }
>>
>> void mm_free_cgroup(struct mm_struct *mm)
>> {
>> - css_put(&mm->mem_cgroup->css);
>> + struct mem_cgroup *mem;
>> +
>> + /*
>> + * TODO: Should we assign mm->owner to NULL here?
>> + */
>> + mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
>> + css_put(&mem->css);
>> }
>>
> How about changing this css_get()/css_put() from accounting against mm_struct
> to accouting against task_struct ?
> It seems simpler way after this mm->owner change.
But the reason why we account the mem_cgroup is that we don't want the
mem_cgroup to be deleted. I hope you meant mem_cgroup instead of mm_struct.
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 10:52 ` Balbir Singh
@ 2008-03-28 11:04 ` Paul Menage
2008-03-28 11:15 ` KAMEZAWA Hiroyuki
1 sibling, 0 replies; 23+ messages in thread
From: Paul Menage @ 2008-03-28 11:04 UTC (permalink / raw)
To: balbir
Cc: KAMEZAWA Hiroyuki, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton
On Fri, Mar 28, 2008 at 3:52 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > How about changing this css_get()/css_put() from accounting against mm_struct
> > to accouting against task_struct ?
> > It seems simpler way after this mm->owner change.
>
> But the reason why we account the mem_cgroup is that we don't want the
> mem_cgroup to be deleted. I hope you meant mem_cgroup instead of mm_struct.
>
If there are any tasks in the cgroup then the cgroup can't be deleted,
and hence the mem_cgroup is safe.
css_get()/css_put() is only needed when you have a reference from a
non-task object that needs to keep the mem_cgroup alive, which is no
longer the case for mm_struct once we have mm->owner.
Paul
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 10:52 ` Balbir Singh
2008-03-28 11:04 ` Paul Menage
@ 2008-03-28 11:15 ` KAMEZAWA Hiroyuki
2008-03-28 11:21 ` KAMEZAWA Hiroyuki
1 sibling, 1 reply; 23+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-28 11:15 UTC (permalink / raw)
To: balbir
Cc: Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton
On Fri, 28 Mar 2008 16:22:48 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > How about changing this css_get()/css_put() from accounting against mm_struct
> > to accouting against task_struct ?
> > It seems simpler way after this mm->owner change.
>
> But the reason why we account the mem_cgroup is that we don't want the
> mem_cgroup to be deleted. I hope you meant mem_cgroup instead of mm_struct.
>
Ah, my text was complicated.
Now,
- css_get(memcgrp) is called at mm_struct initialization.
- css_put(memcgrp) is called at mm_struct freeing.
How about
- css_get(memcgrp) is called at task_struct initialization.
- css_put(memcgrp) is called at task_struct freeing.
Because
1. we find mem_cgroup by mm->owner, after this.
2. generic group interface have exit() and clone() callback interface.
mem_cgroup will not be freed until rmdir(), anyway.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 11:15 ` KAMEZAWA Hiroyuki
@ 2008-03-28 11:21 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 23+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-28 11:21 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki
Cc: balbir, Paul Menage, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton
On Fri, 28 Mar 2008 20:15:28 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> Now,
> - css_get(memcgrp) is called at mm_struct initialization.
> - css_put(memcgrp) is called at mm_struct freeing.
>
> How about
> - css_get(memcgrp) is called at task_struct initialization.
> - css_put(memcgrp) is called at task_struct freeing.
>
> Because
> 1. we find mem_cgroup by mm->owner, after this.
> 2. generic group interface have exit() and clone() callback interface.
>
> mem_cgroup will not be freed until rmdir(), anyway.
>
Ignore above. As Paul pointed out, reference count from task is not necessary.
Thanks,
-Kame
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 8:23 [-mm] Add an owner to the mm_struct (v2) Balbir Singh
` (2 preceding siblings ...)
2008-03-28 10:55 ` KAMEZAWA Hiroyuki
@ 2008-03-28 11:01 ` Paul Menage
2008-03-28 12:36 ` Balbir Singh
3 siblings, 1 reply; 23+ messages in thread
From: Paul Menage @ 2008-03-28 11:01 UTC (permalink / raw)
To: Balbir Singh
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
On Fri, Mar 28, 2008 at 1:23 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> diff -puN include/linux/mm_types.h~memory-controller-add-mm-owner include/linux/mm_types.h
> --- linux-2.6.25-rc5/include/linux/mm_types.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
> +++ linux-2.6.25-rc5-balbir/include/linux/mm_types.h 2008-03-28 12:26:59.000000000 +0530
> @@ -227,8 +227,10 @@ struct mm_struct {
> /* aio bits */
> rwlock_t ioctx_list_lock;
> struct kioctx *ioctx_list;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> - struct mem_cgroup *mem_cgroup;
> +#ifdef CONFIG_MM_OWNER
> + spinlock_t owner_lock;
> + struct task_struct *owner; /* The thread group leader that */
> + /* owns the mm_struct. */
> #endif
I'm not convinced that we need the spinlock. Just use the simple rule
that you can only modify mm->owner if:
- mm->owner points to current
- the new owner is a user of mm
- you hold task_lock() for the new owner (which is necessary anyway to
ensure that the new owner's mm doesn't change while you're updating
mm->owner)
and I think everything is fine without an additional lock.
>
> #ifdef CONFIG_PROC_FS
> diff -puN kernel/fork.c~memory-controller-add-mm-owner kernel/fork.c
> --- linux-2.6.25-rc5/kernel/fork.c~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
> +++ linux-2.6.25-rc5-balbir/kernel/fork.c 2008-03-28 12:33:12.000000000 +0530
> @@ -359,6 +359,7 @@ static struct mm_struct * mm_init(struct
> mm->free_area_cache = TASK_UNMAPPED_BASE;
> mm->cached_hole_size = ~0UL;
> mm_init_cgroup(mm, p);
> + mm_init_owner(mm, p);
>
> if (likely(!mm_alloc_pgd(mm))) {
> mm->def_flags = 0;
> @@ -995,6 +996,27 @@ static void rt_mutex_init_task(struct ta
> #endif
> }
>
> +#ifdef CONFIG_MM_OWNER
> +void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
> +{
> + spin_lock_init(&mm->owner_lock);
> + mm->owner = p;
> +}
> +
> +void mm_fork_init_owner(struct task_struct *p)
> +{
> + struct mm_struct *mm = get_task_mm(p);
Do we need this? p->mm can't go away if we're in the middle of forking it.
> + if (!mm)
> + return;
> +
> + spin_lock(&mm->owner);
I suspect that you meant this to be spin_lock(&mm->owner_lock).
> + if (mm->owner != p)
> + rcu_assign_pointer(mm->owner, p->group_leader);
> + spin_unlock(&mm->owner);
> + mmput(mm);
> +}
> +#endif /* CONFIG_MM_OWNER */
> +
> /*
> * This creates a new process as a copy of the old one,
> * but does not actually start it yet.
> @@ -1357,6 +1379,10 @@ static struct task_struct *copy_process(
> write_unlock_irq(&tasklist_lock);
> proc_fork_connector(p);
> cgroup_post_fork(p);
> +
> + if (!(clone_flags & CLONE_VM) && (p != p->group_leader))
> + mm_fork_init_owner(p);
> +
I'm not sure I understand what this is doing.
I read it as "if p has its own mm and p is a child thread, set
p->mm->owner to p->group_leader". But by definition if p has its own
mm, then p->group_leader->mm will be different to p->mm, therefore
we'd end up with mm->owner->mm != mm, which seems very bad.
What's the intention of this bit of code?
> return p;
>
> bad_fork_free_pid:
> diff -puN include/linux/memcontrol.h~memory-controller-add-mm-owner include/linux/memcontrol.h
> --- linux-2.6.25-rc5/include/linux/memcontrol.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
> +++ linux-2.6.25-rc5-balbir/include/linux/memcontrol.h 2008-03-28 09:30:47.000000000 +0530
> @@ -29,6 +29,7 @@ struct mm_struct;
>
> extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
> extern void mm_free_cgroup(struct mm_struct *mm);
> +extern void mem_cgroup_fork_init(struct task_struct *p);
>
> #define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0)
>
> @@ -49,7 +50,7 @@ extern void mem_cgroup_out_of_memory(str
> int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
>
> #define mm_match_cgroup(mm, cgroup) \
> - ((cgroup) == rcu_dereference((mm)->mem_cgroup))
> + ((cgroup) == mem_cgroup_from_task((mm)->owner))
>
> extern int mem_cgroup_prepare_migration(struct page *page);
> extern void mem_cgroup_end_migration(struct page *page);
> @@ -72,6 +73,8 @@ extern long mem_cgroup_calc_reclaim_acti
> extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
> struct zone *zone, int priority);
>
> +extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
> +
> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
> static inline void mm_init_cgroup(struct mm_struct *mm,
> struct task_struct *p)
> @@ -82,6 +85,10 @@ static inline void mm_free_cgroup(struct
> {
> }
>
> +static inline void mem_cgroup_fork_init(struct task_struct *p)
> +{
> +}
> +
Is this stale?
> static inline void page_reset_bad_cgroup(struct page *page)
> {
> }
> @@ -172,6 +179,11 @@ static inline long mem_cgroup_calc_recla
> {
> return 0;
> }
> +
> +static void mm_free_fork_cgroup(struct task_struct *p)
> +{
> +}
> +
And this?
> #endif /* CONFIG_CGROUP_MEM_CONT */
> -static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
> +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
> {
> return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
> struct mem_cgroup, css);
I think it would be better to make this static inline in the header
file - it's just two indexed dereferences, so hardly worth the
function call overhead.
> @@ -250,12 +250,17 @@ void mm_init_cgroup(struct mm_struct *mm
>
> mem = mem_cgroup_from_task(p);
> css_get(&mem->css);
> - mm->mem_cgroup = mem;
> }
>
> void mm_free_cgroup(struct mm_struct *mm)
> {
> - css_put(&mm->mem_cgroup->css);
> + struct mem_cgroup *mem;
> +
> + /*
> + * TODO: Should we assign mm->owner to NULL here?
No, controller code shouldn't be changing mm->owner.
And surely we don't need mm_init_cgroup() and mm_free_cgroup() any longer?
>
> - rcu_read_lock();
> - mem = rcu_dereference(mm->mem_cgroup);
> + mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
> /*
> * For every charge from the cgroup, increment reference count
> */
> css_get(&mem->css);
> - rcu_read_unlock();
Why is it OK to take away the rcu_read_lock() here? We're still doing
an rcu_dereference().
>
> while (res_counter_charge(&mem->res, PAGE_SIZE)) {
> if (!(gfp_mask & __GFP_WAIT))
> @@ -990,8 +994,8 @@ mem_cgroup_create(struct cgroup_subsys *
>
> if (unlikely((cont->parent) == NULL)) {
> mem = &init_mem_cgroup;
> - init_mm.mem_cgroup = mem;
> page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
> + init_mm.owner = &init_task;
This shouldn't be in here - it should be in the core code that sets up init_mm.
>
> +#ifdef CONFIG_MM_OWNER
> +extern void mm_update_next_owner(struct mm_struct *mm, struct task_struct *p);
> +extern void mm_fork_init_owner(struct task_struct *p);
> +extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
> +#else
> +static inline void
> +mm_update_next_owner(struct mm_struct *mm, struct task_struct *p)
> +{
> +}
> +
> +static inline void mm_fork_init_owner(struct task_struct *p)
> +{
> +}
> +
> +static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
> +{
> +}
> +#endif /* CONFIG_MM_OWNER */
> +
> #endif /* __KERNEL__ */
>
> #endif
> diff -puN kernel/exit.c~memory-controller-add-mm-owner kernel/exit.c
> --- linux-2.6.25-rc5/kernel/exit.c~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
> +++ linux-2.6.25-rc5-balbir/kernel/exit.c 2008-03-28 12:35:39.000000000 +0530
> @@ -579,6 +579,71 @@ void exit_fs(struct task_struct *tsk)
>
> EXPORT_SYMBOL_GPL(exit_fs);
>
> +#ifdef CONFIG_MM_OWNER
> +/*
> + * Task p is exiting and it owned mm, so let's find a new owner for it
> + */
> +static inline int
> +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
> +{
> + int ret;
> +
> + rcu_read_lock();
> + ret = (mm && (rcu_dereference(mm->owner) == p) &&
> + (atomic_read(&mm->mm_users) > 1));
> + rcu_read_unlock();
> + return ret;
The only way that rcu_read_lock() helps here is if mm freeing is
protected by RCU, which I don't think is the case.
But as long as p==current, there's no race, since no other process
will re-point mm->owner at themselves, so mm can't go away anyway
since we have a reference to it that we're going to be dropping soon.
Is there ever a case where we'd want to call this on anything other
than current? It would simplify the code to just refer to current
rather than tsk.
> +}
> +
> +void mm_update_next_owner(struct mm_struct *mm, struct task_struct *p)
> +{
> + struct task_struct *c, *g;
> +
> + /*
> + * This should not be called for init_task
> + */
> + BUG_ON(p == p->parent);
I'd be inclined to make this BUG_ON(p != current), or just have p as a
local variable initialized from current. (If you're trying to save
multiple calls to current on arches where it's not just a simple
register).
> +
> + if (!mm_need_new_owner(mm, p))
> + return;
> +
> + /*
> + * Search in the children
> + */
> + list_for_each_entry(c, &p->children, sibling) {
> + if (c->mm == p->mm)
> + goto assign_new_owner;
> + }
We need to keep checking mm_need_new_owner() since it can become false
if the only other user of the mm exits at the same time that we do.
(In which case there's nothing to do).
> + * Search through everything else. We should not get
> + * here often
> + */
> + for_each_process(c) {
> + g = c;
> + do {
> + if (c->mm && (c->mm == p->mm))
> + goto assign_new_owner;
> + } while ((c = next_thread(c)) != g);
> + }
Is there a reason to not code this as for_each_thread?
> +
> + BUG();
> +
> +assign_new_owner:
> + spin_lock(&mm->owner_lock);
> + rcu_assign_pointer(mm->owner, c);
> + spin_unlock(&mm->owner_lock);
> +}
This can break if c is also exiting and has passed the call to
mm_update_next_owner() by the time we assign mm->owner. That's why my
original suggested version had a function like:
static inline void try_give_mm_ownership(struct task_struct *task,
struct mm_struct *mm) {
if (task->mm != mm) return;
task_lock(task);
if (task->mm == mm) {
mm->owner = task;
}
task_unlock(task);
}
i.e. determining that a task is a valid candidate and updating the
owner pointer has to be done in the same critical section.
Also, looking forward to when we have the virtual AS limits
controller, in the (unlikely?) event that the new owner is in a
different virtual AS limit control group, this code will need to be
able to handle shifting the mm->total_mm from the old AS cgroup to the
new one. That's the "fiddly layer violation" that I mentioned earlier.
It might be cleaner to be able to specify on a per-subsystem basis
whether we require that all users of an mm be in the same cgroup.
> config CGROUP_MEM_RES_CTLR
> bool "Memory Resource Controller for Control Groups"
> - depends on CGROUPS && RESOURCE_COUNTERS
> + depends on CGROUPS && RESOURCE_COUNTERS && MM_OWNER
Maybe this should select MM_OWNER rather than depending on it?
Paul
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 11:01 ` Paul Menage
@ 2008-03-28 12:36 ` Balbir Singh
2008-03-28 12:54 ` Balbir Singh
2008-03-28 14:05 ` Paul Menage
0 siblings, 2 replies; 23+ messages in thread
From: Balbir Singh @ 2008-03-28 12:36 UTC (permalink / raw)
To: Paul Menage
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
Paul Menage wrote:
> On Fri, Mar 28, 2008 at 1:23 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>> diff -puN include/linux/mm_types.h~memory-controller-add-mm-owner include/linux/mm_types.h
>> --- linux-2.6.25-rc5/include/linux/mm_types.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
>> +++ linux-2.6.25-rc5-balbir/include/linux/mm_types.h 2008-03-28 12:26:59.000000000 +0530
>> @@ -227,8 +227,10 @@ struct mm_struct {
>> /* aio bits */
>> rwlock_t ioctx_list_lock;
>> struct kioctx *ioctx_list;
>> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
>> - struct mem_cgroup *mem_cgroup;
>> +#ifdef CONFIG_MM_OWNER
>> + spinlock_t owner_lock;
>> + struct task_struct *owner; /* The thread group leader that */
>> + /* owns the mm_struct. */
>> #endif
>
> I'm not convinced that we need the spinlock. Just use the simple rule
> that you can only modify mm->owner if:
>
> - mm->owner points to current
> - the new owner is a user of mm
This will always hold, otherwise it cannot be the new owner :)
> - you hold task_lock() for the new owner (which is necessary anyway to
> ensure that the new owner's mm doesn't change while you're updating
> mm->owner)
>
tsk->mm should not change unless the task is exiting or when a kernel thread
does use_mm() (PF_BORROWED_MM).
I see mm->owner changing when
1. The mm->owner exits
2. At fork time for clone calls with CLONE_VM
May be your rules will work, but I am yet to try that out.
>> #ifdef CONFIG_PROC_FS
>> diff -puN kernel/fork.c~memory-controller-add-mm-owner kernel/fork.c
>> --- linux-2.6.25-rc5/kernel/fork.c~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
>> +++ linux-2.6.25-rc5-balbir/kernel/fork.c 2008-03-28 12:33:12.000000000 +0530
>> @@ -359,6 +359,7 @@ static struct mm_struct * mm_init(struct
>> mm->free_area_cache = TASK_UNMAPPED_BASE;
>> mm->cached_hole_size = ~0UL;
>> mm_init_cgroup(mm, p);
>> + mm_init_owner(mm, p);
>>
>> if (likely(!mm_alloc_pgd(mm))) {
>> mm->def_flags = 0;
>> @@ -995,6 +996,27 @@ static void rt_mutex_init_task(struct ta
>> #endif
>> }
>>
>> +#ifdef CONFIG_MM_OWNER
>> +void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
>> +{
>> + spin_lock_init(&mm->owner_lock);
>> + mm->owner = p;
>> +}
>> +
>> +void mm_fork_init_owner(struct task_struct *p)
>> +{
>> + struct mm_struct *mm = get_task_mm(p);
>
> Do we need this? p->mm can't go away if we're in the middle of forking it.
>
There are other cases why I preferred to use it, specifically for PF_BORROWED_MM
cases. What if a kernel thread does use_mm() and fork()? Very unlikely, I'll
remove it and use p->mm directly.
>> + if (!mm)
>> + return;
>> +
>> + spin_lock(&mm->owner);
>
> I suspect that you meant this to be spin_lock(&mm->owner_lock).
>
Yes, that's a typo. A bad one :)
>> + if (mm->owner != p)
>> + rcu_assign_pointer(mm->owner, p->group_leader);
>> + spin_unlock(&mm->owner);
>> + mmput(mm);
>> +}
>> +#endif /* CONFIG_MM_OWNER */
>> +
>> /*
>> * This creates a new process as a copy of the old one,
>> * but does not actually start it yet.
>> @@ -1357,6 +1379,10 @@ static struct task_struct *copy_process(
>> write_unlock_irq(&tasklist_lock);
>> proc_fork_connector(p);
>> cgroup_post_fork(p);
>> +
>> + if (!(clone_flags & CLONE_VM) && (p != p->group_leader))
>> + mm_fork_init_owner(p);
>> +
>
> I'm not sure I understand what this is doing.
>
> I read it as "if p has its own mm and p is a child thread, set
> p->mm->owner to p->group_leader". But by definition if p has its own
> mm, then p->group_leader->mm will be different to p->mm, therefore
> we'd end up with mm->owner->mm != mm, which seems very bad.
>
> What's the intention of this bit of code?
>
The intention is to handle the case when clone is called without CLONE_VM and
with CLONE_THREAD. This means that p can have its own mm and a shared group_leader.
>> return p;
>>
>> bad_fork_free_pid:
>> diff -puN include/linux/memcontrol.h~memory-controller-add-mm-owner include/linux/memcontrol.h
>> --- linux-2.6.25-rc5/include/linux/memcontrol.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
>> +++ linux-2.6.25-rc5-balbir/include/linux/memcontrol.h 2008-03-28 09:30:47.000000000 +0530
>> @@ -29,6 +29,7 @@ struct mm_struct;
>>
>> extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
>> extern void mm_free_cgroup(struct mm_struct *mm);
>> +extern void mem_cgroup_fork_init(struct task_struct *p);
>>
>> #define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0)
>>
>> @@ -49,7 +50,7 @@ extern void mem_cgroup_out_of_memory(str
>> int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
>>
>> #define mm_match_cgroup(mm, cgroup) \
>> - ((cgroup) == rcu_dereference((mm)->mem_cgroup))
>> + ((cgroup) == mem_cgroup_from_task((mm)->owner))
>>
>> extern int mem_cgroup_prepare_migration(struct page *page);
>> extern void mem_cgroup_end_migration(struct page *page);
>> @@ -72,6 +73,8 @@ extern long mem_cgroup_calc_reclaim_acti
>> extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
>> struct zone *zone, int priority);
>>
>> +extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
>> +
>> #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>> static inline void mm_init_cgroup(struct mm_struct *mm,
>> struct task_struct *p)
>> @@ -82,6 +85,10 @@ static inline void mm_free_cgroup(struct
>> {
>> }
>>
>> +static inline void mem_cgroup_fork_init(struct task_struct *p)
>> +{
>> +}
>> +
>
> Is this stale?
>
Yes, there is some stale code in mm/memcontrol.c and include/linux/memcontrol.h
(my bad)
>> static inline void page_reset_bad_cgroup(struct page *page)
>> {
>> }
>> @@ -172,6 +179,11 @@ static inline long mem_cgroup_calc_recla
>> {
>> return 0;
>> }
>> +
>> +static void mm_free_fork_cgroup(struct task_struct *p)
>> +{
>> +}
>> +
>
> And this?
>
Yes
>> #endif /* CONFIG_CGROUP_MEM_CONT */
>> -static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
>> +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
>> {
>> return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
>> struct mem_cgroup, css);
>
> I think it would be better to make this static inline in the header
> file - it's just two indexed dereferences, so hardly worth the
> function call overhead.
>
Yes, will do
>> @@ -250,12 +250,17 @@ void mm_init_cgroup(struct mm_struct *mm
>>
>> mem = mem_cgroup_from_task(p);
>> css_get(&mem->css);
>> - mm->mem_cgroup = mem;
>> }
>>
>> void mm_free_cgroup(struct mm_struct *mm)
>> {
>> - css_put(&mm->mem_cgroup->css);
>> + struct mem_cgroup *mem;
>> +
>> + /*
>> + * TODO: Should we assign mm->owner to NULL here?
>
> No, controller code shouldn't be changing mm->owner.
>
> And surely we don't need mm_init_cgroup() and mm_free_cgroup() any longer?
>
We don't need it and the comment is stale :)
>> - rcu_read_lock();
>> - mem = rcu_dereference(mm->mem_cgroup);
>> + mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
>> /*
>> * For every charge from the cgroup, increment reference count
>> */
>> css_get(&mem->css);
>> - rcu_read_unlock();
>
> Why is it OK to take away the rcu_read_lock() here? We're still doing
> an rcu_dereference().
>
Nope, my bad -- stale code. Will fix
>> while (res_counter_charge(&mem->res, PAGE_SIZE)) {
>> if (!(gfp_mask & __GFP_WAIT))
>> @@ -990,8 +994,8 @@ mem_cgroup_create(struct cgroup_subsys *
>>
>> if (unlikely((cont->parent) == NULL)) {
>> mem = &init_mem_cgroup;
>> - init_mm.mem_cgroup = mem;
>> page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
>> + init_mm.owner = &init_task;
>
> This shouldn't be in here - it should be in the core code that sets up init_mm.
>
Yes, good point. Will do
>> +#ifdef CONFIG_MM_OWNER
>> +extern void mm_update_next_owner(struct mm_struct *mm, struct task_struct *p);
>> +extern void mm_fork_init_owner(struct task_struct *p);
>> +extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
>> +#else
>> +static inline void
>> +mm_update_next_owner(struct mm_struct *mm, struct task_struct *p)
>> +{
>> +}
>> +
>> +static inline void mm_fork_init_owner(struct task_struct *p)
>> +{
>> +}
>> +
>> +static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
>> +{
>> +}
>> +#endif /* CONFIG_MM_OWNER */
>> +
>> #endif /* __KERNEL__ */
>>
>> #endif
>> diff -puN kernel/exit.c~memory-controller-add-mm-owner kernel/exit.c
>> --- linux-2.6.25-rc5/kernel/exit.c~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
>> +++ linux-2.6.25-rc5-balbir/kernel/exit.c 2008-03-28 12:35:39.000000000 +0530
>> @@ -579,6 +579,71 @@ void exit_fs(struct task_struct *tsk)
>>
>> EXPORT_SYMBOL_GPL(exit_fs);
>>
>> +#ifdef CONFIG_MM_OWNER
>> +/*
>> + * Task p is exiting and it owned mm, so let's find a new owner for it
>> + */
>> +static inline int
>> +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
>> +{
>> + int ret;
>> +
>> + rcu_read_lock();
>> + ret = (mm && (rcu_dereference(mm->owner) == p) &&
>> + (atomic_read(&mm->mm_users) > 1));
>> + rcu_read_unlock();
>> + return ret;
>
> The only way that rcu_read_lock() helps here is if mm freeing is
> protected by RCU, which I don't think is the case.
>
rcu_read_lock() also ensures that preemption does not cause us to see incorrect
values.
> But as long as p==current, there's no race, since no other process
> will re-point mm->owner at themselves, so mm can't go away anyway
> since we have a reference to it that we're going to be dropping soon.
>
mm cannot go away, but mm->owner can be different from current and could be
going away.
> Is there ever a case where we'd want to call this on anything other
> than current? It would simplify the code to just refer to current
> rather than tsk.
>
Not at the moment, yes, we can remove the second parameter
>> +}
>> +
>> +void mm_update_next_owner(struct mm_struct *mm, struct task_struct *p)
>> +{
>> + struct task_struct *c, *g;
>> +
>> + /*
>> + * This should not be called for init_task
>> + */
>> + BUG_ON(p == p->parent);
>
> I'd be inclined to make this BUG_ON(p != current), or just have p as a
> local variable initialized from current. (If you're trying to save
> multiple calls to current on arches where it's not just a simple
> register).
>
OK
>> +
>> + if (!mm_need_new_owner(mm, p))
>> + return;
>> +
>> + /*
>> + * Search in the children
>> + */
>> + list_for_each_entry(c, &p->children, sibling) {
>> + if (c->mm == p->mm)
>> + goto assign_new_owner;
>> + }
>
> We need to keep checking mm_need_new_owner() since it can become false
> if the only other user of the mm exits at the same time that we do.
> (In which case there's nothing to do).
>
I would rather deal with the case where mm->owner is NULL, rather than keep
checking (since even with constant checking we cannot guarantee that mm->owner
will not become NULL)
>> + * Search through everything else. We should not get
>> + * here often
>> + */
>> + for_each_process(c) {
>> + g = c;
>> + do {
>> + if (c->mm && (c->mm == p->mm))
>> + goto assign_new_owner;
>> + } while ((c = next_thread(c)) != g);
>> + }
>
> Is there a reason to not code this as for_each_thread?
>
Is there a for_each_thread()?
>> +
>> + BUG();
>> +
>> +assign_new_owner:
>> + spin_lock(&mm->owner_lock);
>> + rcu_assign_pointer(mm->owner, c);
>> + spin_unlock(&mm->owner_lock);
>> +}
>
> This can break if c is also exiting and has passed the call to
> mm_update_next_owner() by the time we assign mm->owner. That's why my
> original suggested version had a function like:
>
Won't it be better to check for c->flags & PF_EXITING?
> static inline void try_give_mm_ownership(struct task_struct *task,
> struct mm_struct *mm) {
> if (task->mm != mm) return;
> task_lock(task);
> if (task->mm == mm) {
> mm->owner = task;
> }
> task_unlock(task);
> }
>
> i.e. determining that a task is a valid candidate and updating the
> owner pointer has to be done in the same critical section.
>
Let me try and revamp the locking rules and see what that leads to. But, I don't
like protecting an mm_struct's member with a task_struct's lock.
> Also, looking forward to when we have the virtual AS limits
> controller, in the (unlikely?) event that the new owner is in a
> different virtual AS limit control group, this code will need to be
> able to handle shifting the mm->total_mm from the old AS cgroup to the
> new one. That's the "fiddly layer violation" that I mentioned earlier.
>
> It might be cleaner to be able to specify on a per-subsystem basis
> whether we require that all users of an mm be in the same cgroup.
>
I don't expect to see that kind of restriction.
>> config CGROUP_MEM_RES_CTLR
>> bool "Memory Resource Controller for Control Groups"
>> - depends on CGROUPS && RESOURCE_COUNTERS
>> + depends on CGROUPS && RESOURCE_COUNTERS && MM_OWNER
>
> Maybe this should select MM_OWNER rather than depending on it?
>
I thought of it, but wondered if the user should make an informed choice about
MM_OWNER and the overhead it brings along.
> Paul
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 12:36 ` Balbir Singh
@ 2008-03-28 12:54 ` Balbir Singh
2008-03-28 14:06 ` Paul Menage
2008-03-28 14:05 ` Paul Menage
1 sibling, 1 reply; 23+ messages in thread
From: Balbir Singh @ 2008-03-28 12:54 UTC (permalink / raw)
To: Paul Menage
Cc: balbir, Pavel Emelianov, Hugh Dickins, Sudhir Kumar,
YAMAMOTO Takashi, lizf, linux-kernel, taka, linux-mm,
David Rientjes, Andrew Morton, KAMEZAWA Hiroyuki
Balbir Singh wrote:
> Paul Menage wrote:
>> On Fri, Mar 28, 2008 at 1:23 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>>> diff -puN include/linux/mm_types.h~memory-controller-add-mm-owner include/linux/mm_types.h
>>> --- linux-2.6.25-rc5/include/linux/mm_types.h~memory-controller-add-mm-owner 2008-03-28 09:30:47.000000000 +0530
>>> +++ linux-2.6.25-rc5-balbir/include/linux/mm_types.h 2008-03-28 12:26:59.000000000 +0530
>>> @@ -227,8 +227,10 @@ struct mm_struct {
>>> /* aio bits */
>>> rwlock_t ioctx_list_lock;
>>> struct kioctx *ioctx_list;
>>> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
>>> - struct mem_cgroup *mem_cgroup;
>>> +#ifdef CONFIG_MM_OWNER
>>> + spinlock_t owner_lock;
>>> + struct task_struct *owner; /* The thread group leader that */
>>> + /* owns the mm_struct. */
>>> #endif
>> I'm not convinced that we need the spinlock. Just use the simple rule
>> that you can only modify mm->owner if:
>>
>> - mm->owner points to current
>> - the new owner is a user of mm
>
> This will always hold, otherwise it cannot be the new owner :)
>
>> - you hold task_lock() for the new owner (which is necessary anyway to
>> ensure that the new owner's mm doesn't change while you're updating
>> mm->owner)
>>
Thinking more, I don't think it makes sense for us to overload task_lock() to do
the mm->owner handling (we don't want to mix lock domains). task_lock() is used
for several things
1. We don't want to make task_lock() rules more complicated by having it protect
an mm member to save space
2. We don't want more contention on task_lock()
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 12:54 ` Balbir Singh
@ 2008-03-28 14:06 ` Paul Menage
0 siblings, 0 replies; 23+ messages in thread
From: Paul Menage @ 2008-03-28 14:06 UTC (permalink / raw)
To: balbir
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
On Fri, Mar 28, 2008 at 5:54 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> Thinking more, I don't think it makes sense for us to overload task_lock() to do
> the mm->owner handling (we don't want to mix lock domains). task_lock() is used
> for several things
>
> 1. We don't want to make task_lock() rules more complicated by having it protect
> an mm member to save space
> 2. We don't want more contention on task_lock()
>
This isn't to save space, it's to provide correctness. We *have* to
hold task_lock(new_owner) before setting mm->owner = new_owner,
otherwise we have no guarantee that new_owner is still a user of mm.
Paul
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 12:36 ` Balbir Singh
2008-03-28 12:54 ` Balbir Singh
@ 2008-03-28 14:05 ` Paul Menage
2008-03-28 14:52 ` Balbir Singh
1 sibling, 1 reply; 23+ messages in thread
From: Paul Menage @ 2008-03-28 14:05 UTC (permalink / raw)
To: balbir
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
On Fri, Mar 28, 2008 at 5:36 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> > - you hold task_lock() for the new owner (which is necessary anyway to
> > ensure that the new owner's mm doesn't change while you're updating
> > mm->owner)
> >
>
> tsk->mm should not change unless the task is exiting or when a kernel thread
> does use_mm() (PF_BORROWED_MM).
>
Or the task calls execve().
> I see mm->owner changing when
>
> 1. The mm->owner exits
> 2. At fork time for clone calls with CLONE_VM
Why would a clone() call with CLONE_VM change mm->owner? We're sharing
with the existing owner, who is still using the mm, so we should leave
it as it is.
I don't see what invariant you're trying to protect with
mm->owner_lock. Can you explain it?
> >> @@ -1357,6 +1379,10 @@ static struct task_struct *copy_process(
> >> write_unlock_irq(&tasklist_lock);
> >> proc_fork_connector(p);
> >> cgroup_post_fork(p);
> >> +
> >> + if (!(clone_flags & CLONE_VM) && (p != p->group_leader))
> >> + mm_fork_init_owner(p);
> >> +
> >
> > I'm not sure I understand what this is doing.
> >
> > I read it as "if p has its own mm and p is a child thread, set
> > p->mm->owner to p->group_leader". But by definition if p has its own
> > mm, then p->group_leader->mm will be different to p->mm, therefore
> > we'd end up with mm->owner->mm != mm, which seems very bad.
> >
> > What's the intention of this bit of code?
> >
>
> The intention is to handle the case when clone is called without CLONE_VM and
> with CLONE_THREAD. This means that p can have its own mm and a shared group_leader.
>
In which case p should be the owner of its new mm, not
p->group_leader. mm_fork_init_owner() does:
> + if (mm->owner != p)
> + rcu_assign_pointer(mm->owner, p->group_leader);
Since we just created a fresh mm for p in copy_mm(), and set mm->owner
= p, I don't see how this test can succeed. And if it does, then we
end up setting p->mm->owner to p->group_leader, which has to be bad
since we know that:
- p is the only user of its mm
- p != p->group_leader
therefore we're setting p->mm->owner to point to a process that
doesn't use p->mm. What am I missing?
> >> +/*
> >> + * Task p is exiting and it owned mm, so let's find a new owner for it
> >> + */
> >> +static inline int
> >> +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
> >> +{
> >> + int ret;
> >> +
> >> + rcu_read_lock();
> >> + ret = (mm && (rcu_dereference(mm->owner) == p) &&
> >> + (atomic_read(&mm->mm_users) > 1));
> >> + rcu_read_unlock();
> >> + return ret;
> >
> > The only way that rcu_read_lock() helps here is if mm freeing is
> > protected by RCU, which I don't think is the case.
> >
>
> rcu_read_lock() also ensures that preemption does not cause us to see incorrect
> values.
>
The only thing that RCU is protecting us against here is that if
mm->owner is non-NULL, the task it points to won't be freed until the
end of our RCU read section. But since we never dereference mm->owner
in the RCU read section, that buys us nothing and is just confusing.
>
> > But as long as p==current, there's no race, since no other process
> > will re-point mm->owner at themselves, so mm can't go away anyway
> > since we have a reference to it that we're going to be dropping soon.
> >
>
> mm cannot go away, but mm->owner can be different from current and could be
> going away.
But that's a problem for mm->owner, not for current.
By this point we've already cleared current->mm, so we're no longer a
candidate for becoming mm->owner if we're not already the owner.
Therefore the truth or falsity of (mm->owner == current) can't be
changed by races - either we are already the owner and no-one but us
can change mm->owner, or we're not the owner and no one can point
mm->owner at us.
> >> +
> >> + if (!mm_need_new_owner(mm, p))
> >> + return;
> >> +
> >> + /*
> >> + * Search in the children
> >> + */
> >> + list_for_each_entry(c, &p->children, sibling) {
> >> + if (c->mm == p->mm)
Oh, and at the point in exit_mm() when you call
mm_update_next_owner(), p->mm is NULL, so this will only match against
tasks that have no mm.
> >> + goto assign_new_owner;
> >> + }
> >
> > We need to keep checking mm_need_new_owner() since it can become false
> > if the only other user of the mm exits at the same time that we do.
> > (In which case there's nothing to do).
> >
>
> I would rather deal with the case where mm->owner is NULL, rather than keep
> checking
I mean that it's possible that there's one other task using mm at the
point when we enter mm_update_next_owner(), but while we're trying to
find it, it does an exit() or execve() and stops being a user of mm.
if that happens, we should bail from the search since we no longer
need to find another user to pass it to (and there is no other user to
find).
> (since even with constant checking we cannot guarantee that mm->owner
> will not become NULL)
>
>
> >> + * Search through everything else. We should not get
> >> + * here often
> >> + */
> >> + for_each_process(c) {
> >> + g = c;
> >> + do {
> >> + if (c->mm && (c->mm == p->mm))
> >> + goto assign_new_owner;
> >> + } while ((c = next_thread(c)) != g);
> >> + }
> >
> > Is there a reason to not code this as for_each_thread?
> >
>
> Is there a for_each_thread()?
>
Sorry, I meant do_each_thread()/while_each_thread(). Isn't that
basically the same thing as what you've explicitly coded here?
> >> +assign_new_owner:
> >> + spin_lock(&mm->owner_lock);
> >> + rcu_assign_pointer(mm->owner, c);
> >> + spin_unlock(&mm->owner_lock);
> >> +}
> >
> > This can break if c is also exiting and has passed the call to
> > mm_update_next_owner() by the time we assign mm->owner. That's why my
> > original suggested version had a function like:
> >
>
> Won't it better to check for c->flags & PF_EXITING?
>
Or if c is execing and gives up the mm that way.
>
> > static inline void try_give_mm_ownership(struct task_struct *task,
> > struct mm_struct *mm) {
> > if (task->mm != mm) return;
> > task_lock(task);
> > if (task->mm == mm) {
> > mm->owner = task;
> > }
> > task_unlock(task);
> > }
> >
> > i.e. determining that a task is a valid candidate and updating the
> > owner pointer has to be done in the same critical section.
> >
>
> Let me try and revamp the locking rules and see what that leads to. But, I don't
> like protecting an mm_struct's member with a task_struct's lock.
We're not protecting mm->owner with task_lock(new_owner). As I said,
the locking rule for mm->owner should be that it can only be changed if
(current==mm->owner). What we're protecting is new_owner->mm - i.e.
we're ensuring that we don't break the invariant that (mm->owner->mm
== mm)
> >> config CGROUP_MEM_RES_CTLR
> >> bool "Memory Resource Controller for Control Groups"
> >> - depends on CGROUPS && RESOURCE_COUNTERS
> >> + depends on CGROUPS && RESOURCE_COUNTERS && MM_OWNER
> >
> > Maybe this should select MM_OWNER rather than depending on it?
> >
>
> I thought of it, but wondered if the user should make an informed choice about
> MM_OWNER and the overhead it brings along.
I would vote the other way on that - make it clear in the memory
controller config help that the memory controller can introduce
overhead, and have it implicitly enable MM_OWNER.
Paul
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 14:05 ` Paul Menage
@ 2008-03-28 14:52 ` Balbir Singh
2008-03-28 15:38 ` Paul Menage
0 siblings, 1 reply; 23+ messages in thread
From: Balbir Singh @ 2008-03-28 14:52 UTC (permalink / raw)
To: Paul Menage
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
Paul Menage wrote:
> On Fri, Mar 28, 2008 at 5:36 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>> > - you hold task_lock() for the new owner (which is necessary anyway to
>> > ensure that the new owner's mm doesn't change while you're updating
>> > mm->owner)
>> >
>>
>> tsk->mm should not change unless the task is exiting or when a kernel thread
>> does use_mm() (PF_BORROWED_MM).
>>
>
> Or the task calls execve().
>
>> I see mm->owner changing when
>>
>> 1. The mm->owner exits
>> 2. At fork time for clone calls with CLONE_VM
>
This was supposed to be deleted, ignore it
> Why would a clone() call with CLONE_VM change mm->owner? We're sharing
> with the existing owner, who is still using the mm, so we should leave
> it as it is.
>
> I don't see what invariant you're trying to protect with
> mm->owner_lock. Can you explain it?
>
mm->owner_lock is there to protect mm->owner field from changing simultaneously
as tasks fork/exit.
>> >> @@ -1357,6 +1379,10 @@ static struct task_struct *copy_process(
>> >> write_unlock_irq(&tasklist_lock);
>> >> proc_fork_connector(p);
>> >> cgroup_post_fork(p);
>> >> +
>> >> + if (!(clone_flags & CLONE_VM) && (p != p->group_leader))
>> >> + mm_fork_init_owner(p);
>> >> +
>> >
>> > I'm not sure I understand what this is doing.
>> >
>> > I read it as "if p has its own mm and p is a child thread, set
>> > p->mm->owner to p->group_leader". But by definition if p has its own
>> > mm, then p->group_leader->mm will be different to p->mm, therefore
>> > we'd end up with mm->owner->mm != mm, which seems very bad.
>> >
>> > What's the intention of this bit of code?
>> >
>>
>> The intention is to handle the case when clone is called without CLONE_VM and
>> with CLONE_THREAD. This means that p can have its own mm and a shared group_leader.
>>
>
> In which case p should be the owner of its new mm, not
> p->group_leader. mm_fork_init_owner() does:
>
Oh! yes.. my bad again. The check should have been p == p->thread_group, but
that is not required either. The check should now ideally be
if (!(clone_flags & CLONE_VM))
>> + if (mm->owner != p)
>> + rcu_assign_pointer(mm->owner, p->group_leader);
>
> Since we just created a fresh mm for p in copy_mm(), and set mm->owner
> = p, I don't see how this test can succeed. And if it does, then we
> end up setting p->mm->owner to p->group_leader, which has to be bad
> since we know that:
>
> - p is the only user of its mm
> - p != p->group_leader
>
> therefore we're setting p->mm->owner to point to a process that
> doesn't use p->mm. What am I missing?
>
>> >> +/*
>> >> + * Task p is exiting and it owned mm, so let's find a new owner for it
>> >> + */
>> >> +static inline int
>> >> +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
>> >> +{
>> >> + int ret;
>> >> +
>> >> + rcu_read_lock();
>> >> + ret = (mm && (rcu_dereference(mm->owner) == p) &&
>> >> + (atomic_read(&mm->mm_users) > 1));
>> >> + rcu_read_unlock();
>> >> + return ret;
>> >
>> > The only way that rcu_read_lock() helps here is if mm freeing is
>> > protected by RCU, which I don't think is the case.
>> >
>>
>> rcu_read_lock() also ensures that preemption does not cause us to see incorrect
>> values.
>>
>
> The only thing that RCU is protecting us against here is that if
> mm->owner is non-NULL, the task it points to won't be freed until the
> end of our RCU read section. But since we never dereference mm->owner
> in the RCU read section, that buys us nothing and is just confusing.
>
>> > But as long as p==current, there's no race, since no other process
>> > will re-point mm->owner at themselves, so mm can't go away anyway
>> > since we have a reference to it that we're going to be dropping soon.
>> >
>>
>> mm cannot go away, but mm->owner can be different from current and could be
>> going away.
>
> But that's a problem for mm->owner, not for current.
>
> By this point we've already cleared current->mm, so we're no longer a
> candidate for becoming mm->owner if we're not already the owner.
> Therefore the truth or falsity of (mm->owner == current) can't be
> changed by races - either we are already the owner and no-one but us
> can change mm->owner, or we're not the owner and no one can point
> mm->owner at us.
>
>> >> +
>> >> + if (!mm_need_new_owner(mm, p))
>> >> + return;
>> >> +
>> >> + /*
>> >> + * Search in the children
>> >> + */
>> >> + list_for_each_entry(c, &p->children, sibling) {
>> >> + if (c->mm == p->mm)
>
> Oh, and at the point in exit_mm() when you call
> mm_update_next_owner(), p->mm is NULL, so this will only match against
> tasks that have no mm.
>
Yes.. I think we need to call it earlier.
>> >> + goto assign_new_owner;
>> >> + }
>> >
>> > We need to keep checking mm_need_new_owner() since it can become false
>> > if the only other user of the mm exits at the same time that we do.
>> > (In which case there's nothing to do).
>> >
>>
>> I would rather deal with the case where mm->owner is NULL, rather than keep
>> checking
>
> I mean that it's possible that there's one other task using mm at the
> point when we enter mm_update_next_owner(), but while we're trying to
> find it, it does an exit() or execve() and stops being a user of mm.
> if that happens, we should bail from the search since we no longer
> need to find another user to pass it to (and there is no other user to
> find).
>
>> (since even with constant checking we cannot guarantee that mm->owner
>> will not become NULL)
>>
>>
>> >> + * Search through everything else. We should not get
>> >> + * here often
>> >> + */
>> >> + for_each_process(c) {
>> >> + g = c;
>> >> + do {
>> >> + if (c->mm && (c->mm == p->mm))
>> >> + goto assign_new_owner;
>> >> + } while ((c = next_thread(c)) != g);
>> >> + }
>> >
>> > Is there a reason to not code this as for_each_thread?
>> >
>>
>> Is there a for_each_thread()?
>>
>
> Sorry, I meant do_each_thread()/while_each_thread(). Isn't that
> basically the same thing as what you've explicitly coded here?
>
Yes, it is.
>> >> +assign_new_owner:
>> >> + spin_lock(&mm->owner_lock);
>> >> + rcu_assign_pointer(mm->owner, c);
>> >> + spin_unlock(&mm->owner_lock);
>> >> +}
>> >
>> > This can break if c is also exiting and has passed the call to
>> > mm_update_next_owner() by the time we assign mm->owner. That's why my
>> > original suggested version had a function like:
>> >
>>
>> Won't it better to check for c->flags & PF_EXITING?
>>
>
> Or if c is execing and gives up the mm that way.
>
>> > static inline void try_give_mm_ownership(struct task_struct *task,
>> > struct mm_struct *mm) {
>> > if (task->mm != mm) return;
>> > task_lock(task);
>> > if (task->mm == mm) {
>> > mm->owner = task;
>> > }
>> > task_unlock(task);
>> > }
>> >
>> > i.e. determining that a task is a valid candidate and updating the
>> > owner pointer has to be done in the same critical section.
>> >
>>
>> Let me try and revamp the locking rules and see what that leads to. But, I don't
>> like protecting an mm_struct's member with a task_struct's lock.
>
> We're not protecting mm->owner with task_lock(new_owner). As I said,
> the locking rule for mm->owner should be that it can only be changed if
> (current==mm->owner). What we're protecting is new_owner->mm - i.e.
> we're ensuring that we don't break the invariant that (mm->owner->mm
> == mm)
>
But there is no way to guarantee that: what if the new_owner execs after we've
done the check and assigned? Won't we end up breaking the invariant? How about
we have mm_update_next_owner() called in exec_mmap() as well? That way, we can
still use owner_lock and keep the invariant.
>> >> config CGROUP_MEM_RES_CTLR
>> >> bool "Memory Resource Controller for Control Groups"
>> >> - depends on CGROUPS && RESOURCE_COUNTERS
>> >> + depends on CGROUPS && RESOURCE_COUNTERS && MM_OWNER
>> >
>> > Maybe this should select MM_OWNER rather than depending on it?
>> >
>>
>> I thought of it, but wondered if the user should make an informed choice about
>> MM_OWNER and the overhead it brings along.
>
> I would vote the other way on that - make it clear in the memory
> controller config help that the memory controller can introduce
> overhead, and have it implicitly enable MM_OWNER.
>
That's a possibility
Thanks for your patience and review.
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 14:52 ` Balbir Singh
@ 2008-03-28 15:38 ` Paul Menage
2008-03-28 18:10 ` Balbir Singh
0 siblings, 1 reply; 23+ messages in thread
From: Paul Menage @ 2008-03-28 15:38 UTC (permalink / raw)
To: balbir
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
On Fri, Mar 28, 2008 at 7:52 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
> mm->owner_lock is there to protect mm->owner field from changing simultaneously
> as tasks fork/exit.
>
But the *hardware* already does that for you - individual writes to
pointers are already atomic operations and so will be serialized.
Using a lock to guard something only does anything useful if at least
one of the critical regions that takes the lock consists of more than
a single atomic operation, or if you have a mixture of read sections
and write sections. Now it's true that your critical region in
mm_fork_init_owner() is more than a single atomic op, but I'm arguing
below that it's a no-op. So that just leaves the single region
spin_lock(&mm->owner_lock);
mm->owner = new_owner;
spin_unlock(&mm->owner_lock);
which isn't observably different if you remove the spinlock.
>
> Oh! yes.. my bad again. The check should have been p == p->thread_group, but
> that is not required either. The check should now ideally be
>
> if (!(clone_flags & CLONE_VM))
>
OK, so if the new thread has its own mm (and hence will already have
mm->owner set up to point to p in mm_init()) then we do:
> + if (mm->owner != p)
> + rcu_assign_pointer(mm->owner, p->group_leader);
which is a no-op since we know mm->owner == p.
>
> Yes.. I think we need to call it earlier.
>
No, I think we need to call it later - after we've cleared current->mm
(from within task_lock(current)) - so we can't rely on p->mm in this
function, we have to pass it in. If we call it before while
current->mm == mm, then we risk a race where the (new or existing)
owner exits and passes it back to us *after* we've done a check to see
if we need to find a new owner. If we ensure that current->mm != mm
before we call mm_update_next_owner(), then we know we're not a
candidate for receiving the ownership if we don't have it already.
>
> But there is no way to guarantee that: what if the new_owner execs after we've
> done the check and assigned? Won't we end up breaking the invariant? How about
> we have mm_update_next_owner() called in exec_mmap() as well? That way, we can
> still use owner_lock and keep the invariant.
>
Oops, I thought that exit_mm() already got called in the execve()
path, but you're right, it doesn't.
Yes, exit_mmap() should call mm_update_next_owner() after the call to
task_unlock(), i.e. after it's set its new mm.
So I need to express the invariant more carefully.
What we need to preserve is that, for every mm at all times, mm->owner
points to a valid task. So either:
1) mm->owner->mm == mm AND mm->owner will check to see whether it
needs to pass ownership before it exits or execs.
OR
2) mm->owner is the last user of mm and is about to free mm.
OR
3) mm->owner is currently searching for another user of mm to pass the
ownership to.
In order to get from state 3 to state 1 safely we have to hold
task_lock(new_owner). Otherwise we can race with an exit or exec in
new_owner, resulting in a process that has already passed the point of
checking current->mm->owner.
I don't see why we need mm->owner_lock to maintain this invariant.
(But am quite prepared to be proven wrong).
Paul
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 15:38 ` Paul Menage
@ 2008-03-28 18:10 ` Balbir Singh
2008-03-28 18:52 ` Paul Menage
0 siblings, 1 reply; 23+ messages in thread
From: Balbir Singh @ 2008-03-28 18:10 UTC (permalink / raw)
To: Paul Menage
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
Paul Menage wrote:
> On Fri, Mar 28, 2008 at 7:52 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>> mm->owner_lock is there to protect mm->owner field from changing simultaneously
>> as tasks fork/exit.
>>
>
> But the *hardware* already does that for you - individual writes to
> pointers are already atomic operations and so will be serialized.
> Using a lock to guard something only does anything useful if at least
> one of the critical regions that takes the lock consists of more than
> a single atomic operation, or if you have a mixture of read sections
> and write sections. Now it's true that your critical region in
> mm_fork_init_owner() is more than a single atomic op, but I'm arguing
> below that it's a no-op. So that just leaves the single region
>
> spin_lock(&mm->owner_lock);
> mm->owner = new_owner;
> spin_unlock(&mm->owner_lock);
>
> which isn't observably different if you remove the spinlock.
>
At fork time, we can have do_fork() run in parallel and we need to protect
mm->owner, if several threads are created at the same time. We don't want to
overwrite mm->owner for each thread that is created.
>> Oh! yes.. my bad again. The check should have been p == p->thread_group, but
>> that is not required either. The check should now ideally be
>>
>> if (!(clone_flags & CLONE_VM))
>>
>
> OK, so if the new thread has its own mm (and hence will already have
> mm->owner set up to point to p in mm_init()) then we do:
>
>> + if (mm->owner != p)
>> + rcu_assign_pointer(mm->owner, p->group_leader);
>
> which is a no-op since we know mm->owner == p.
>
>> Yes.. I think we need to call it earlier.
>>
>
> No, I think we need to call it later - after we've cleared current->mm
> (from within task_lock(current)) - so we can't rely on p->mm in this
> function, we have to pass it in. If we call it before while
> current->mm == mm, then we risk a race where the (new or existing)
> owner exits and passes it back to us *after* we've done a check to see
> if we need to find a new owner. If we ensure that current->mm != mm
> before we call mm_update_next_owner(), then we know we're not a
> candidate for receiving the ownership if we don't have it already.
>
Yes and we could also check for flags & PF_EXITING
>> But there is no way to guarantee that: what if the new_owner execs after we've
>> done the check and assigned? Won't we end up breaking the invariant? How about
>> we have mm_update_next_owner() called in exec_mmap() as well? That way, we can
>> still use owner_lock and keep the invariant.
>>
>
> Oops, I thought that exit_mm() already got called in the execve()
> path, but you're right, it doesn't.
>
> Yes, exit_mmap() should call mm_update_next_owner() after the call to
> task_unlock(), i.e. after it's set its new mm.
>
> So I need to express the invariant more carefully.
>
> What we need to preserve is that, for every mm at all times, mm->owner
> points to a valid task. So either:
>
> 1) mm->owner->mm == mm AND mm->owner will check to see whether it
> needs to pass ownership before it exits or execs.
>
> OR
>
> 2) mm->owner is the last user of mm and is about to free mm.
>
> OR
>
> 3) mm->owner is currently searching for another user of mm to pass the
> ownership to.
>
> In order to get from state 3 to state 1 safely we have to hold
> task_lock(new_owner). Otherwise we can race with an exit or exec in
> new_owner, resulting in a process that has already passed the point of
> checking current->mm->owner.
>
No.. like you said if we do it after current->mm has changed and is different
from mm, then it's safe to find a new owner. I still don't see why we need
task_lock(new_owner). Even if we have task_lock(new_owner), it can still exit or
exec later.
> I don't see why we need mm->owner_lock to maintain this invariant.
> (But am quite prepared to be proven wrong).
>
Why mix task_lock() to protect mm->owner? owner_lock can provide the protection
you are talking about.
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 18:10 ` Balbir Singh
@ 2008-03-28 18:52 ` Paul Menage
2008-03-29 1:02 ` Balbir Singh
2008-03-29 5:46 ` Balbir Singh
0 siblings, 2 replies; 23+ messages in thread
From: Paul Menage @ 2008-03-28 18:52 UTC (permalink / raw)
To: balbir
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
Hi Balbir,
Could you send out the latest version of your patch? I suspect it's
changed a bit based on this review and it would be good to make
sure we're both on the same page.
On Fri, Mar 28, 2008 at 11:10 AM, Balbir Singh
<balbir@linux.vnet.ibm.com> wrote:
>
> At fork time, we can have do_fork() run in parallel and we need to protect
> mm->owner, if several threads are created at the same time. We don't want to
> overwrite mm->owner for each thread that is created.
Why would you want to overwrite mm->owner for any of the threads? If
they're sharing an existing mm, then that mm must already have an
owner, so no need to update it.
> > No, I think we need to call it later - after we've cleared current->mm
> > (from within task_lock(current)) - so we can't rely on p->mm in this
> > function, we have to pass it in. If we call it before while
> > current->mm == mm, then we risk a race where the (new or existing)
> > owner exits and passes it back to us *after* we've done a check to see
> > if we need to find a new owner. If we ensure that current->mm != mm
> > before we call mm_update_next_owner(), then we know we're not a
> > candidate for receiving the ownership if we don't have it already.
> >
>
> Yes and we could also check for flags & PF_EXITING
>
A couple of issues with that:
- I'm not sure how that handles the exec case
- assume two users; the owner exits and wants to pass the ownership to
the other user. It finds it, but sees that it's PF_EXITING. What
should it do? If it waits for that other user to exit, it could take a
long time (e.g. core dumps can take many seconds). If it exits
immediately, then it will leave mm->owner pointing to an invalid task.
If it passes ownership to the other task, it might pass it after the
other task had done its mm_update_next_owner() check, which would be
too late.
- assume three users; the owner exits and wants to pass the ownership
to one of the other two users. it searches through the candidates and
finds one of the other users, which is in PF_EXITING, so it skips it.
Just after this it sees that the user count has fallen to two users.
How does it know whether the user that dropped the count was the
PF_EXITING process that it saw previously (in which case it should
keep searching) or the third user that it's not encountered yet (in
which case it's not going to find the other user anywhere in its
search).
>
> >> But there is no way to guarantee that: what if the new_owner execs after we've
> >> done the check and assigned? Won't we end up breaking the invariant? How about
> >> we have mm_update_next_owner() called in exec_mmap() as well? That way, we can
> >> still use owner_lock and keep the invariant.
> >>
> >
> > Oops, I thought that exit_mm() already got called in the execve()
> > path, but you're right, it doesn't.
> >
> > Yes, exit_mmap() should call mm_update_next_owner() after the call to
> > task_unlock(), i.e. after it's set its new mm.
> >
> > So I need to express the invariant more carefully.
> >
> > What we need to preserve is that, for every mm at all times, mm->owner
> > points to a valid task. So either:
> >
> > 1) mm->owner->mm == mm AND mm->owner will check to see whether it
> > needs to pass ownership before it exits or execs.
> >
> > OR
> >
> > 2) mm->owner is the last user of mm and is about to free mm.
> >
> > OR
> >
> > 3) mm->owner is currently searching for another user of mm to pass the
> > ownership to.
> >
> > In order to get from state 3 to state 1 safely we have to hold
> > task_lock(new_owner). Otherwise we can race with an exit or exec in
> > new_owner, resulting in a process that has already passed the point of
> > checking current->mm->owner.
> >
>
> No.. like you said if we do it after current->mm has changed and is different
> from mm, then it's safe to find a new owner. I still don't see why we need
> task_lock(new_owner).
How about the following sequence: A is old owner, B is new owner
A gets to the task_unlock() in exit_mm(): A->mm is now NULL, mm->owner == A
B starts to execve()
A calls mm_update_next_owner()
B gets to the "active_mm = tsk->active_mm" in exec_mmap()
A finds that B->mm == mm
B continues through the critical section, gets past the point where it
needs to check for ownership
A sets mm->owner = B
B finishes its exec, and carries on with its new mmap
> Even if we have task_lock(new_owner), it can still exit or
> exec later.
Yes, but once we've set mm->owner to the other task and released its
task_lock, the new owner is responsible for handing off the mm to yet
another owner if necessary.
>
> Why mix task_lock() to protect mm->owner?
We're not protecting mm->owner - we're protecting new_owner->mm
Paul
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 18:52 ` Paul Menage
@ 2008-03-29 1:02 ` Balbir Singh
2008-03-29 5:46 ` Balbir Singh
1 sibling, 0 replies; 23+ messages in thread
From: Balbir Singh @ 2008-03-29 1:02 UTC (permalink / raw)
To: Paul Menage
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
Paul Menage wrote:
> Hi Balbir,
>
> Could you send out the latest version of your patch? I suspect it's
> changed a bit based on this review and it would be good to make
> sure we're both on the same page.
Sure, let me rework that and send it across.
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [-mm] Add an owner to the mm_struct (v2)
2008-03-28 18:52 ` Paul Menage
2008-03-29 1:02 ` Balbir Singh
@ 2008-03-29 5:46 ` Balbir Singh
1 sibling, 0 replies; 23+ messages in thread
From: Balbir Singh @ 2008-03-29 5:46 UTC (permalink / raw)
To: Paul Menage
Cc: Pavel Emelianov, Hugh Dickins, Sudhir Kumar, YAMAMOTO Takashi,
lizf, linux-kernel, taka, linux-mm, David Rientjes,
Andrew Morton, KAMEZAWA Hiroyuki
Paul Menage wrote:
> Hi Balbir,
>
> Could you send out the latest version of your patch? I suspect it's
> changed a bit based on this review and it would be good to make
> sure we're both on the same page.
>
> On Fri, Mar 28, 2008 at 11:10 AM, Balbir Singh
> <balbir@linux.vnet.ibm.com> wrote:
>> At fork time, we can have do_fork() run in parallel and we need to protect
>> mm->owner, if several threads are created at the same time. We don't want to
>> overwrite mm->owner for each thread that is created.
>
> Why would you want to overwrite mm->owner for any of the threads? If
> they're sharing an existing mm, then that mm must already have an
> owner, so no need to update it.
>
Yes, that's why we have the lock and check.
>> > No, I think we need to call it later - after we've cleared current->mm
>> > (from within task_lock(current)) - so we can't rely on p->mm in this
>> > function, we have to pass it in. If we call it before while
>> > current->mm == mm, then we risk a race where the (new or existing)
>> > owner exits and passes it back to us *after* we've done a check to see
>> > if we need to find a new owner. If we ensure that current->mm != mm
>> > before we call mm_update_next_owner(), then we know we're not a
>> > candidate for receiving the ownership if we don't have it already.
>> >
>>
>> Yes and we could also check for flags & PF_EXITING
>>
>
> A couple of issues with that:
>
> - I'm not sure how that handles the exec case
>
> - assume two users; the owner exits and wants to pass the ownership to
> the other user. It finds it, but sees that it's PF_EXITING. What
> should it do? If it waits for that other user to exit, it could take a
> long time (e.g. core dumps can take many seconds). If it exits
> immediately, then it will leave mm->owner pointing to an invalid task.
> If it passes ownership to the other task, it might pass it after the
> other task had done its mm_update_next_owner() check, which would be
> too late.
>
> - assume three users; the owner exits and wants to pass the ownership
> to one of the other two users. it searches through the candidates and
> finds one of the other users, which is in PF_EXITING, so it skips it.
> Just after this it sees that the user count has fallen to two users.
> How does it know whether the user that dropped the count was the
> PF_EXITING process that it saw previously (in which case it should
> keep searching) or the third user that it's not encountered yet (in
> which case it's not going to find the other user anywhere in its
> search).
>
>> >> But there is no way to guarantee that; what if the new_owner exec's after we've
>> >> done the check and assigned. Won't we end up breaking the invariant? How about
>> >> we have a mm_update_next_owner() call in exec_mmap() as well? That way, we can
>> >> still use owner_lock and keep the invariant.
>> >>
>> >
>> > Oops, I thought that exit_mm() already got called in the execve()
>> > path, but you're right, it doesn't.
>> >
>> > Yes, exec_mmap() should call mm_update_next_owner() after the call to
>> > task_unlock(), i.e. after it's set its new mm.
>> >
>> > So I need to express the invariant more carefully.
>> >
>> > What we need to preserve is that, for every mm at all times, mm->owner
>> > points to a valid task. So either:
>> >
>> > 1) mm->owner->mm == mm AND mm->owner will check to see whether it
>> > needs to pass ownership before it exits or execs.
>> >
>> > OR
>> >
>> > 2) mm->owner is the last user of mm and is about to free mm.
>> >
>> > OR
>> >
>> > 3) mm->owner is currently searching for another user of mm to pass the
>> > ownership to.
>> >
>> > In order to get from state 3 to state 1 safely we have to hold
>> > task_lock(new_owner). Otherwise we can race with an exit or exec in
>> > new_owner, resulting in a process that has already passed the point of
>> > checking current->mm->owner.
>> >
>>
>> No.. like you said if we do it after current->mm has changed and is different
>> from mm, then it's safe to find a new owner. I still don't see why we need
>> task_lock(new_owner).
>
> How about the following sequence: A is old owner, B is new owner
>
> A gets to the task_unlock() in exit_mm(): A->mm is now NULL, mm->owner == A
> B starts to execve()
> A calls mm_update_next_owner()
> B gets to the "active_mm = tsk->active_mm" in exec_mmap()
> A finds that B->mm == mm
> B continues through the critical section, gets past the point where it
> needs to check for ownership
> A sets mm->owner = B
> B finishes its exec, and carries on with its new mm
>
OK, a task changing the mm from underneath us can be a problem. Let me move over
to using task_lock(). I wish there was a simpler solution for implementing
mm->owner, but the fact that CLONE_THREAD and CLONE_VM can be specified
independently is a huge bottleneck.
>
>> Even if we have task_lock(new_owner), it can still exit or
>> exec later.
>
> Yes, but once we've set mm->owner to the other task and released its
> task_lock, the new owner is responsible for handing off the mm to yet
> another owner if necessary.
>
>> Why mix task_lock() to protect mm->owner?
>
> We're not protecting mm->owner - we're protecting new_owner->mm
>
Yes
> Paul
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>
^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2008-03-29 5:50 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-28 8:23 [-mm] Add an owner to the mm_struct (v2) Balbir Singh
2008-03-28 9:41 ` Jiri Slaby
2008-03-28 9:43 ` Jiri Slaby
2008-03-28 10:11 ` Balbir Singh
2008-03-28 10:48 ` KAMEZAWA Hiroyuki
2008-03-28 10:51 ` Balbir Singh
2008-03-28 11:06 ` KAMEZAWA Hiroyuki
2008-03-28 10:55 ` KAMEZAWA Hiroyuki
2008-03-28 10:52 ` Balbir Singh
2008-03-28 11:04 ` Paul Menage
2008-03-28 11:15 ` KAMEZAWA Hiroyuki
2008-03-28 11:21 ` KAMEZAWA Hiroyuki
2008-03-28 11:01 ` Paul Menage
2008-03-28 12:36 ` Balbir Singh
2008-03-28 12:54 ` Balbir Singh
2008-03-28 14:06 ` Paul Menage
2008-03-28 14:05 ` Paul Menage
2008-03-28 14:52 ` Balbir Singh
2008-03-28 15:38 ` Paul Menage
2008-03-28 18:10 ` Balbir Singh
2008-03-28 18:52 ` Paul Menage
2008-03-29 1:02 ` Balbir Singh
2008-03-29 5:46 ` Balbir Singh
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox