* [0/2] memrlimit improve error handling
@ 2008-06-20 15:01 Balbir Singh
2008-06-20 15:01 ` [1/2] memrlimit handle attach_task() failure, add can_attach() callback Balbir Singh
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Balbir Singh @ 2008-06-20 15:01 UTC (permalink / raw)
To: Andrew Morton
Cc: YAMAMOTO Takashi, Paul Menage, linux-kernel, linux-mm,
Balbir Singh, KAMEZAWA Hiroyuki
memrlimit cgroup does not handle error cases after may_expand_vm(). This
BUG was reported by Kamezawa, with the test case below to reproduce it
[root@iridium kamezawa]# cat /opt/cgroup/test/memrlimit.usage_in_bytes
71921664
[root@iridium kamezawa]# ulimit -s 3
[root@iridium kamezawa]# ls
Killed
[root@iridium kamezawa]# ls
Killed
[root@iridium kamezawa]# ls
Killed
[root@iridium kamezawa]# ls
Killed
[root@iridium kamezawa]# ls
Killed
[root@iridium kamezawa]# ulimit -s unlimited
[root@iridium kamezawa]# cat /opt/cgroup/test/memrlimit.usage_in_bytes
72368128
[root@iridium kamezawa]#
This patch adds better handling support to fix the reported problem.
Reported-By: kamezawa.hiroyu@jp.fujitsu.com
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
mm/mmap.c | 36 +++++++++++++++++++++++++-----------
mm/mremap.c | 6 ++++++
2 files changed, 31 insertions(+), 11 deletions(-)
diff -puN mm/mmap.c~memrlimit-cgroup-add-better-error-handling mm/mmap.c
--- linux-2.6.26-rc5/mm/mmap.c~memrlimit-cgroup-add-better-error-handling 2008-06-19 21:12:46.000000000 +0530
+++ linux-2.6.26-rc5-balbir/mm/mmap.c 2008-06-19 21:39:45.000000000 +0530
@@ -1123,7 +1123,7 @@ munmap_back:
*/
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory(charged))
- return -ENOMEM;
+ goto undo_charge;
vm_flags |= VM_ACCOUNT;
}
}
@@ -1245,6 +1245,8 @@ free_vma:
unacct_error:
if (charged)
vm_unacct_memory(charged);
+undo_charge:
+ memrlimit_cgroup_uncharge_as(mm, len >> PAGE_SHIFT);
return error;
}
@@ -1540,14 +1542,15 @@ static int acct_stack_growth(struct vm_a
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
unsigned long new_start;
+ int ret = -ENOMEM;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
- return -ENOMEM;
+ goto out;
/* Stack limit test */
if (size > rlim[RLIMIT_STACK].rlim_cur)
- return -ENOMEM;
+ goto undo_charge;
/* mlock limit tests */
if (vma->vm_flags & VM_LOCKED) {
@@ -1556,21 +1559,23 @@ static int acct_stack_growth(struct vm_a
locked = mm->locked_vm + grow;
limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
- return -ENOMEM;
+ goto undo_charge;
}
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
vma->vm_end - size;
- if (is_hugepage_only_range(vma->vm_mm, new_start, size))
- return -EFAULT;
+ if (is_hugepage_only_range(vma->vm_mm, new_start, size)) {
+ ret = -EFAULT;
+ goto undo_charge;
+ }
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
if (security_vm_enough_memory(grow))
- return -ENOMEM;
+ goto undo_charge;
/* Ok, everything looks good - let it rip */
mm->total_vm += grow;
@@ -1578,6 +1583,11 @@ static int acct_stack_growth(struct vm_a
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
+undo_charge:
+ /* Undo memrlimit charge */
+ memrlimit_cgroup_uncharge_as(mm, grow);
+out:
+ return ret;
}
#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
@@ -1982,6 +1992,7 @@ unsigned long do_brk(unsigned long addr,
struct rb_node ** rb_link, * rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
+ int ret = -ENOMEM;
len = PAGE_ALIGN(len);
if (!len)
@@ -2035,13 +2046,13 @@ unsigned long do_brk(unsigned long addr,
/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
+ return ret;
if (mm->map_count > sysctl_max_map_count)
- return -ENOMEM;
+ goto undo_charge;
if (security_vm_enough_memory(len >> PAGE_SHIFT))
- return -ENOMEM;
+ goto undo_charge;
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
@@ -2055,7 +2066,7 @@ unsigned long do_brk(unsigned long addr,
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ goto undo_charge;
}
vma->vm_mm = mm;
@@ -2073,6 +2084,9 @@ out:
mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
}
return addr;
+undo_charge:
+ memrlimit_cgroup_uncharge_as(mm, len >> PAGE_SHIFT);
+ return ret;
}
EXPORT_SYMBOL(do_brk);
diff -puN mm/mremap.c~memrlimit-cgroup-add-better-error-handling mm/mremap.c
--- linux-2.6.26-rc5/mm/mremap.c~memrlimit-cgroup-add-better-error-handling 2008-06-19 21:12:46.000000000 +0530
+++ linux-2.6.26-rc5-balbir/mm/mremap.c 2008-06-19 22:00:02.000000000 +0530
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/memrlimitcgroup.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -256,6 +257,7 @@ unsigned long do_mremap(unsigned long ad
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
+ int vm_expanded = 0;
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
goto out;
@@ -349,6 +351,7 @@ unsigned long do_mremap(unsigned long ad
goto out;
}
+ vm_expanded = 1;
if (vma->vm_flags & VM_ACCOUNT) {
charged = (new_len - old_len) >> PAGE_SHIFT;
if (security_vm_enough_memory(charged))
@@ -411,6 +414,9 @@ out:
if (ret & ~PAGE_MASK)
vm_unacct_memory(charged);
out_nc:
+ if (vm_expanded)
+ memrlimit_cgroup_uncharge_as(mm,
+ (new_len - old_len) >> PAGE_SHIFT);
return ret;
}
_
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread* [1/2] memrlimit handle attach_task() failure, add can_attach() callback
2008-06-20 15:01 [0/2] memrlimit improve error handling Balbir Singh
@ 2008-06-20 15:01 ` Balbir Singh
2008-06-25 23:37 ` Andrew Morton
2008-06-20 15:01 ` [2/2] memrlimit fix usage of tmp as a parameter name Balbir Singh
2008-06-24 23:11 ` [0/2] memrlimit improve error handling Hugh Dickins
2 siblings, 1 reply; 9+ messages in thread
From: Balbir Singh @ 2008-06-20 15:01 UTC (permalink / raw)
To: Andrew Morton
Cc: YAMAMOTO Takashi, Paul Menage, linux-kernel, linux-mm,
Balbir Singh, KAMEZAWA Hiroyuki
This patch fixes a task migration problem reported by Kamezawa-San. This
patch should fix all issues with migraiton, except for a rare condition
documented in memrlimit_cgroup_move_task(). To fix that problem, we
would need to add transaction properties to cgroups.
The problem reported was that migrating to a group that did not have
sufficient limits to accept an incoming task caused a kernel warning.
Steps to reproduce
% mkdir /dev/cgroup/memrlimit/group_01
% mkdir /dev/cgroup/memrlimit/group_02
% echo 1G > /dev/cgroup/memrlimit/group_01/memrlimit.limit_in_bytes
% echo 0 > /dev/cgroup/memrlimit/group_02/memrlimit.limit_in_bytes
% echo $$ > /dev/cgroup/memrlimit/group_01/tasks
% echo $$ > /dev/cgroup/memrlimit/group_02/tasks
% exit
memrlimit does the right thing by not moving the charges to group_02,
but the task is still put into g2 (since we did not use can_attach to
fail migration). Once in g2, when we echo the task to the root cgroup,
it tries to uncharge the cost of the task from g2. g2 does not have
any charge associated with the task, hence we get a warning.
Reported-by: kamezawa.hiroyu@jp.fujitsu.com
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
include/linux/res_counter.h | 18 ++++++++++++++++++
mm/memrlimitcgroup.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 62 insertions(+)
diff -puN mm/memrlimitcgroup.c~memrlimit-cgroup-fix-attach-task mm/memrlimitcgroup.c
--- linux-2.6.26-rc5/mm/memrlimitcgroup.c~memrlimit-cgroup-fix-attach-task 2008-06-19 22:17:46.000000000 +0530
+++ linux-2.6.26-rc5-balbir/mm/memrlimitcgroup.c 2008-06-20 19:53:19.000000000 +0530
@@ -166,6 +166,39 @@ static int memrlimit_cgroup_populate(str
ARRAY_SIZE(memrlimit_cgroup_files));
}
+static int memrlimit_cgroup_can_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cgrp,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ struct memrlimit_cgroup *memrcg;
+ int ret = 0;
+
+ mm = get_task_mm(p);
+ if (mm == NULL)
+ return -EINVAL;
+
+ /*
+ * Hold mmap_sem, so that total_vm does not change underneath us
+ */
+ down_read(&mm->mmap_sem);
+
+ rcu_read_lock();
+ if (p != rcu_dereference(mm->owner))
+ goto out;
+
+ memrcg = memrlimit_cgroup_from_cgrp(cgrp);
+
+ if (!res_counter_add_check(&memrcg->as_res,
+ (mm->total_vm << PAGE_SHIFT)))
+ ret = -ENOMEM;
+out:
+ rcu_read_unlock();
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ return ret;
+}
+
static void memrlimit_cgroup_move_task(struct cgroup_subsys *ss,
struct cgroup *cgrp,
struct cgroup *old_cgrp,
@@ -193,6 +226,16 @@ static void memrlimit_cgroup_move_task(s
if (memrcg == old_memrcg)
goto out;
+ /*
+ * TBD: Even though we do the necessary checks in can_attach(),
+ * by the time we come here, there is a chance that we still
+ * fail (the memrlimit cgroup has grown its usage, and the
+ * addition of total_vm will no longer fit into its limit)
+ *
+ * We need transactional support in cgroups to let us know
+ * if can_attach() has failed and call attach_failed() on
+ * cgroups for which can_attach() succeeded.
+ */
if (res_counter_charge(&memrcg->as_res, (mm->total_vm << PAGE_SHIFT)))
goto out;
res_counter_uncharge(&old_memrcg->as_res, (mm->total_vm << PAGE_SHIFT));
@@ -231,6 +274,7 @@ struct cgroup_subsys memrlimit_cgroup_su
.destroy = memrlimit_cgroup_destroy,
.populate = memrlimit_cgroup_populate,
.attach = memrlimit_cgroup_move_task,
+ .can_attach = memrlimit_cgroup_can_move_task,
.mm_owner_changed = memrlimit_cgroup_mm_owner_changed,
.early_init = 0,
};
diff -puN kernel/res_counter.c~memrlimit-cgroup-fix-attach-task kernel/res_counter.c
diff -puN include/linux/res_counter.h~memrlimit-cgroup-fix-attach-task include/linux/res_counter.h
--- linux-2.6.26-rc5/include/linux/res_counter.h~memrlimit-cgroup-fix-attach-task 2008-06-19 22:52:17.000000000 +0530
+++ linux-2.6.26-rc5-balbir/include/linux/res_counter.h 2008-06-20 19:52:10.000000000 +0530
@@ -153,4 +153,22 @@ static inline void res_counter_reset_fai
cnt->failcnt = 0;
spin_unlock_irqrestore(&cnt->lock, flags);
}
+
+/*
+ * Add the value val to the resource counter and check if we are
+ * still under the limit.
+ */
+static inline bool res_counter_add_check(struct res_counter *cnt,
+ unsigned long val)
+{
+ bool ret = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cnt->lock, flags);
+ if (cnt->usage + val <= cnt->limit)
+ ret = true;
+ spin_unlock_irqrestore(&cnt->lock, flags);
+ return ret;
+}
+
#endif
_
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [1/2] memrlimit handle attach_task() failure, add can_attach() callback
2008-06-20 15:01 ` [1/2] memrlimit handle attach_task() failure, add can_attach() callback Balbir Singh
@ 2008-06-25 23:37 ` Andrew Morton
2008-06-25 23:52 ` Balbir Singh
0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2008-06-25 23:37 UTC (permalink / raw)
To: Balbir Singh; +Cc: yamamoto, menage, linux-kernel, linux-mm, kamezawa.hiroyu
On Fri, 20 Jun 2008 20:31:42 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> +/*
> + * Add the value val to the resource counter and check if we are
> + * still under the limit.
> + */
> +static inline bool res_counter_add_check(struct res_counter *cnt,
> + unsigned long val)
> +{
> + bool ret = false;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&cnt->lock, flags);
> + if (cnt->usage + val <= cnt->limit)
> + ret = true;
> + spin_unlock_irqrestore(&cnt->lock, flags);
> + return ret;
> +}
The comment and the function name imply that thins function will "Add
the value val to the resource counter". But it doesn't do that at all.
In fact the first arg could be a `const struct res_counter *'.
Perhaps res_counter_can_add() would be more accurate.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [1/2] memrlimit handle attach_task() failure, add can_attach() callback
2008-06-25 23:37 ` Andrew Morton
@ 2008-06-25 23:52 ` Balbir Singh
0 siblings, 0 replies; 9+ messages in thread
From: Balbir Singh @ 2008-06-25 23:52 UTC (permalink / raw)
To: Andrew Morton; +Cc: yamamoto, menage, linux-kernel, linux-mm, kamezawa.hiroyu
Andrew Morton wrote:
> On Fri, 20 Jun 2008 20:31:42 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
>> +/*
>> + * Add the value val to the resource counter and check if we are
>> + * still under the limit.
>> + */
>> +static inline bool res_counter_add_check(struct res_counter *cnt,
>> + unsigned long val)
>> +{
>> + bool ret = false;
>> + unsigned long flags;
>> +
>> + spin_lock_irqsave(&cnt->lock, flags);
>> + if (cnt->usage + val <= cnt->limit)
>> + ret = true;
>> + spin_unlock_irqrestore(&cnt->lock, flags);
>> + return ret;
>> +}
>
> The comment and the function name imply that thins function will "Add
> the value val to the resource counter". But it doesn't do that at all.
> In fact the first arg could be a `const struct res_counter *'.
>
> Perhaps res_counter_can_add() would be more accurate.
Will fix both problems and send out fixes. I intended to call it
res_counter_check_and_add(), but I don't like "and" in function names.
res_counter_can_add is definitely better.
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [2/2] memrlimit fix usage of tmp as a parameter name
2008-06-20 15:01 [0/2] memrlimit improve error handling Balbir Singh
2008-06-20 15:01 ` [1/2] memrlimit handle attach_task() failure, add can_attach() callback Balbir Singh
@ 2008-06-20 15:01 ` Balbir Singh
2008-06-25 23:41 ` Andrew Morton
2008-06-24 23:11 ` [0/2] memrlimit improve error handling Hugh Dickins
2 siblings, 1 reply; 9+ messages in thread
From: Balbir Singh @ 2008-06-20 15:01 UTC (permalink / raw)
To: Andrew Morton
Cc: YAMAMOTO Takashi, Paul Menage, linux-kernel, linux-mm,
Balbir Singh, KAMEZAWA Hiroyuki
Fix the variable tmp being used in write_strategy. This patch replaces tmp
with val, the fact that it is an output parameter can be interpreted from
the pass by reference.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
mm/memrlimitcgroup.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff -puN mm/memrlimitcgroup.c~memrlimit-cgroup-simple-cleanup mm/memrlimitcgroup.c
--- linux-2.6.26-rc5/mm/memrlimitcgroup.c~memrlimit-cgroup-simple-cleanup 2008-06-20 20:14:00.000000000 +0530
+++ linux-2.6.26-rc5-balbir/mm/memrlimitcgroup.c 2008-06-20 20:22:08.000000000 +0530
@@ -118,13 +118,13 @@ static u64 memrlimit_cgroup_read(struct
cft->private);
}
-static int memrlimit_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+static int memrlimit_cgroup_write_strategy(char *buf, unsigned long long *val)
{
- *tmp = memparse(buf, &buf);
+ *val = memparse(buf, &buf);
if (*buf != '\0')
return -EINVAL;
- *tmp = PAGE_ALIGN(*tmp);
+ *val = PAGE_ALIGN(*val);
return 0;
}
_
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [2/2] memrlimit fix usage of tmp as a parameter name
2008-06-20 15:01 ` [2/2] memrlimit fix usage of tmp as a parameter name Balbir Singh
@ 2008-06-25 23:41 ` Andrew Morton
2008-06-25 23:51 ` Balbir Singh
0 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2008-06-25 23:41 UTC (permalink / raw)
To: Balbir Singh; +Cc: yamamoto, menage, linux-kernel, linux-mm, kamezawa.hiroyu
On Fri, 20 Jun 2008 20:31:52 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> Fix the variable tmp being used in write_strategy. This patch replaces tmp
> with val, the fact that it is an output parameter can be interpreted from
> the pass by reference.
Paul's "CGroup Files: Convert res_counter_write() to be a cgroups
write_string() handler"
(memrlimit-setup-the-memrlimit-controller-cgroup-files-convert-res_counter_write-to-be-a-cgroups-write_string-handler-memrlimitcgroup.patch)
deleted memrlimit_cgroup_write_strategy(), so problem solved ;)
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [2/2] memrlimit fix usage of tmp as a parameter name
2008-06-25 23:41 ` Andrew Morton
@ 2008-06-25 23:51 ` Balbir Singh
0 siblings, 0 replies; 9+ messages in thread
From: Balbir Singh @ 2008-06-25 23:51 UTC (permalink / raw)
To: Andrew Morton; +Cc: yamamoto, menage, linux-kernel, linux-mm, kamezawa.hiroyu
Andrew Morton wrote:
> On Fri, 20 Jun 2008 20:31:52 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>
>> Fix the variable tmp being used in write_strategy. This patch replaces tmp
>> with val, the fact that it is an output parameter can be interpreted from
>> the pass by reference.
>
> Paul's "CGroup Files: Convert res_counter_write() to be a cgroups
> write_string() handler"
> (memrlimit-setup-the-memrlimit-controller-cgroup-files-convert-res_counter_write-to-be-a-cgroups-write_string-handler-memrlimitcgroup.patch)
> deleted memrlimit_cgroup_write_strategy(), so problem solved ;)
Yes, I remember reviewing those patches. Nice to have problems solved
automatically :)
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [0/2] memrlimit improve error handling
2008-06-20 15:01 [0/2] memrlimit improve error handling Balbir Singh
2008-06-20 15:01 ` [1/2] memrlimit handle attach_task() failure, add can_attach() callback Balbir Singh
2008-06-20 15:01 ` [2/2] memrlimit fix usage of tmp as a parameter name Balbir Singh
@ 2008-06-24 23:11 ` Hugh Dickins
2008-06-25 1:01 ` Balbir Singh
2 siblings, 1 reply; 9+ messages in thread
From: Hugh Dickins @ 2008-06-24 23:11 UTC (permalink / raw)
To: Balbir Singh
Cc: Andrew Morton, YAMAMOTO Takashi, Paul Menage, linux-kernel,
linux-mm, KAMEZAWA Hiroyuki
Hi Balbir,
Below I've a few comments on this particular patch, then report other
problems I've seen with memrlimits configured into my 2.6.25-rc5-mm3
kernel - I've not tried _using_ them. My own opinion would be that
-mm already contains enough breakage, we don't need memrlimits yet.
On Fri, 20 Jun 2008, Balbir Singh wrote:
>
> memrlimit cgroup does not handle error cases after may_expand_vm(). This
> BUG was reported by Kamezawa, with the test case below to reproduce it
>
> This patch adds better handling support to fix the reported problem.
>
> Reported-By: kamezawa.hiroyu@jp.fujitsu.com
>
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
I wrote a similar patch when I saw Kame's report, because I was
interested in getting that accounting right. Comparing the two,
a couple of notes on the details...
In mm/mmap.c:
> @@ -1982,6 +1992,7 @@ unsigned long do_brk(unsigned long addr,
> struct rb_node ** rb_link, * rb_parent;
> pgoff_t pgoff = addr >> PAGE_SHIFT;
> int error;
> + int ret = -ENOMEM;
I think we don't want int error returned from some parts of do_brk()
and int ret returned from other parts: please use int error throughout.
It would probably be nicer to add int error rather than int ret in
acct_stack_growth() too, but that doesn't matter much.
In mm/mremap.c:
> @@ -256,6 +257,7 @@ unsigned long do_mremap(unsigned long ad
> struct vm_area_struct *vma;
> unsigned long ret = -EINVAL;
> unsigned long charged = 0;
> + int vm_expanded = 0;
>
> if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
> goto out;
> @@ -349,6 +351,7 @@ unsigned long do_mremap(unsigned long ad
> goto out;
> }
>
> + vm_expanded = 1;
> if (vma->vm_flags & VM_ACCOUNT) {
> charged = (new_len - old_len) >> PAGE_SHIFT;
> if (security_vm_enough_memory(charged))
> @@ -411,6 +414,9 @@ out:
> if (ret & ~PAGE_MASK)
> vm_unacct_memory(charged);
> out_nc:
> + if (vm_expanded)
> + memrlimit_cgroup_uncharge_as(mm,
> + (new_len - old_len) >> PAGE_SHIFT);
> return ret;
> }
See how vm_unacct_memory(charged) is only called if (ret & ~PAGE_MASK)?
If ret is a valid address being returned, we do not want to uncharge.
So I believe you need to do likewise with your uncharge_as().
And please handle them both in the same way: either follow the same
"charged" style as is being used for vm_unacct_memory, rather than a
boolean; or convert vm_unacct_memory over to use your boolean style:
but it's unhelpful to have the two using different techniques.
In kernel/fork.c:
nothing, but when I had a quick look there, again the error handling
appeared to be broken e.g. if allocate_mm fails, where's the uncharge?
But be careful: there's a particular point where enough of the new mm
is set up that it gets torn down by normal exit_mmap.
And while looking at copy_mm, do I see an extra down_write and
up_write of mmap_sem, just to guard some memrlimit charging?
Don't add overhead when memrlimits are CONFIGed off; but can't
it be moved into dup_mm() where mmap_sem is already down_write?
Please go through all your charges, again, to double check
that you've got the uncharging right in the error cases.
I was interested in these because I find that after running kernel
build on tmpfs swapping loads in a 700M memcg (but you may well hit
other rc5-mm3 bugs if you try that, I've fixes to send out) for some
hours on x86_64, my shutdown hits the kernel/res_counter.c:49
res_counter_uncharge_locked() WARN_ON(counter->usage < val) several
times, called from memrlimit_cgroup_uncharge_as called from exit_mmap.
I have no idea what the cause is; but I've not seen it on i386,
and I'm not seeing it after shorter runs. It could even be some
error introduced by other patches in what I'm testing: so don't
worry too much about it yet, but please keep an eye out for it.
(If I'd thought harder, I'd have been less interested in the
charge_as leaks: those WARN_ONs come when too much is being
uncharged, not when too little.)
But the issue which worries me most, because it's mislocking
which affects anyone with CONFIG_MM_OWNER=y, is seen when I
have CONFIG_DEBUG_SPINLOCK_SLEEP=y:
BUG: sleeping function called from invalid context at kernel/rwsem.c:48
in_atomic():1, irqs_disabled():0
1 lock held by blogd/830:
#0: (tasklist_lock){..--}, at: [<78125869>] mm_update_next_owner+0x1dd/0x1fa
Pid: 830, comm: blogd Not tainted 2.6.26-rc5-mm2 #2
[<78120ea2>] __might_sleep+0xed/0xf2
[<78367947>] down_write+0x13/0x43
[<781257b6>] mm_update_next_owner+0x12a/0x1fa
[<7812595a>] exit_mm+0xd4/0xe5
[<78125f9d>] do_exit+0x1e7/0x2bc
[<781260fb>] do_group_exit+0x61/0x8a
[<78126134>] sys_exit_group+0x10/0x13
[<78102dd9>] sysenter_past_esp+0x6a/0xa5
Your memrlimit-cgroup-mm-owner-callback-changes-to-add-task-info.patch
thinks it can acquire mmap_sem while it has read_lock(&tasklist_lock).
Sorry, no: you'll have to rework that somehow.
(In passing, I'll add that I'm not a great fan of these memrlimits:
to me it's loony to be charging people for virtual address space,
it's _virtual_, and process A can have as much as it likes without
affecting process B in any way. You're following the lead of RLIMIT_AS,
but I've always thought RLIMIT_AS a lame attempt to move into the mmap
decade, after RLIMIT_DATA and RLIMIT_STACK no longer made sense.
Taking Alan Cox's Committed_AS as a limited resource charged per mm makes
much more sense to me: but yes, it's not perfect, and it is a lot harder
to get its accounting right, and to maintain that down the line. Okay,
you've gone for the easier option of tracking total_vm, getting that
right is a more achievable target. And I accept that I may be too
pessimistic about it: total_vm may often enough give a rough
approximation to something else worth limiting.)
Hugh
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [0/2] memrlimit improve error handling
2008-06-24 23:11 ` [0/2] memrlimit improve error handling Hugh Dickins
@ 2008-06-25 1:01 ` Balbir Singh
0 siblings, 0 replies; 9+ messages in thread
From: Balbir Singh @ 2008-06-25 1:01 UTC (permalink / raw)
To: Hugh Dickins
Cc: Andrew Morton, YAMAMOTO Takashi, Paul Menage, linux-kernel,
linux-mm, KAMEZAWA Hiroyuki
Hugh Dickins wrote:
> Hi Balbir,
>
> Below I've a few comments on this particular patch, then report other
> problems I've seen with memrlimits configured into my 2.6.25-rc5-mm3
> kernel - I've not tried _using_ them. My own opinion would be that
> -mm already contains enough breakage, we don't need memrlimits yet.
>
Thanks for the review and feedback, I'll take a look and fix
> On Fri, 20 Jun 2008, Balbir Singh wrote:
>> memrlimit cgroup does not handle error cases after may_expand_vm(). This
>> BUG was reported by Kamezawa, with the test case below to reproduce it
>>
>> This patch adds better handling support to fix the reported problem.
>>
>> Reported-By: kamezawa.hiroyu@jp.fujitsu.com
>>
>> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
>
> I wrote a similar patch when I saw Kame's report, because I was
> interested in getting that accounting right. Comparing the two,
> a couple of notes on the details...
>
> In mm/mmap.c:
>> @@ -1982,6 +1992,7 @@ unsigned long do_brk(unsigned long addr,
>> struct rb_node ** rb_link, * rb_parent;
>> pgoff_t pgoff = addr >> PAGE_SHIFT;
>> int error;
>> + int ret = -ENOMEM;
>
> I think we don't want int error returned from some parts of do_brk()
> and int ret returned from other parts: please use int error throughout.
> It would probably be nicer to add int error rather than int ret in
> acct_stack_growth() too, but that doesn't matter much.
>
Fair enough, will fix
> In mm/mremap.c:
>> @@ -256,6 +257,7 @@ unsigned long do_mremap(unsigned long ad
>> struct vm_area_struct *vma;
>> unsigned long ret = -EINVAL;
>> unsigned long charged = 0;
>> + int vm_expanded = 0;
>>
>> if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
>> goto out;
>> @@ -349,6 +351,7 @@ unsigned long do_mremap(unsigned long ad
>> goto out;
>> }
>>
>> + vm_expanded = 1;
>> if (vma->vm_flags & VM_ACCOUNT) {
>> charged = (new_len - old_len) >> PAGE_SHIFT;
>> if (security_vm_enough_memory(charged))
>> @@ -411,6 +414,9 @@ out:
>> if (ret & ~PAGE_MASK)
>> vm_unacct_memory(charged);
>> out_nc:
>> + if (vm_expanded)
>> + memrlimit_cgroup_uncharge_as(mm,
>> + (new_len - old_len) >> PAGE_SHIFT);
>> return ret;
>> }
>
> See how vm_unacct_memory(charged) is only called if (ret & ~PAGE_MASK)?
> If ret is a valid address being returned, we do not want to uncharge.
> So I believe you need to do likewise with your uncharge_as().
>
> And please handle them both in the same way: either follow the same
> "charged" style as is being used for vm_unacct_memory, rather than a
> boolean; or convert vm_unacct_memory over to use your boolean style:
> but it's unhelpful to have the two using different techniques.
>
I did look at that and use that code, but we initialize ret = -EINVAL and then
we have the checks for
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
goto out;
if (addr & ~PAGE_MASK)
goto out;
In out, we check for
ret & ~PAGE_MASK
That's OK for vm_unacct_memory, since charged is set to 0, but not for
mem_cgroup_uncharge_as as we uncharge (new_len - old_len) >> PAGE_SHIFT);
I'll find a way to see if we can convert both over.
> In kernel/fork.c:
> nothing, but when I had a quick look there, again the error handling
> appeared to be broken e.g. if allocate_mm fails, where's the uncharge?
> But be careful: there's a particular point where enough of the new mm
> is set up that it gets torn down by normal exit_mmap.
>
Yes, that needs fixing
> And while looking at copy_mm, do I see an extra down_write and
> up_write of mmap_sem, just to guard some memrlimit charging?
Yes, we need to prevent charging and migration race.
> Don't add overhead when memrlimits are CONFIGed off; but can't
> it be moved into dup_mm() where mmap_sem is already down_write?
>
Yes, it can be moved there
> Please go through all your charges, again, to double check
> that you've got the uncharging right in the error cases.
>
Will do
> I was interested in these because I find that after running kernel
> build on tmpfs swapping loads in a 700M memcg (but you may well hit
> other rc5-mm3 bugs if you try that, I've fixes to send out) for some
I've never tried that before
> hours on x86_64, my shutdown hits the kernel/res_counter.c:49
> res_counter_uncharge_locked() WARN_ON(counter->usage < val) several
> times, called from memrlimit_cgroup_uncharge_as called from exit_mmap.
>
> I have no idea what the cause is; but I've not seen it on i386,
> and I'm not seeing it after shorter runs. It could even be some
> error introduced by other patches in what I'm testing: so don't
> worry too much about it yet, but please keep an eye out for it.
>
Sure, will do
> (If I'd thought harder, I'd have been less interested in the
> charge_as leaks: those WARN_ONs come when too much is being
> uncharged, not when too little.)
>
> But the issue which worries me most, because it's mislocking
> which affects anyone with CONFIG_MM_OWNER=y, is seen when I
> have CONFIG_DEBUG_SPINLOCK_SLEEP=y:
>
> BUG: sleeping function called from invalid context at kernel/rwsem.c:48
> in_atomic():1, irqs_disabled():0
> 1 lock held by blogd/830:
> #0: (tasklist_lock){..--}, at: [<78125869>] mm_update_next_owner+0x1dd/0x1fa
> Pid: 830, comm: blogd Not tainted 2.6.26-rc5-mm2 #2
> [<78120ea2>] __might_sleep+0xed/0xf2
> [<78367947>] down_write+0x13/0x43
> [<781257b6>] mm_update_next_owner+0x12a/0x1fa
> [<7812595a>] exit_mm+0xd4/0xe5
> [<78125f9d>] do_exit+0x1e7/0x2bc
> [<781260fb>] do_group_exit+0x61/0x8a
> [<78126134>] sys_exit_group+0x10/0x13
> [<78102dd9>] sysenter_past_esp+0x6a/0xa5
>
> Your memrlimit-cgroup-mm-owner-callback-changes-to-add-task-info.patch
> thinks it can acquire mmap_sem while it has read_lock(&tasklist_lock).
> Sorry, no: you'll have to rework that somehow.
>
Yes, that is nasty and a bad miss. It should be easy to fix, but will require
extensive testing.
> (In passing, I'll add that I'm not a great fan of these memrlimits:
> to me it's loony to be charging people for virtual address space,
> it's _virtual_, and process A can have as much as it likes without
> affecting process B in any way. You're following the lead of RLIMIT_AS,
> but I've always thought RLIMIT_AS a lame attempt to move into the mmap
> decade, after RLIMIT_DATA and RLIMIT_STACK no longer made sense.
>
> Taking Alan Cox's Committed_AS as a limited resource charged per mm makes
> much more sense to me: but yes, it's not perfect, and it is a lot harder
> to get its accounting right, and to maintain that down the line. Okay,
> you've gone for the easier option of tracking total_vm, getting that
> right is a more achievable target. And I accept that I may be too
> pessimistic about it: total_vm may often enough give a rough
> approximation to something else worth limiting.)
You seem to have read my mind, my motivation for memrlimits is
1. Administrators to set a limit and be sure that a cgroup cannot consume more
swap + RSS than the assigned virtual memory limit
2. It allows applications to fail gracefully or decide what parts to free up
to get more memory or change their allocation pattern (a scientific application
deciding what size of matrix to allocate for example).
Excellent feedback, thanks for the review!
--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2008-06-25 23:52 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-06-20 15:01 [0/2] memrlimit improve error handling Balbir Singh
2008-06-20 15:01 ` [1/2] memrlimit handle attach_task() failure, add can_attach() callback Balbir Singh
2008-06-25 23:37 ` Andrew Morton
2008-06-25 23:52 ` Balbir Singh
2008-06-20 15:01 ` [2/2] memrlimit fix usage of tmp as a parameter name Balbir Singh
2008-06-25 23:41 ` Andrew Morton
2008-06-25 23:51 ` Balbir Singh
2008-06-24 23:11 ` [0/2] memrlimit improve error handling Hugh Dickins
2008-06-25 1:01 ` Balbir Singh
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox