From: Glauber Costa <glommer@parallels.com>
To: linux-kernel@vger.kernel.org
Cc: cgroups@vger.kernel.org, kamezawa.hiroyu@jp.fujitsu.com,
devel@openvz.org, Tejun Heo <tj@kernel.org>,
linux-mm@kvack.org, Suleiman Souhlal <suleiman@google.com>,
Frederic Weisbecker <fweisbec@gmail.com>,
Mel Gorman <mgorman@suse.de>,
David Rientjes <rientjes@google.com>,
Glauber Costa <glommer@parallels.com>,
Christoph Lameter <cl@linux.com>,
Pekka Enberg <penberg@cs.helsinki.fi>,
Michal Hocko <mhocko@suse.cz>,
Johannes Weiner <hannes@cmpxchg.org>
Subject: [PATCH v3 09/13] memcg: kmem accounting lifecycle management
Date: Tue, 18 Sep 2012 18:04:06 +0400 [thread overview]
Message-ID: <1347977050-29476-10-git-send-email-glommer@parallels.com> (raw)
In-Reply-To: <1347977050-29476-1-git-send-email-glommer@parallels.com>
Because the assignment: memcg->kmem_accounted = true is done after the
jump labels increment, we guarantee that the root memcg will always be
selected until all call sites are patched (see memcg_kmem_enabled).
This guarantees that no mischarges are applied.
Jump label decrement happens when the last reference count from the
memcg dies. This will only happen when the caches are all dead.
-> /cgroups/memory/A/B/C
* kmem limit set at A,
* A and B have no tasks,
* spawn a new task in C.
Because kmem_accounted is a boolean that was not set for C, no
accounting would be done. This is, however, not what we expect.
The basic idea is that when a cgroup is limited, we walk the tree
downwards and make sure that we store the information about the parent
being limited in kmem_accounted.
We do the reverse operation when a formerly limited cgroup becomes
unlimited.
Since kmem charges may outlive the cgroup existence, we need to be extra
careful to guarantee the memcg object will stay around for as long as
needed. Up to now, we were using a mem_cgroup_get()/put() pair in charge
and uncharge operations.
Although this guarantees that the object will be around until the last
call to uncharge, this means an atomic update in every charge. We can do
better than that if we only issue get() in the first charge, and then
put() when the last charge finally goes away.
[ v3: merged all lifecycle related patches in one ]
Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Christoph Lameter <cl@linux.com>
CC: Pekka Enberg <penberg@cs.helsinki.fi>
CC: Michal Hocko <mhocko@suse.cz>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Suleiman Souhlal <suleiman@google.com>
---
mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 112 insertions(+), 11 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f36a01..720e4bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -287,7 +287,8 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- bool kmem_accounted;
+
+ unsigned long kmem_accounted; /* See KMEM_ACCOUNTED_*, below */
bool oom_lock;
atomic_t under_oom;
@@ -340,6 +341,43 @@ struct mem_cgroup {
#endif
};
+enum {
+ KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
+ KMEM_ACCOUNTED_PARENT, /* one of its parents is active */
+ KMEM_ACCOUNTED_DEAD, /* dead memcg, pending kmem charges */
+};
+
+/* bits 0 and 1 */
+#define KMEM_ACCOUNTED_MASK 0x3
+
+#ifdef CONFIG_MEMCG_KMEM
+static bool memcg_kmem_set_active(struct mem_cgroup *memcg)
+{
+ return !test_and_set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_accounted);
+}
+
+static bool memcg_kmem_is_accounted(struct mem_cgroup *memcg)
+{
+ return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_accounted);
+}
+
+static void memcg_kmem_set_active_parent(struct mem_cgroup *memcg)
+{
+ set_bit(KMEM_ACCOUNTED_PARENT, &memcg->kmem_accounted);
+}
+
+static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
+{
+ if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_accounted))
+ set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_accounted);
+}
+
+static bool memcg_kmem_dead(struct mem_cgroup *memcg)
+{
+ return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_accounted);
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
@@ -491,7 +529,7 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- memcg->kmem_accounted;
+ (memcg->kmem_accounted & (KMEM_ACCOUNTED_MASK));
}
/*
@@ -524,13 +562,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
if (!memcg_can_account_kmem(memcg))
return true;
- mem_cgroup_get(memcg);
-
ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order) == 0;
if (ret)
*_memcg = memcg;
- else
- mem_cgroup_put(memcg);
return ret;
}
@@ -589,7 +623,6 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
WARN_ON(mem_cgroup_is_root(memcg));
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
- mem_cgroup_put(memcg);
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -4077,6 +4110,40 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
+
+static void memcg_update_kmem_limit(struct mem_cgroup *memcg, u64 val)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ struct mem_cgroup *iter;
+
+ /*
+ * When we are doing hierarchical accounting, with a hierarchy like
+ * A/B/C, we need to start accounting kernel memory all the way up to C
+ * in case A starts being accounted.
+ *
+ * So when the cgroup first gets to be limited, we walk all the
+ * children of the current memcg and enable kmem accounting for them.
+ * Note that a separate bit is used there to indicate that the
+ * accounting happens due to the parent being accounted.
+ *
+ * note that memcg_kmem_set_active is a test-and-set routine, so we only
+ * arrive here once (since we never disable it)
+ */
+ mutex_lock(&set_limit_mutex);
+ if ((val != RESOURCE_MAX) && memcg_kmem_set_active(memcg)) {
+
+ mem_cgroup_get(memcg);
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ if (iter == memcg)
+ continue;
+ memcg_kmem_set_active_parent(iter);
+ }
+ }
+ mutex_unlock(&set_limit_mutex);
+#endif
+}
+
/*
* The user of this function is...
* RES_LIMIT.
@@ -4115,9 +4182,7 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
if (ret)
break;
- /* For simplicity, we won't allow this to be disabled */
- if (!memcg->kmem_accounted && val != RESOURCE_MAX)
- memcg->kmem_accounted = true;
+ memcg_update_kmem_limit(memcg, val);
} else
return -EINVAL;
break;
@@ -4791,6 +4856,20 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
mem_cgroup_sockets_destroy(memcg);
+
+ memcg_kmem_mark_dead(memcg);
+
+ if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
+ return;
+
+ /*
+ * Charges already down to 0, undo mem_cgroup_get() done in the charge
+ * path here, being careful not to race with memcg_uncharge_kmem: it is
+ * possible that the charges went down to 0 between mark_dead and the
+ * res_counter read, so in that case, we don't need the put
+ */
+ if (memcg_kmem_dead(memcg))
+ mem_cgroup_put(memcg);
}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5148,6 +5227,8 @@ mem_cgroup_create(struct cgroup *cont)
}
if (parent && parent->use_hierarchy) {
+ struct mem_cgroup __maybe_unused *p;
+
res_counter_init(&memcg->res, &parent->res);
res_counter_init(&memcg->memsw, &parent->memsw);
res_counter_init(&memcg->kmem, &parent->kmem);
@@ -5158,6 +5239,20 @@ mem_cgroup_create(struct cgroup *cont)
* mem_cgroup(see mem_cgroup_put).
*/
mem_cgroup_get(parent);
+#ifdef CONFIG_MEMCG_KMEM
+ /*
+ * In case a parent is already limited when we create this, we
+ * need him to propagate it now so we become limited as well.
+ */
+ mutex_lock(&set_limit_mutex);
+ for (p = parent; p != NULL; p = parent_mem_cgroup(p)) {
+ if (memcg_kmem_is_accounted(p)) {
+ memcg_kmem_set_active_parent(memcg);
+ break;
+ }
+ }
+ mutex_unlock(&set_limit_mutex);
+#endif
} else {
res_counter_init(&memcg->res, NULL);
res_counter_init(&memcg->memsw, NULL);
@@ -5871,9 +5966,15 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
if (!memcg)
return;
- res_counter_uncharge(&memcg->kmem, size);
res_counter_uncharge(&memcg->res, size);
if (do_swap_account)
res_counter_uncharge(&memcg->memsw, size);
+
+ /* Not down to 0 */
+ if (res_counter_uncharge(&memcg->kmem, size))
+ return;
+
+ if (memcg_kmem_dead(memcg))
+ mem_cgroup_put(memcg);
}
#endif /* CONFIG_MEMCG_KMEM */
--
1.7.11.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2012-09-18 14:08 UTC|newest]
Thread overview: 127+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-09-18 14:03 [PATCH v3 00/13] kmem controller for memcg Glauber Costa
2012-09-18 14:03 ` [PATCH v3 01/13] memcg: Make it possible to use the stock for more than one page Glauber Costa
2012-10-01 18:48 ` Johannes Weiner
2012-09-18 14:03 ` [PATCH v3 02/13] memcg: Reclaim when more than one page needed Glauber Costa
2012-10-01 19:00 ` Johannes Weiner
2012-09-18 14:04 ` [PATCH v3 03/13] memcg: change defines to an enum Glauber Costa
2012-10-01 19:06 ` Johannes Weiner
2012-10-02 9:10 ` Glauber Costa
2012-09-18 14:04 ` [PATCH v3 04/13] kmem accounting basic infrastructure Glauber Costa
2012-09-21 16:34 ` Tejun Heo
2012-09-24 8:09 ` Glauber Costa
2012-09-26 14:03 ` Michal Hocko
2012-09-26 14:33 ` Glauber Costa
2012-09-26 16:01 ` Michal Hocko
2012-09-26 17:34 ` Glauber Costa
2012-09-26 16:36 ` Tejun Heo
2012-09-26 17:36 ` Glauber Costa
2012-09-26 17:44 ` Tejun Heo
2012-09-26 17:53 ` Glauber Costa
2012-09-26 18:01 ` Tejun Heo
2012-09-26 18:56 ` Glauber Costa
2012-09-26 19:34 ` Tejun Heo
2012-09-26 19:46 ` Glauber Costa
2012-09-26 19:56 ` Tejun Heo
2012-09-26 20:02 ` Glauber Costa
2012-09-26 20:16 ` Tejun Heo
2012-09-26 21:24 ` Glauber Costa
2012-09-26 22:10 ` Tejun Heo
2012-09-26 22:29 ` Glauber Costa
2012-09-26 22:42 ` Tejun Heo
2012-09-26 22:54 ` Glauber Costa
2012-09-26 23:08 ` Tejun Heo
2012-09-26 23:20 ` Glauber Costa
2012-09-26 23:33 ` Tejun Heo
2012-09-27 12:15 ` Michal Hocko
2012-09-27 12:20 ` Glauber Costa
2012-09-27 12:40 ` Michal Hocko
2012-09-27 12:40 ` Glauber Costa
2012-09-27 12:54 ` Michal Hocko
2012-09-27 14:28 ` Mel Gorman
2012-09-27 14:49 ` Tejun Heo
2012-09-27 14:57 ` Glauber Costa
2012-09-27 17:46 ` Tejun Heo
2012-09-27 17:56 ` Michal Hocko
2012-09-27 18:45 ` Glauber Costa
2012-09-30 7:57 ` Tejun Heo
2012-09-30 8:02 ` Tejun Heo
2012-09-30 8:56 ` James Bottomley
2012-09-30 10:37 ` Tejun Heo
2012-09-30 11:25 ` James Bottomley
2012-10-01 0:57 ` Tejun Heo
2012-10-01 8:43 ` Glauber Costa
2012-10-01 8:46 ` Glauber Costa
2012-10-03 22:59 ` Tejun Heo
2012-10-01 8:36 ` Glauber Costa
2012-09-27 12:08 ` Michal Hocko
2012-09-27 12:11 ` Glauber Costa
2012-09-27 14:33 ` Tejun Heo
2012-09-27 14:43 ` Mel Gorman
2012-09-27 14:58 ` Tejun Heo
2012-09-27 18:30 ` Glauber Costa
2012-09-30 8:23 ` Tejun Heo
2012-10-01 8:45 ` Glauber Costa
2012-10-03 22:54 ` Tejun Heo
2012-10-04 11:55 ` Glauber Costa
2012-10-06 2:19 ` Tejun Heo
2012-09-27 15:09 ` Michal Hocko
2012-09-30 8:47 ` Tejun Heo
2012-10-01 9:27 ` Michal Hocko
2012-10-03 22:43 ` Tejun Heo
2012-10-05 13:47 ` Michal Hocko
2012-09-26 22:11 ` Johannes Weiner
2012-09-26 22:45 ` Glauber Costa
2012-09-18 14:04 ` [PATCH v3 05/13] Add a __GFP_KMEMCG flag Glauber Costa
2012-09-18 14:15 ` Rik van Riel
2012-09-18 15:06 ` Christoph Lameter
2012-09-19 7:39 ` Glauber Costa
2012-09-19 14:07 ` Christoph Lameter
2012-09-27 13:34 ` Mel Gorman
2012-09-27 13:41 ` Glauber Costa
2012-10-01 19:09 ` Johannes Weiner
2012-09-18 14:04 ` [PATCH v3 06/13] memcg: kmem controller infrastructure Glauber Costa
2012-09-20 16:05 ` JoonSoo Kim
2012-09-21 8:41 ` Glauber Costa
2012-09-21 9:14 ` JoonSoo Kim
2012-09-26 15:51 ` Michal Hocko
2012-09-27 11:31 ` Glauber Costa
2012-09-27 13:44 ` Michal Hocko
2012-09-28 11:34 ` Glauber Costa
2012-09-30 8:25 ` Tejun Heo
2012-10-01 8:28 ` Glauber Costa
2012-10-03 22:11 ` Tejun Heo
2012-10-01 9:44 ` Michal Hocko
2012-10-01 9:48 ` Michal Hocko
2012-10-01 10:09 ` Glauber Costa
2012-10-01 11:51 ` Michal Hocko
2012-10-01 11:51 ` Glauber Costa
2012-10-01 11:58 ` Michal Hocko
2012-10-01 12:04 ` Glauber Costa
2012-09-18 14:04 ` [PATCH v3 07/13] mm: Allocate kernel pages to the right memcg Glauber Costa
2012-09-27 13:50 ` Mel Gorman
2012-09-28 9:43 ` Glauber Costa
2012-09-28 13:28 ` Mel Gorman
2012-09-27 13:52 ` Michal Hocko
2012-09-18 14:04 ` [PATCH v3 08/13] res_counter: return amount of charges after res_counter_uncharge Glauber Costa
2012-10-01 10:00 ` Michal Hocko
2012-10-01 10:01 ` Glauber Costa
2012-09-18 14:04 ` Glauber Costa [this message]
2012-10-01 12:15 ` [PATCH v3 09/13] memcg: kmem accounting lifecycle management Michal Hocko
2012-10-01 12:29 ` Glauber Costa
2012-10-01 12:36 ` Michal Hocko
2012-10-01 12:43 ` Glauber Costa
2012-09-18 14:04 ` [PATCH v3 10/13] memcg: use static branches when code not in use Glauber Costa
2012-10-01 12:25 ` Michal Hocko
2012-10-01 12:27 ` Glauber Costa
2012-09-18 14:04 ` [PATCH v3 11/13] memcg: allow a memcg with kmem charges to be destructed Glauber Costa
2012-10-01 12:30 ` Michal Hocko
2012-09-18 14:04 ` [PATCH v3 12/13] execute the whole memcg freeing in rcu callback Glauber Costa
2012-09-21 17:23 ` Tejun Heo
2012-09-24 8:48 ` Glauber Costa
2012-10-01 13:27 ` Michal Hocko
2012-10-04 10:53 ` Glauber Costa
2012-10-04 14:20 ` Glauber Costa
2012-10-05 15:31 ` Johannes Weiner
2012-10-08 9:45 ` Glauber Costa
2012-09-18 14:04 ` [PATCH v3 13/13] protect architectures where THREAD_SIZE >= PAGE_SIZE against fork bombs Glauber Costa
2012-10-01 13:17 ` Michal Hocko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1347977050-29476-10-git-send-email-glommer@parallels.com \
--to=glommer@parallels.com \
--cc=cgroups@vger.kernel.org \
--cc=cl@linux.com \
--cc=devel@openvz.org \
--cc=fweisbec@gmail.com \
--cc=hannes@cmpxchg.org \
--cc=kamezawa.hiroyu@jp.fujitsu.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mgorman@suse.de \
--cc=mhocko@suse.cz \
--cc=penberg@cs.helsinki.fi \
--cc=rientjes@google.com \
--cc=suleiman@google.com \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox