From: Muchun Song <songmuchun@bytedance.com>
To: guro@fb.com, hannes@cmpxchg.org, mhocko@kernel.org,
akpm@linux-foundation.org, shakeelb@google.com,
vdavydov.dev@gmail.com
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
duanxiongchun@bytedance.com,
Muchun Song <songmuchun@bytedance.com>
Subject: [RFC PATCH 06/15] mm: memcontrol: move the objcg infrastructure out of CONFIG_MEMCG_KMEM
Date: Tue, 30 Mar 2021 18:15:22 +0800 [thread overview]
Message-ID: <20210330101531.82752-7-songmuchun@bytedance.com> (raw)
In-Reply-To: <20210330101531.82752-1-songmuchun@bytedance.com>

Memory allocations can pin a memcg for a long time. This happens at a
large scale and causes recurring problems in the real world: page cache
doesn't get reclaimed for a long time, or is used by the second, third,
fourth, ... instance of the same job that was restarted into a new
cgroup every time. Unreclaimable dying cgroups pile up, waste memory,
and make page reclaim very inefficient.
We can fix this by converting LRU pages and most other raw memcg pins
to the objcg direction, so that page->memcg_data always points to an
object cgroup.
The objcg infrastructure therefore no longer serves only
CONFIG_MEMCG_KMEM. This patch moves it out of the scope of
CONFIG_MEMCG_KMEM so that LRU pages can reuse it for charging.
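To illustrate the direction (a sketch only, not code from this patch;
the helper name is hypothetical and the real conversion happens later
in this series), looking up the memcg of an LRU page would then go
through its objcg, e.g.:

	/*
	 * Sketch: resolve the memcg of an LRU page once page->memcg_data
	 * holds an obj_cgroup pointer.  Assumes the caller holds
	 * rcu_read_lock(), since the objcg can be reparented at any time.
	 */
	static inline struct mem_cgroup *page_objcg_memcg(struct page *page)
	{
		unsigned long memcg_data = READ_ONCE(page->memcg_data);
		struct obj_cgroup *objcg;

		objcg = (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);

		return objcg ? READ_ONCE(objcg->memcg) : NULL;
	}
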
LRU pages are not accounted at the root level, yet their
page->memcg_data still points to root_mem_cgroup, so page->memcg_data
of an LRU page always holds a valid pointer. However, root_mem_cgroup
does not have an object cgroup. If we use the obj_cgroup APIs to charge
LRU pages, page->memcg_data must be set to a root object cgroup. So we
also allocate an object cgroup for root_mem_cgroup and introduce
root_obj_cgroup to cache its value, just like root_mem_cgroup.

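For context, a hypothetical charging path could use the new
obj_cgroup_is_root() helper along these lines (sketch only; the LRU
charging conversion is done by a later patch in this series, the
function name is made up, and try_charge() refers to the existing
static helper in mm/memcontrol.c):

	/* Sketch: charge an LRU page through its objcg. */
	static int obj_cgroup_charge_lru_page(struct obj_cgroup *objcg,
					      struct page *page, gfp_t gfp,
					      unsigned int nr_pages)
	{
		int ret = 0;

		/*
		 * The root memcg is not accounted, but the page still
		 * ends up with a valid objcg in page->memcg_data.
		 */
		if (!obj_cgroup_is_root(objcg))
			ret = try_charge(READ_ONCE(objcg->memcg), gfp, nr_pages);
		if (!ret) {
			obj_cgroup_get(objcg);
			page->memcg_data = (unsigned long)objcg;
		}

		return ret;
	}
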
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/memcontrol.h |  4 ++-
 mm/memcontrol.c            | 71 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 60 insertions(+), 15 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6e3283828391..463fc7b78396 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -205,7 +205,9 @@ struct memcg_cgwb_frn {
struct obj_cgroup {
struct percpu_ref refcnt;
struct mem_cgroup *memcg;
+#ifdef CONFIG_MEMCG_KMEM
atomic_t nr_charged_bytes;
+#endif
union {
struct list_head list;
struct rcu_head rcu;
@@ -303,9 +305,9 @@ struct mem_cgroup {
#ifdef CONFIG_MEMCG_KMEM
int kmemcg_id;
enum memcg_kmem_state kmem_state;
+#endif
struct obj_cgroup __rcu *objcg;
struct list_head objcg_list; /* list of inherited objcgs */
-#endif
MEMCG_PADDING(_pad2_);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fdabe12e9df0..0107f23e7035 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -75,6 +75,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+static struct obj_cgroup *root_obj_cgroup __read_mostly;
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
@@ -252,9 +253,14 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
-#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;
+static inline bool obj_cgroup_is_root(struct obj_cgroup *objcg)
+{
+ return objcg == root_obj_cgroup;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
unsigned int nr_pages);
@@ -298,6 +304,20 @@ static void obj_cgroup_release(struct percpu_ref *ref)
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
}
+#else
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+ unsigned long flags;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ list_del(&objcg->list);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+
+ percpu_ref_exit(ref);
+ kfree_rcu(objcg, rcu);
+}
+#endif
static struct obj_cgroup *obj_cgroup_alloc(void)
{
@@ -318,10 +338,14 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
return objcg;
}
-static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg, *iter;
+ struct mem_cgroup *parent;
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
@@ -342,6 +366,27 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
percpu_ref_kill(&objcg->refcnt);
}
+static int memcg_obj_cgroup_alloc(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg;
+
+ objcg = obj_cgroup_alloc();
+ if (!objcg)
+ return -ENOMEM;
+
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->objcg, objcg);
+
+ return 0;
+}
+
+static void memcg_obj_cgroup_free(struct mem_cgroup *memcg)
+{
+ if (unlikely(memcg->objcg))
+ memcg_reparent_objcgs(memcg);
+}
+
+#ifdef CONFIG_MEMCG_KMEM
/*
* This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
@@ -3648,7 +3693,6 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
- struct obj_cgroup *objcg;
int memcg_id;
if (cgroup_memory_nokmem)
@@ -3661,14 +3705,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
if (memcg_id < 0)
return memcg_id;
- objcg = obj_cgroup_alloc();
- if (!objcg) {
- memcg_free_cache_id(memcg_id);
- return -ENOMEM;
- }
- objcg->memcg = memcg;
- rcu_assign_pointer(memcg->objcg, objcg);
-
static_branch_enable(&memcg_kmem_enabled_key);
memcg->kmemcg_id = memcg_id;
@@ -3692,7 +3728,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (!parent)
parent = root_mem_cgroup;
- memcg_reparent_objcgs(memcg, parent);
+ memcg_reparent_objcgs(memcg);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -5192,6 +5228,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ memcg_obj_cgroup_free(memcg);
memcg_wb_domain_exit(memcg);
/*
* Flush percpu vmstats and vmevents to guarantee the value correctness
@@ -5242,6 +5279,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
goto fail;
+ if (memcg_obj_cgroup_alloc(memcg))
+ goto free_wb;
+
INIT_WORK(&memcg->high_work, high_work_func);
INIT_LIST_HEAD(&memcg->oom_notify);
mutex_init(&memcg->thresholds_lock);
@@ -5252,8 +5292,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->objcg_list);
#endif
+ INIT_LIST_HEAD(&memcg->objcg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
@@ -5267,6 +5307,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
+free_wb:
+ memcg_wb_domain_exit(memcg);
fail:
mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
@@ -5304,6 +5346,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->tcpmem, NULL);
root_mem_cgroup = memcg;
+ root_obj_cgroup = memcg->objcg;
return &memcg->css;
}
--
2.11.0