[RFC PATCH 4/5] numa: introduce numa balancer infrastructure

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: 王贇 <yun.wang@linux.alibaba.com>
To: Peter Zijlstra <peterz@infradead.org>,
	hannes@cmpxchg.org, mhocko@kernel.org, vdavydov.dev@gmail.com,
	Ingo Molnar <mingo@redhat.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: [RFC PATCH 4/5] numa: introduce numa balancer infrastructure
Date: Mon, 22 Apr 2019 10:14:48 +0800	[thread overview]
Message-ID: <42f47daa-22bb-3c93-9939-1514eb3bbda4@linux.alibaba.com> (raw)
In-Reply-To: <209d247e-c1b2-3235-2722-dd7c1f896483@linux.alibaba.com>

Now we have the way to estimate and adjust numa preferred node for each
memcg, next problem is how to use them.

Usually one will bind workloads with cpuset.cpus, combined with cpuset.mems
or maybe better the memory policy to achieve numa bonus, however in complicated
scenery like combined type of workloads or cpushare way of isolation, this
kind of administration could make one crazy, what we need is a way to gain
numa bonus automatically, maybe not maximum but as much as possible.

This patch introduced basic API for kernel module to do numa adjustment,
later coming the numa balancer module to use them and try to gain numa bonus
as much as possible, automatically.

API including:
  * numa preferred control
  * memcg callback hook
  * memcg per-node page number acquire

Signed-off-by: Michael Wang <yun.wang@linux.alibaba.com>
---
 include/linux/memcontrol.h |  26 ++++++++++++
 mm/memcontrol.c            | 101 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0fd5eeb27c4f..7456b862d5a9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -200,6 +200,11 @@ struct memcg_stat_numa {
 	u64 exectime;
 };

+struct memcg_callback {
+	void (*init)(struct mem_cgroup *memcg);
+	void (*exit)(struct mem_cgroup *memcg);
+};
+
 #endif
 #if defined(CONFIG_SMP)
 struct memcg_padding {
@@ -337,6 +342,8 @@ struct mem_cgroup {
 	struct memcg_stat_numa __percpu *stat_numa;
 	s64 numa_preferred;
 	struct mutex numa_mutex;
+	void *numa_private;
+	struct list_head numa_list;
 #endif

 	struct mem_cgroup_per_node *nodeinfo[0];
@@ -851,6 +858,10 @@ extern void memcg_stat_numa_update(struct task_struct *p);
 extern int memcg_migrate_prep(int target_nid, int page_nid);
 extern int memcg_preferred_nid(struct task_struct *p, gfp_t gfp);
 extern struct page *alloc_page_numa_preferred(gfp_t gfp, unsigned int order);
+extern int register_memcg_callback(void *cb);
+extern int unregister_memcg_callback(void *cb);
+extern void config_numa_preferred(struct mem_cgroup *memcg, int nid);
+extern u64 memcg_numa_pages(struct mem_cgroup *memcg, int nid, u32 mask);
 #else
 static inline void memcg_stat_numa_update(struct task_struct *p)
 {
@@ -868,6 +879,21 @@ static inline struct page *alloc_page_numa_preferred(gfp_t gfp,
 {
 	return NULL;
 }
+static inline int register_memcg_callback(void *cb)
+{
+	return -EINVAL;
+}
+static inline int unregister_memcg_callback(void *cb)
+{
+	return -EINVAL;
+}
+static inline void config_numa_preferred(struct mem_cgroup *memcg, int nid)
+{
+}
+static inline u64 memcg_numa_pages(struct mem_cgroup *memcg, int nid, u32 mask)
+{
+	return 0;
+}
 #endif

 #else /* CONFIG_MEMCG */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f1cb1e726430..dc232ecc904f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3525,6 +3525,102 @@ struct page *alloc_page_numa_preferred(gfp_t gfp, unsigned int order)
 	return __alloc_pages_node(pnid, gfp, order);
 }

+static struct memcg_callback *memcg_cb;
+
+static LIST_HEAD(memcg_cb_list);
+static DEFINE_MUTEX(memcg_cb_mutex);
+
+int register_memcg_callback(void *cb)
+{
+	int ret = 0;
+
+	mutex_lock(&memcg_cb_mutex);
+	if (memcg_cb || !cb) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	memcg_cb = (struct memcg_callback *)cb;
+	if (memcg_cb->init) {
+		struct mem_cgroup *memcg;
+
+		list_for_each_entry(memcg, &memcg_cb_list, numa_list)
+			memcg_cb->init(memcg);
+	}
+
+out:
+	mutex_unlock(&memcg_cb_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(register_memcg_callback);
+
+int unregister_memcg_callback(void *cb)
+{
+	int ret = 0;
+
+	mutex_lock(&memcg_cb_mutex);
+	if (!memcg_cb || memcg_cb != cb) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (memcg_cb->exit) {
+		struct mem_cgroup *memcg;
+
+		list_for_each_entry(memcg, &memcg_cb_list, numa_list)
+			memcg_cb->exit(memcg);
+	}
+	memcg_cb = NULL;
+
+out:
+	mutex_unlock(&memcg_cb_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(unregister_memcg_callback);
+
+void config_numa_preferred(struct mem_cgroup *memcg, int nid)
+{
+	mutex_lock(&memcg->numa_mutex);
+	memcg->numa_preferred = nid;
+	mutex_unlock(&memcg->numa_mutex);
+}
+EXPORT_SYMBOL(config_numa_preferred);
+
+u64 memcg_numa_pages(struct mem_cgroup *memcg, int nid, u32 mask)
+{
+	if (nid == NUMA_NO_NODE)
+		return mem_cgroup_nr_lru_pages(memcg, mask);
+	else
+		return mem_cgroup_node_nr_lru_pages(memcg, nid, mask);
+}
+EXPORT_SYMBOL(memcg_numa_pages);
+
+static void memcg_online_callback(struct mem_cgroup *memcg)
+{
+	mutex_lock(&memcg_cb_mutex);
+	list_add_tail(&memcg->numa_list, &memcg_cb_list);
+	if (memcg_cb && memcg_cb->init)
+		memcg_cb->init(memcg);
+	mutex_unlock(&memcg_cb_mutex);
+}
+
+static void memcg_offline_callback(struct mem_cgroup *memcg)
+{
+	mutex_lock(&memcg_cb_mutex);
+	if (memcg_cb && memcg_cb->exit)
+		memcg_cb->exit(memcg);
+	list_del_init(&memcg->numa_list);
+	mutex_unlock(&memcg_cb_mutex);
+}
+
+#else
+
+static void memcg_online_callback(struct mem_cgroup *memcg)
+{}
+
+static void memcg_offline_callback(struct mem_cgroup *memcg)
+{}
+
 #endif

 /* Universal VM events cgroup1 shows, original sort order */
@@ -4719,6 +4815,9 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	/* Online state pins memcg ID, memcg ID pins CSS */
 	refcount_set(&memcg->id.ref, 1);
 	css_get(css);
+
+	memcg_online_callback(memcg);
+
 	return 0;
 }

@@ -4727,6 +4826,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup_event *event, *tmp;

+	memcg_offline_callback(memcg);
+
 	/*
 	 * Unregister events and notify userspace.
 	 * Notify userspace about cgroup removing only after rmdir of cgroup
-- 
2.14.4.44.g2045bb6

next prev parent reply	other threads:[~2019-04-22  2:14 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-22  2:10 [RFC PATCH 0/5] NUMA Balancer Suite 王贇
2019-04-22  2:11 ` [RFC PATCH 1/5] numa: introduce per-cgroup numa balancing locality, statistic 王贇
2019-04-23  8:44   ` Peter Zijlstra
2019-04-23  9:14     ` 王贇
2019-04-23  8:46   ` Peter Zijlstra
2019-04-23  9:32     ` 王贇
2019-04-23  8:47   ` Peter Zijlstra
2019-04-23  9:33     ` 王贇
2019-04-23  9:46       ` Peter Zijlstra
2019-04-22  2:12 ` [RFC PATCH 2/5] numa: append per-node execution info in memory.numa_stat 王贇
2019-04-23  8:52   ` Peter Zijlstra
2019-04-23  9:36     ` 王贇
2019-04-23  9:46       ` Peter Zijlstra
2019-04-23 10:01         ` 王贇
2019-04-22  2:13 ` [RFC PATCH 3/5] numa: introduce per-cgroup preferred numa node 王贇
2019-04-23  8:55   ` Peter Zijlstra
2019-04-23  9:41     ` 王贇
2019-04-22  2:14 ` 王贇 [this message]
2019-04-22  2:21 ` [RFC PATCH 5/5] numa: numa balancer 王贇
2019-04-23  9:05   ` Peter Zijlstra
2019-04-23  9:59     ` 王贇
2019-04-22 14:34 ` [RFC PATCH 0/5] NUMA Balancer Suite 禹舟键
2019-04-23  2:14   ` 王贇
2019-07-03  3:26 ` [PATCH 0/4] per cpu cgroup numa suite 王贇
2019-07-03  3:28   ` [PATCH 1/4] numa: introduce per-cgroup numa balancing locality, statistic 王贇
2019-07-11 13:43     ` Peter Zijlstra
2019-07-12  3:15       ` 王贇
2019-07-11 13:47     ` Peter Zijlstra
2019-07-12  3:43       ` 王贇
2019-07-12  7:58         ` Peter Zijlstra
2019-07-12  9:11           ` 王贇
2019-07-12  9:42             ` Peter Zijlstra
2019-07-12 10:10               ` 王贇
2019-07-15  2:09                 ` 王贇
2019-07-15 12:10                 ` Michal Koutný
2019-07-16  2:41                   ` 王贇
2019-07-19 16:47                     ` Michal Koutný
2019-07-03  3:29   ` [PATCH 2/4] numa: append per-node execution info in memory.numa_stat 王贇
2019-07-11 13:45     ` Peter Zijlstra
2019-07-12  3:17       ` 王贇
2019-07-03  3:32   ` [PATCH 3/4] numa: introduce numa group per task group 王贇
2019-07-11 14:10     ` Peter Zijlstra
2019-07-12  4:03       ` 王贇
2019-07-03  3:34   ` [PATCH 4/4] numa: introduce numa cling feature 王贇
2019-07-08  2:25     ` [PATCH v2 " 王贇
2019-07-11 14:27     ` [PATCH " Peter Zijlstra
2019-07-12  3:10       ` 王贇
2019-07-12  7:53         ` Peter Zijlstra
2019-07-12  8:58           ` 王贇
2019-07-22  3:44             ` 王贇
2019-07-11  9:00   ` [PATCH 0/4] per cgroup numa suite 王贇
2019-07-16  3:38   ` [PATCH v2 0/4] per-cgroup " 王贇
2019-07-16  3:39     ` [PATCH v2 1/4] numa: introduce per-cgroup numa balancing locality statistic 王贇
2019-07-16  3:40     ` [PATCH v2 2/4] numa: append per-node execution time in cpu.numa_stat 王贇
2019-07-19 16:39       ` Michal Koutný
2019-07-22  2:36         ` 王贇
2019-07-16  3:41     ` [PATCH v2 3/4] numa: introduce numa group per task group 王贇
2019-07-25  2:33     ` [PATCH v2 0/4] per-cgroup numa suite 王贇
2019-08-06  1:33     ` 王贇

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=42f47daa-22bb-3c93-9939-1514eb3bbda4@linux.alibaba.com \
    --to=yun.wang@linux.alibaba.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=vdavydov.dev@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox