From: Gregory Price <gourry.memverge@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: linux-cxl@vger.kernel.org, linux-mm@kvack.org,
	cgroups@vger.kernel.org, linux-doc@vger.kernel.org,
	ying.huang@intel.com, akpm@linux-foundation.org,
	mhocko@kernel.org, tj@kernel.org, lizefan.x@bytedance.com,
	hannes@cmpxchg.org, corbet@lwn.net, roman.gushchin@linux.dev,
	shakeelb@google.com, muchun.song@linux.dev,
	Gregory Price <gregory.price@memverge.com>
Subject: [RFC PATCH v4 1/3] mm/memcontrol: implement memcg.interleave_weights
Date: Wed,  8 Nov 2023 19:25:15 -0500
Message-ID: <20231109002517.106829-2-gregory.price@memverge.com>
In-Reply-To: <20231109002517.106829-1-gregory.price@memverge.com>

Create an RCU-protected array of unsigned char[MAX_NUMNODES] in which
interleave weights can be stored.  These weights are intended to be
consumed by mempolicy to implement weighted interleave for bandwidth
optimization.

Node weights are assigned via cgroup/memory.interleave_weights.

Example: Set a 3:1 weighting ratio for nodes 0 and 1, respectively.
  echo 0:3 > cgroup/memory.interleave_weights
  echo 1:1 > cgroup/memory.interleave_weights

Example output:
  cat cgroup/memory.interleave_weights
  0:3,1:1
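
With these weights, the weighted interleave policy added in patch 2/3
is expected to place roughly three pages on node 0 for every page
placed on node 1.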

Child cgroups inherit parent interleave weights and may override them.
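
Example (hypothetical parent/child cgroup paths on a two-node system):
  echo 0:3 > parent/memory.interleave_weights
  cat parent/child/memory.interleave_weights
  0:3,1:1
  echo 1:2 > parent/child/memory.interleave_weights
  cat parent/child/memory.interleave_weights
  0:3,1:2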

To revert to inheriting weights from the parent, write "-1:0".

Example:
  echo -1:0 > cgroup/memory.interleave_weights
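
Continuing the hypothetical hierarchy above, reads then fall through to
the parent's weights again:
  echo -1:0 > parent/child/memory.interleave_weights
  cat parent/child/memory.interleave_weights
  0:3,1:1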

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 include/linux/memcontrol.h |  31 +++++++
 mm/memcontrol.c            | 172 +++++++++++++++++++++++++++++++++++++
 2 files changed, 203 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e24da16d2c..338a9dcda446 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,8 @@
 #include <linux/vmstat.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/numa.h>
+#include <linux/nodemask.h>
 
 struct mem_cgroup;
 struct obj_cgroup;
@@ -167,6 +169,15 @@ struct mem_cgroup_thresholds {
 	struct mem_cgroup_threshold_ary *spare;
 };
 
+/* For mempolicy information */
+struct mem_cgroup_mempolicy {
+	/*
+	 * When interleaving is applied, allocate from each node in proportion
+	 * to its weight.  Size is always MAX_NUMNODES.  Protected by RCU.
+	 */
+	unsigned char __rcu *il_weights;
+};
+
 /*
  * Remember four most recent foreign writebacks with dirty pages in this
  * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
@@ -265,6 +276,12 @@ struct mem_cgroup {
 	/* thresholds for mem+swap usage. RCU-protected */
 	struct mem_cgroup_thresholds memsw_thresholds;
 
+	/* protect the mempolicy settings */
+	struct mutex mempolicy_lock;
+
+	/* mempolicy defaults for tasks */
+	struct mem_cgroup_mempolicy mempolicy;
+
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
 
@@ -1159,6 +1176,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
+
+unsigned char mem_cgroup_get_il_weight(unsigned int nid);
+
+unsigned int mem_cgroup_get_il_weights(nodemask_t *nodes,
+				       unsigned char *weights);
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1591,6 +1614,14 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 {
 	return 0;
 }
+
+static inline unsigned char mem_cgroup_get_il_weight(unsigned int nid) { return 0; }
+
+static inline unsigned int
+mem_cgroup_get_il_weights(nodemask_t *nodes, unsigned char *weights)
+{
+	return 0;
+}
 #endif /* CONFIG_MEMCG */
 
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5b009b233ab8..67e8c1767471 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5319,6 +5319,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	INIT_WORK(&memcg->high_work, high_work_func);
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
+	mutex_init(&memcg->mempolicy_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
 	INIT_LIST_HEAD(&memcg->event_list);
@@ -7896,6 +7897,176 @@ static struct cftype zswap_files[] = {
 };
 #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
 
+unsigned char mem_cgroup_get_il_weight(unsigned int nid)
+{
+	struct mem_cgroup *memcg;
+	unsigned char weight = 0;
+	unsigned char *weights;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	while (memcg && !mem_cgroup_is_root(memcg)) {
+		weights = rcu_dereference(memcg->mempolicy.il_weights);
+		if (weights) {
+			weight = weights[nid];
+			break;
+		}
+		memcg = parent_mem_cgroup(memcg);
+	}
+	rcu_read_unlock();
+
+	return weight;
+}
+
+unsigned int mem_cgroup_get_il_weights(nodemask_t *nodes,
+				       unsigned char *weights)
+{
+	struct mem_cgroup *memcg;
+	unsigned char *memcg_weights;
+	unsigned int nid;
+	unsigned int total = 0;
+	unsigned char weight;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	while (memcg && !mem_cgroup_is_root(memcg)) {
+		memcg_weights = rcu_dereference(memcg->mempolicy.il_weights);
+		if (!memcg_weights) {
+			memcg = parent_mem_cgroup(memcg);
+			continue;
+		}
+
+		for_each_node_mask(nid, *nodes) {
+			weight = memcg_weights[nid];
+			weights[nid] = weight ? weight : 1;
+			total += weights[nid];
+		}
+		break;
+	}
+	rcu_read_unlock();
+
+	return total;
+}
+
+static int mpol_ilw_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg;
+	unsigned char *weights = NULL;
+	unsigned int nid;
+	unsigned int count = 0;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_seq(m);
+
+	while (memcg && !mem_cgroup_is_root(memcg)) {
+		weights = rcu_dereference(memcg->mempolicy.il_weights);
+		if (weights)
+			break;
+		memcg = parent_mem_cgroup(memcg);
+	}
+	for_each_node(nid) {
+		seq_printf(m, "%s%d:%d", (count++ ? "," : ""), nid,
+			   weights ? weights[nid] : 1);
+	}
+	seq_putc(m, '\n');
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static ssize_t mpol_ilw_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct mem_cgroup *pmcg;
+	unsigned char *new_weights = NULL, *old_weights = NULL;
+	int node;
+	unsigned char weight;
+	ssize_t ret;
+	char *sep = memchr(buf, ':', nbytes);
+	bool parent_weights = false;
+
+	if (!sep || sep == buf || sep == (buf + nbytes - 1))
+		return -EINVAL;
+	*sep = '\0';
+
+	ret = kstrtoint(buf, 10, &node);
+	if (ret)
+		return ret;
+
+	ret = kstrtou8(sep + 1, 10, &weight);
+	if (ret)
+		return ret;
+
+	/*
+	 * if value is -1:0, clear weights and set pointer to NULL
+	 * this allows the parent cgroup settings to take over
+	 */
+	if (node == -1 && weight == 0)
+		goto set_weights;
+	else if (node < 0)
+		return -EINVAL;
+	else if (node >= MAX_NUMNODES || weight == 0)
+		return -EINVAL;
+
+	new_weights = kzalloc(sizeof(unsigned char)*MAX_NUMNODES, GFP_KERNEL);
+	if (!new_weights)
+		return -ENOMEM;
+set_weights:
+	/* acquire mutex and readlock so we can read from parents if needed */
+	mutex_lock(&memcg->mempolicy_lock);
+	rcu_read_lock();
+	old_weights = rcu_dereference(memcg->mempolicy.il_weights);
+
+	/* If we're clearing the weights, don't bother looking at old ones */
+	if (!new_weights)
+		goto swap_weights;
+
+	/* Check for parent weights to inherit */
+	pmcg = memcg;
+	while (!old_weights) {
+		pmcg = parent_mem_cgroup(pmcg);
+
+		if (!pmcg || mem_cgroup_is_root(pmcg))
+			break;
+		old_weights = rcu_dereference(pmcg->mempolicy.il_weights);
+		parent_weights = true;
+	}
+
+	/* Copy the old weights or default all nodes to 1 */
+	if (old_weights)
+		memcpy(new_weights, old_weights,
+		       sizeof(unsigned char)*MAX_NUMNODES);
+	else
+		memset(new_weights, 1,
+		       sizeof(unsigned char)*MAX_NUMNODES);
+	new_weights[node] = weight;
+
+swap_weights:
+	rcu_assign_pointer(memcg->mempolicy.il_weights, new_weights);
+
+	rcu_read_unlock();
+	synchronize_rcu();
+
+	/* If we are inheriting weights from the parent, do not free */
+	if (old_weights && !parent_weights)
+		kfree(old_weights);
+
+	mutex_unlock(&memcg->mempolicy_lock);
+
+	return nbytes;
+}
+
+static struct cftype mempolicy_files[] = {
+	{
+		.name = "interleave_weights",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = mpol_ilw_show,
+		.write = mpol_ilw_write,
+	},
+	{ }	/* terminate */
+};
+
 static int __init mem_cgroup_swap_init(void)
 {
 	if (mem_cgroup_disabled())
@@ -7906,6 +8077,7 @@ static int __init mem_cgroup_swap_init(void)
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
 	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
 #endif
+	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, mempolicy_files));
 	return 0;
 }
 subsys_initcall(mem_cgroup_swap_init);
-- 
2.39.1


