From: Bruno Faccini <bfaccini@nvidia.com>
To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org, akpm@linux-foundation.org, rppt@kernel.org,
ziy@nvidia.com, ttabi@nvidia.com, jhubbard@nvidia.com,
Bruno Faccini <bfaccini@nvidia.com>
Subject: [PATCH] mm/fake-numa: per-phys node fake size
Date: Sat, 21 Sep 2024 01:13:49 -0700
Message-ID: <20240921081348.10016-1-bfaccini@nvidia.com>

Determine the fake NUMA node size on a per-physical-node basis, to
handle cases where the amount of reserved memory differs greatly
between physical nodes. This allows the expected number of fake
nodes to be created and evenly interleaved.
Consider a system with 2 physical NUMA nodes where almost all
reserved memory sits inside a single node: computing the fake NUMA
node (fake=N) size as the ratio of all available/non-reserved
memory over N can make it impossible to create N/2 fake nodes
inside that physical node.
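As an illustration with made-up numbers (a standalone userspace sketch
of the sizing arithmetic only, not the kernel code itself; the array
contents and variable names below are purely hypothetical): on a 2-node
machine where node 0 has 2GB of usable memory and node 1 has 62GB,
fake=8 with a single global size gives 8GB fake nodes, none of which
fit in node 0, while per-physical-node sizing gives 512MB fake nodes
on node 0 and 15.5GB fake nodes on node 1, so 4 fake nodes fit in each:

#include <stdio.h>

int main(void)
{
	/* Hypothetical usable (non-reserved) memory per physical node, in MB. */
	unsigned long long usable[2] = { 2048ULL, 63488ULL };
	int nr_phys = 2, nr_fake = 8;
	unsigned long long total = usable[0] + usable[1];

	/* Current scheme: one global fake-node size, total / nr_fake. */
	unsigned long long global_size = total / nr_fake;
	printf("global fake-node size: %llu MB\n", global_size);
	printf("fake nodes fitting in phys node 0: %llu (expected %d)\n",
	       usable[0] / global_size, nr_fake / nr_phys);

	/*
	 * Patched scheme: size each physical node's fake nodes from its own
	 * usable memory, assuming nr_fake / nr_phys fake nodes per node.
	 */
	for (int i = 0; i < nr_phys; i++)
		printf("phys node %d fake-node size: %llu MB\n", i,
		       usable[i] * nr_phys / nr_fake);

	return 0;
}

The emulation itself is exercised by booting with numa=fake=<N> on the
kernel command line and checking the resulting layout under
/sys/devices/system/node/ or with numactl --hardware.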
Signed-off-by: Bruno Faccini <bfaccini@nvidia.com>
---
mm/numa_emulation.c | 66 ++++++++++++++++++++++++++-------------------
1 file changed, 39 insertions(+), 27 deletions(-)
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
index 031fb9961bf7..0c72c85cfc10 100644
--- a/mm/numa_emulation.c
+++ b/mm/numa_emulation.c
@@ -77,20 +77,19 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei,
}
/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr.
+ * Sets up nr_nodes fake nodes interleaved over all physical nodes
*
* Returns zero on success or negative on error.
*/
static int __init split_nodes_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
- u64 addr, u64 max_addr, int nr_nodes)
+ int nr_nodes)
{
nodemask_t physnode_mask = numa_nodes_parsed;
- u64 size;
- int big;
- int nid = 0;
- int i, ret;
+ int nid = 0, physnodes_with_mem = 0;
+ int i, ret, phys_blk;
+ static u64 sizes[MAX_NUMNODES] __initdata;
+ static int bigs[MAX_NUMNODES] __initdata;
if (nr_nodes <= 0)
return -1;
@@ -100,25 +99,41 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
nr_nodes = MAX_NUMNODES;
}
- /*
- * Calculate target node size. x86_32 freaks on __udivdi3() so do
- * the division in ulong number of pages and convert back.
- */
- size = max_addr - addr - mem_hole_size(addr, max_addr);
- size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+ /* count physical nodes with memory */
+ for_each_node_mask(i, physnode_mask) {
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0)
+ continue;
+ physnodes_with_mem++;
+ }
/*
- * Calculate the number of big nodes that can be allocated as a result
- * of consolidating the remainder.
+ * Calculate target fake node sizes for each physical node with memory.
+ * x86_32 freaks on __udivdi3() so do the division in ulong number of
+ * pages and convert back.
*/
- big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
- FAKE_NODE_MIN_SIZE;
+ for_each_node_mask(i, physnode_mask) {
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0)
+ continue;
- size &= FAKE_NODE_MIN_HASH_MASK;
- if (!size) {
- pr_err("Not enough memory for each node. "
- "NUMA emulation disabled.\n");
- return -1;
+ sizes[i] = pi->blk[phys_blk].end - pi->blk[phys_blk].start -
+ mem_hole_size(pi->blk[phys_blk].start, pi->blk[phys_blk].end);
+ sizes[i] = PFN_PHYS((unsigned long)(sizes[i] >> PAGE_SHIFT) /
+ nr_nodes * physnodes_with_mem);
+
+ /*
+ * Calculate the number of big nodes that can be allocated as a result
+ * of consolidating the remainder.
+ */
+ bigs[i] = ((sizes[i] & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / physnodes_with_mem /
+ FAKE_NODE_MIN_SIZE;
+ sizes[i] &= FAKE_NODE_MIN_HASH_MASK;
+ if (!sizes[i]) {
+ pr_err("Not enough memory for each node inside physical numa node %d. NUMA emulation disabled.\n",
+ i);
+ return -1;
+ }
}
/*
@@ -138,16 +150,16 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
}
start = pi->blk[phys_blk].start;
limit = pi->blk[phys_blk].end;
- end = start + size;
+ end = start + sizes[i];
- if (nid < big)
+ if (nid < bigs[i])
end += FAKE_NODE_MIN_SIZE;
/*
* Continue to add memory to this fake node if its
* non-reserved memory is less than the per-node size.
*/
- while (end - start - mem_hole_size(start, end) < size) {
+ while (end - start - mem_hole_size(start, end) < sizes[i]) {
end += FAKE_NODE_MIN_SIZE;
if (end > limit) {
end = limit;
@@ -169,7 +181,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
* next node, this one must extend to the end of the
* physical node.
*/
- if (limit - end - mem_hole_size(end, limit) < size)
+ if (limit - end - mem_hole_size(end, limit) < sizes[i])
end = limit;
ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
@@ -432,7 +444,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
unsigned long n;
n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
- ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+ ret = split_nodes_interleave(&ei, &pi, n);
}
if (*emu_cmdline == ':')
emu_cmdline++;
--
2.34.1
Thread overview (6 messages):
2024-09-21 8:13 Bruno Faccini [this message]
2024-09-24 10:40 ` Mike Rapoport
2024-09-24 15:27 ` Bruno Faccini
2024-09-25 9:28 ` Mike Rapoport
2024-09-29 15:43 ` Bruno Faccini
2024-10-01 7:15 ` Mike Rapoport