From: Joshua Hahn <joshua.hahnjy@gmail.com>
To: Gregory Price <gourry@gourry.net>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Alistair Popple <apopple@nvidia.com>,
	Byungchul Park <byungchul@sk.com>,
	David Hildenbrand <david@redhat.com>,
	Matthew Brost <matthew.brost@intel.com>,
	Rakie Kim <rakie.kim@sk.com>,
	Ying Huang <ying.huang@linux.alibaba.com>,
	Zi Yan <ziy@nvidia.com>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	kernel-team@meta.com
Subject: [PATCH 2/2] mm/mempolicy: Skip extra call to __alloc_pages_bulk in weighted interleave
Date: Thu, 26 Jun 2025 13:09:34 -0700	[thread overview]
Message-ID: <20250626200936.3974420-3-joshua.hahnjy@gmail.com> (raw)
In-Reply-To: <20250626200936.3974420-1-joshua.hahnjy@gmail.com>

Currently, alloc_pages_bulk_weighted_interleave can make up to nr_node_ids + 1
calls to __alloc_pages_bulk. The extra call happens when a previous call to this
function left the weighted round-robin allocation partway through a node's
weight. To make up for this, the next call first makes an extra allocation to
finish cleanly on a node boundary before performing the weighted round-robin
cycles again.
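
For example (illustrative numbers): with two nodes weighted 3:1, suppose a
previous call stopped after handing out only part of node 0's share, leaving
il_weight == 2. The next call first issues a bulk allocation of
min(rem_pages, 2) pages on node 0 to finish that share, and only then enters
the per-node loop, so a single request can cost one __alloc_pages_bulk call
per node plus that initial catch-up call.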

Instead of making that additional call, we can calculate how many pages still
need to be allocated from the first node (the carryover) and add that value to
the number of pages the first node receives as part of the current round-robin
cycle.
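
To illustrate the arithmetic, here is a minimal userspace sketch (made-up
weights and page counts, printf standing in for the actual bulk allocator) of
how the carryover folds into the first node's share:

/*
 * Userspace sketch, not kernel code: only mirrors the per-node
 * page-count math, with hypothetical weights.
 */
#include <stdio.h>

int main(void)
{
	unsigned char weights[] = { 3, 1 };	/* hypothetical node weights */
	int nnodes = 2;
	unsigned long rem_pages = 13;		/* pages requested this call */
	unsigned long carryover = 2;		/* pages left over on node 0 */
	unsigned long weight_total = 3 + 1;
	unsigned long rounds, delta, node_pages;

	rounds = (rem_pages - carryover) / weight_total;
	delta = (rem_pages - carryover) % weight_total;

	for (int node = 0; node < nnodes; node++) {
		unsigned long w = weights[node];

		/* the first node also absorbs the carryover pages */
		node_pages = w * rounds + (delta < w ? delta : w) + carryover;
		delta -= (delta < w ? delta : w);
		carryover = 0;
		printf("node %d: %lu pages\n", node, node_pages);
	}
	return 0;
}

With these numbers both approaches hand out the same per-node totals
(11 pages on node 0, 2 on node 1); the carryover simply rides along with the
first node's count instead of triggering a separate bulk call.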

A quick benchmark of compiling the kernel shows a small performance
improvement. The experiments were run on a machine with 2 nodes, each with
125GB of memory and 40 CPUs; the 6.16 column below is the unpatched baseline.

time numactl -w 0,1 make -j$(nproc)

+----------+---------+------------+---------+
| Time (s) |  6.16   | With patch | % Delta |
+----------+---------+------------+---------+
| Real     |  88.374 |    88.3356 | -0.2019 |
| User     |  3631.7 |   3636.263 |  0.0631 |
| Sys      | 366.029 |    363.792 | -0.7534 |
+----------+---------+------------+---------+

Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>

---
 mm/mempolicy.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 78ad74a0e249..0d693f96cf66 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2569,7 +2569,7 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
 	unsigned long node_pages, delta;
 	u8 *weights, weight;
 	unsigned int weight_total = 0;
-	unsigned long rem_pages = nr_pages;
+	unsigned long rem_pages = nr_pages, carryover = 0;
 	nodemask_t nodes;
 	int nnodes, node;
 	int resume_node = MAX_NUMNODES - 1;
@@ -2594,18 +2594,12 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
 	node = me->il_prev;
 	weight = me->il_weight;
 	if (weight && node_isset(node, nodes)) {
-		node_pages = min(rem_pages, weight);
-		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
-						  page_array);
-		page_array += nr_allocated;
-		total_allocated += nr_allocated;
-		/* if that's all the pages, no need to interleave */
 		if (rem_pages <= weight) {
-			me->il_weight -= rem_pages;
-			return total_allocated;
+			node_pages = rem_pages;
+			me->il_weight -= node_pages;
+			goto allocate;
 		}
-		/* Otherwise we adjust remaining pages, continue from there */
-		rem_pages -= weight;
+		carryover = weight;
 	}
 	/* clear active weight in case of an allocation failure */
 	me->il_weight = 0;
@@ -2614,7 +2608,7 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
 	/* create a local copy of node weights to operate on outside rcu */
 	weights = kzalloc(nr_node_ids, GFP_KERNEL);
 	if (!weights)
-		return total_allocated;
+		return 0;
 
 	rcu_read_lock();
 	state = rcu_dereference(wi_state);
@@ -2634,16 +2628,17 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
 	/*
 	 * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
 	 * Track which node weighted interleave should resume from.
+	 * Account for carryover. It is always allocated from the first node.
 	 *
 	 * if (rounds > 0) and (delta == 0), resume_node will always be
 	 * the node following prev_node and its weight.
 	 */
-	rounds = rem_pages / weight_total;
-	delta = rem_pages % weight_total;
+	rounds = (rem_pages - carryover) / weight_total;
+	delta = (rem_pages - carryover) % weight_total;
 	resume_node = next_node_in(prev_node, nodes);
 	resume_weight = weights[resume_node];
+	node = carryover ? prev_node : next_node_in(prev_node, nodes);
 	for (i = 0; i < nnodes; i++) {
-		node = next_node_in(prev_node, nodes);
 		weight = weights[node];
 		/* when delta is depleted, resume from that node */
 		if (delta && delta < weight) {
@@ -2651,12 +2646,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
 			resume_weight = weight - delta;
 		}
 		/* Add the node's portion of the delta, if there is one */
-		node_pages = weight * rounds + min(delta, weight);
+		node_pages = weight * rounds + min(delta, weight) + carryover;
 		delta -= min(delta, weight);
+		carryover = 0;
 
 		/* node_pages can be 0 if an allocation fails and rounds == 0 */
 		if (!node_pages)
 			break;
+allocate:
 		nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
 						  page_array);
 		page_array += nr_allocated;
@@ -2664,10 +2661,14 @@ static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
 		if (total_allocated == nr_pages)
 			break;
 		prev_node = node;
+		node = next_node_in(prev_node, nodes);
+	}
+
+	if (weights) {
+		me->il_prev = resume_node;
+		me->il_weight = resume_weight;
+		kfree(weights);
 	}
-	me->il_prev = resume_node;
-	me->il_weight = resume_weight;
-	kfree(weights);
 	return total_allocated;
 }
 
-- 
2.47.1



Thread overview: 14+ messages
2025-06-26 20:09 [PATCH 0/2] mm/mempolicy: Cleanup and optimization for weighted interleave Joshua Hahn
2025-06-26 20:09 ` [PATCH 1/2] mm/mempolicy: Simplify weighted interleave bulk alloc calculations Joshua Hahn
2025-06-26 21:51   ` David Hildenbrand
2025-06-27  4:31   ` Gregory Price
2025-06-27  7:38   ` Rakie Kim
2025-06-27  7:45   ` Oscar Salvador
2025-06-26 20:09 ` Joshua Hahn [this message]
2025-06-27  4:28   ` [PATCH 2/2] mm/mempolicy: Skip extra call to __alloc_pages_bulk in weighted interleave Gregory Price
2025-06-27 16:13     ` Joshua Hahn
2025-06-30 15:39       ` Joshua Hahn
2025-06-30 20:05   ` Kees Bakker
2025-06-30 20:21     ` Joshua Hahn
2025-06-30 22:35       ` Andrew Morton
2025-06-30 23:01         ` Joshua Hahn
