From: Ryan Roberts <ryan.roberts@arm.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	Chris Li <chrisl@kernel.org>, Kairui Song <kasong@tencent.com>,
	"Huang, Ying" <ying.huang@intel.com>,
	Kalesh Singh <kaleshsingh@google.com>,
	Barry Song <baohua@kernel.org>, Hugh Dickins <hughd@google.com>,
	David Hildenbrand <david@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org
Subject: [RFC PATCH v1 5/5] mm: swap: Optimize per-order cluster scanning
Date: Wed, 19 Jun 2024 00:26:45 +0100
Message-ID: <20240618232648.4090299-6-ryan.roberts@arm.com>
In-Reply-To: <20240618232648.4090299-1-ryan.roberts@arm.com>

Add a CLUSTER_FLAG_SKIP_SCAN cluster flag, which is applied to a
cluster under one of two conditions. While the flag is present, the
cluster is skipped during a per-order scan.

- When the number of free entries is less than the number of entries
  that would be required for a new allocation of the order that the
  cluster serves.

- When scanning of the cluster completes, no further scanners are
  active for the cluster, and no swap entries have been freed from the
  cluster since the last scan began. This proves that the cluster
  contains no contiguous free entries of sufficient size to allocate
  the order that it serves, so the cluster is skipped until the next
  entry is freed, at which point it becomes eligible for scanning
  again.

The latter is implemented to permit multiple CPUs to scan the same
cluster, which in turn guarantees that if a free block of the desired
order is available in a cluster allocated for that order, it will be
allocated on a first come, first served basis.
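
To make the protocol concrete, here is a simplified stand-alone model
(a hypothetical user-space sketch, not the patch itself: scan_begin(),
entry_freed() and scan_end() correspond to cluster_inc_scanners(),
dec_cluster_info_page() and cluster_dec_scanners() in the diff below,
all of which run under the si lock):

	#define SKIP_SCAN	0x4	/* mirrors CLUSTER_FLAG_SKIP_SCAN */
	#define DECREMENT	0x8	/* mirrors CLUSTER_FLAG_DECREMENT */

	struct cluster {
		unsigned int flags;
		unsigned int nr_scanners;
	};

	/* A CPU starts scanning: frees from before this point no longer
	 * count as "recent". */
	void scan_begin(struct cluster *c)
	{
		c->nr_scanners++;
		c->flags &= ~DECREMENT;
	}

	/* A swap entry is freed: the cluster may be worth scanning again.
	 * (The patch clears SKIP_SCAN only once enough free entries exist
	 * for one allocation of the cluster's order.) */
	void entry_freed(struct cluster *c)
	{
		c->flags |= DECREMENT;
		c->flags &= ~SKIP_SCAN;
	}

	/* A CPU finishes scanning without success: if it was the last
	 * active scanner and nothing was freed since the most recent scan
	 * began, the cluster provably holds no free block of the required
	 * size, so skip it until the next free. */
	void scan_end(struct cluster *c)
	{
		c->nr_scanners--;
		if (c->nr_scanners == 0 && !(c->flags & DECREMENT))
			c->flags |= SKIP_SCAN;
	}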

As a result, the number of active scanners for a cluster must be
tracked, costing 4 bytes per cluster.
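
For example, with CONFIG_THP_SWAP on x86_64 with 4K pages
(SWAPFILE_CLUSTER = HPAGE_PMD_NR = 512, so 2MiB clusters), that works
out to 2KiB of overhead per 1GiB of swap.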

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 include/linux/swap.h |  3 +++
 mm/swapfile.c        | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 34ec4668a5c9..40c308749e79 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -257,9 +257,12 @@ struct swap_cluster_info {
 	unsigned int data:24;
 	unsigned int flags:4;
 	unsigned int order:4;
+	unsigned int nr_scanners;
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
 #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
+#define CLUSTER_FLAG_SKIP_SCAN 4 /* Skip cluster for per-order scan */
+#define CLUSTER_FLAG_DECREMENT 8 /* A swap entry was freed from cluster */

 /*
  * swap_info_struct::max is an unsigned int, so the maximum number of pages in
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 24db03db8830..caf382b4ecd3 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -574,6 +574,9 @@ static void add_cluster_info_page(struct swap_info_struct *p,
 	VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
 	cluster_set_count(&cluster_info[idx],
 		cluster_count(&cluster_info[idx]) + count);
+
+	if (SWAPFILE_CLUSTER - cluster_count(&cluster_info[idx]) < count)
+		cluster_info[idx].flags |= CLUSTER_FLAG_SKIP_SCAN;
 }

 /*
@@ -595,6 +598,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
 	struct swap_cluster_info *cluster_info, unsigned long page_nr)
 {
 	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+	unsigned long count = cluster_info ? 1 << cluster_info[idx].order : 0;

 	if (!cluster_info)
 		return;
@@ -603,6 +607,10 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
 	cluster_set_count(&cluster_info[idx],
 		cluster_count(&cluster_info[idx]) - 1);

+	cluster_info[idx].flags |= CLUSTER_FLAG_DECREMENT;
+	if (SWAPFILE_CLUSTER - cluster_count(&cluster_info[idx]) >= count)
+		cluster_info[idx].flags &= ~CLUSTER_FLAG_SKIP_SCAN;
+
 	if (cluster_count(&cluster_info[idx]) == 0)
 		free_cluster(p, idx);
 }
@@ -708,7 +716,8 @@ static unsigned int next_cluster_for_scan(struct swap_info_struct *si,
 	end = offset_to_cluster(si, *stop);

 	while (ci != end) {
-		if ((ci->flags & CLUSTER_FLAG_FREE) == 0 && ci->order == order)
+		if ((ci->flags & (CLUSTER_FLAG_SKIP_SCAN | CLUSTER_FLAG_FREE)) == 0
+		    && ci->order == order)
 			break;
 		ci = next_cluster_circular(si, ci);
 	}
@@ -722,6 +731,21 @@ static unsigned int next_cluster_for_scan(struct swap_info_struct *si,
 	return cluster_to_offset(si, ci);
 }

+static inline void cluster_inc_scanners(struct swap_cluster_info *ci)
+{
+	/* Protected by si lock. */
+	ci->nr_scanners++;
+	ci->flags &= ~CLUSTER_FLAG_DECREMENT;
+}
+
+static inline void cluster_dec_scanners(struct swap_cluster_info *ci)
+{
+	/* Protected by si lock. */
+	ci->nr_scanners--;
+	if (ci->nr_scanners == 0 && (ci->flags & CLUSTER_FLAG_DECREMENT) == 0)
+		ci->flags |= CLUSTER_FLAG_SKIP_SCAN;
+}
+
 /*
  * Try to get swap entries with specified order from current cpu's swap entry
  * pool (a cluster). This might involve allocating a new cluster for current CPU
@@ -764,6 +788,8 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 				return false;
 		} else
 			return false;
+
+		cluster_inc_scanners(offset_to_cluster(si, tmp));
 	}

 	/*
@@ -780,13 +806,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 	}
 	unlock_cluster(ci);
 	if (tmp >= max) {
+		cluster_dec_scanners(ci);
 		cluster->next[order] = SWAP_NEXT_INVALID;
 		goto new_cluster;
 	}
 	*offset = tmp;
 	*scan_base = tmp;
 	tmp += nr_pages;
-	cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
+	if (tmp >= max) {
+		cluster_dec_scanners(ci);
+		cluster->next[order] = SWAP_NEXT_INVALID;
+	} else {
+		cluster->next[order] = tmp;
+	}
 	return true;
 }

--
2.43.0


