From: Raghavendra K T <raghavendra.kt@amd.com>
To: <raghavendra.kt@amd.com>
Cc: <AneeshKumar.KizhakeVeetil@arm.com>, <Michael.Day@amd.com>,
<akpm@linux-foundation.org>, <bharata@amd.com>,
<dave.hansen@intel.com>, <david@redhat.com>,
<dongjoo.linux.dev@gmail.com>, <feng.tang@intel.com>,
<gourry@gourry.net>, <hannes@cmpxchg.org>, <honggyu.kim@sk.com>,
<hughd@google.com>, <jhubbard@nvidia.com>, <jon.grimm@amd.com>,
<k.shutemov@gmail.com>, <kbusch@meta.com>,
<kmanaouil.dev@gmail.com>, <leesuyeon0506@gmail.com>,
<leillc@google.com>, <liam.howlett@oracle.com>,
<linux-kernel@vger.kernel.org>, <linux-mm@kvack.org>,
<mgorman@techsingularity.net>, <mingo@redhat.com>,
<nadav.amit@gmail.com>, <nphamcs@gmail.com>,
<peterz@infradead.org>, <riel@surriel.com>, <rientjes@google.com>,
<rppt@kernel.org>, <santosh.shukla@amd.com>, <shivankg@amd.com>,
<shy828301@gmail.com>, <sj@kernel.org>, <vbabka@suse.cz>,
<weixugc@google.com>, <willy@infradead.org>,
<ying.huang@linux.alibaba.com>, <ziy@nvidia.com>,
<Jonathan.Cameron@huawei.com>, <dave@stgolabs.net>,
<yuanchu@google.com>, <kinseyho@google.com>, <hdanton@sina.com>,
<harry.yoo@oracle.com>
Subject: [RFC PATCH V3 10/17] mm: Add a heuristic to calculate target node
Date: Thu, 14 Aug 2025 15:33:00 +0000
Message-ID: <20250814153307.1553061-11-raghavendra.kt@amd.com>
In-Reply-To: <20250814153307.1553061-1-raghavendra.kt@amd.com>
One of the key challenges in PTE A bit based scanning is finding the
right target node to promote to.

Here is a simple heuristic-based approach:
1. While scanning the pages of an mm, also scan the toptier pages that
   belong to that mm.
2. Accumulate information about the distribution of active pages across
   the toptier nodes.
3. Walk all the toptier nodes and pick the one with the highest access
   count.

This method tries to consolidate the application onto a single node; a
minimal sketch of the selection step is included below.

TBD: Create a list of preferred nodes to fall back on when the node
with the highest access count is nearly full.
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
---
include/linux/mm_types.h | 4 +
mm/kscand.c | 198 +++++++++++++++++++++++++++++++++++++--
2 files changed, 192 insertions(+), 10 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d6b91e8a66d6..e3d8f11a5a04 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1148,6 +1148,10 @@ struct mm_struct {
/* numa_scan_seq prevents two threads remapping PTEs. */
int numa_scan_seq;
#endif
+#ifdef CONFIG_KSCAND
+ /* Tracks promotion node. XXX: use nodemask */
+ int target_node;
+#endif
/*
* An operation with batched TLB flushing is going on. Anything
* that can move process memory needs to flush the TLB when
diff --git a/mm/kscand.c b/mm/kscand.c
index 880c3693866d..bf975e82357d 100644
--- a/mm/kscand.c
+++ b/mm/kscand.c
@@ -104,6 +104,7 @@ struct kscand_mm_slot {
unsigned long scan_size;
long address;
bool is_scanned;
+ int target_node;
};
/* Data structure to keep track of current mm under scan */
@@ -116,13 +117,23 @@ struct kscand_scan kscand_scan = {
.mm_head = LIST_HEAD_INIT(kscand_scan.mm_head),
};
+/* Per memory node information used to calculate target_node for migration */
+struct kscand_nodeinfo {
+ unsigned long nr_scanned;
+ unsigned long nr_accessed;
+ int node;
+ bool is_toptier;
+};
+
/*
* Data structure passed to control scanning and also collect
* per memory node information
*/
struct kscand_scanctrl {
struct list_head scan_list;
+ struct kscand_nodeinfo *nodeinfo[MAX_NUMNODES];
unsigned long address;
+ unsigned long nr_to_scan;
};
struct kscand_scanctrl kscand_scanctrl;
@@ -218,15 +229,129 @@ static void kmigrated_wait_work(void)
migrate_sleep_jiffies);
}
-/*
- * Do not know what info to pass in the future to make
- * decision on taget node. Keep it void * now.
- */
+static unsigned long get_slowtier_accessed(struct kscand_scanctrl *scanctrl)
+{
+ int node;
+ unsigned long accessed = 0;
+
+ for_each_node_state(node, N_MEMORY) {
+ if (!node_is_toptier(node) && scanctrl->nodeinfo[node])
+ accessed += scanctrl->nodeinfo[node]->nr_accessed;
+ }
+ return accessed;
+}
+
+static inline unsigned long get_nodeinfo_nr_accessed(struct kscand_nodeinfo *ni)
+{
+ return ni->nr_accessed;
+}
+
+static inline void set_nodeinfo_nr_accessed(struct kscand_nodeinfo *ni, unsigned long val)
+{
+ ni->nr_accessed = val;
+}
+
+static inline unsigned long get_nodeinfo_nr_scanned(struct kscand_nodeinfo *ni)
+{
+ return ni->nr_scanned;
+}
+
+static inline void set_nodeinfo_nr_scanned(struct kscand_nodeinfo *ni, unsigned long val)
+{
+ ni->nr_scanned = val;
+}
+
+static inline void reset_nodeinfo_nr_scanned(struct kscand_nodeinfo *ni)
+{
+ set_nodeinfo_nr_scanned(ni, 0);
+}
+
+static inline void reset_nodeinfo(struct kscand_nodeinfo *ni)
+{
+ set_nodeinfo_nr_scanned(ni, 0);
+ set_nodeinfo_nr_accessed(ni, 0);
+}
+
+static void init_one_nodeinfo(struct kscand_nodeinfo *ni, int node)
+{
+ ni->nr_scanned = 0;
+ ni->nr_accessed = 0;
+ ni->node = node;
+ ni->is_toptier = node_is_toptier(node);
+}
+
+static struct kscand_nodeinfo *alloc_one_nodeinfo(int node)
+{
+ struct kscand_nodeinfo *ni;
+
+ ni = kzalloc(sizeof(*ni), GFP_KERNEL);
+
+ if (!ni)
+ return NULL;
+
+ init_one_nodeinfo(ni, node);
+
+ return ni;
+}
+
+/* TBD: Handle errors */
+static void init_scanctrl(struct kscand_scanctrl *scanctrl)
+{
+ struct kscand_nodeinfo *ni;
+ int node;
+
+ for_each_node(node) {
+ ni = alloc_one_nodeinfo(node);
+ if (WARN_ON_ONCE(!ni))
+ continue;
+ scanctrl->nodeinfo[node] = ni;
+ }
+}
+
+static void reset_scanctrl(struct kscand_scanctrl *scanctrl)
+{
+ int node;
+
+ for_each_node_state(node, N_MEMORY)
+ reset_nodeinfo(scanctrl->nodeinfo[node]);
+
+ /* XXX: Not really required? */
+ scanctrl->nr_to_scan = kscand_scan_size;
+}
+
+static void free_scanctrl(struct kscand_scanctrl *scanctrl)
+{
+ int node;
+
+ for_each_node(node)
+ kfree(scanctrl->nodeinfo[node]);
+}
+
static int kscand_get_target_node(void *data)
{
return kscand_target_node;
}
+static int get_target_node(struct kscand_scanctrl *scanctrl)
+{
+ int node, target_node = NUMA_NO_NODE;
+ unsigned long prev = 0;
+
+ for_each_node(node) {
+ if (node_is_toptier(node) && scanctrl->nodeinfo[node]) {
+ /* TBD: this is also where a fallback migration node list would be built */
+ if (get_nodeinfo_nr_accessed(scanctrl->nodeinfo[node]) > prev) {
+ prev = get_nodeinfo_nr_accessed(scanctrl->nodeinfo[node]);
+ target_node = node;
+ }
+ }
+ }
+ if (target_node == NUMA_NO_NODE)
+ target_node = kscand_get_target_node(NULL);
+
+ return target_node;
+}
+
extern bool migrate_balanced_pgdat(struct pglist_data *pgdat,
unsigned long nr_migrate_pages);
@@ -495,6 +620,14 @@ static int hot_vma_idle_pte_entry(pte_t *pte,
page_idle_clear_pte_refs(page, pte, walk);
srcnid = folio_nid(folio);
+ scanctrl->nodeinfo[srcnid]->nr_scanned++;
+ if (scanctrl->nr_to_scan)
+ scanctrl->nr_to_scan--;
+
+ if (!scanctrl->nr_to_scan) {
+ folio_put(folio);
+ return 1;
+ }
if (!folio_test_lru(folio)) {
folio_put(folio);
@@ -502,13 +635,17 @@ static int hot_vma_idle_pte_entry(pte_t *pte,
}
if (!kscand_eligible_srcnid(srcnid)) {
+ if (folio_test_young(folio) || folio_test_referenced(folio) ||
+ pte_young(pteval)) {
+ scanctrl->nodeinfo[srcnid]->nr_accessed++;
+ }
folio_put(folio);
return 0;
}
if (!folio_test_idle(folio) && !prev_idle &&
(folio_test_young(folio) || folio_test_referenced(folio))) {
- /* XXX: Leaking memory. TBD: consume info */
+ scanctrl->nodeinfo[srcnid]->nr_accessed++;
info = kzalloc(sizeof(struct kscand_migrate_info), GFP_NOWAIT);
if (info && scanctrl) {
@@ -697,7 +834,13 @@ static void kmigrated_migrate_mm(struct kmigrated_mm_slot *mm_slot)
spin_unlock(&mm_slot->migrate_lock);
- dest = kscand_get_target_node(NULL);
+ if (!mmap_read_trylock(mm)) {
+ dest = kscand_get_target_node(NULL);
+ } else {
+ dest = READ_ONCE(mm->target_node);
+ mmap_read_unlock(mm);
+ }
+
ret = kmigrated_promote_folio(info, mm, dest);
kfree(info);
@@ -783,7 +926,7 @@ static void kmigrated_migrate_folio(void)
* Increase scan_size by (1 << SCAN_SIZE_CHANGE_SHIFT).
*/
static inline void kscand_update_mmslot_info(struct kscand_mm_slot *mm_slot,
- unsigned long total)
+ unsigned long total, int target_node)
{
unsigned int scan_period;
unsigned long now;
@@ -831,6 +974,7 @@ static inline void kscand_update_mmslot_info(struct kscand_mm_slot *mm_slot,
mm_slot->scan_period = scan_period;
mm_slot->scan_size = scan_size;
mm_slot->scan_delta = total;
+ mm_slot->target_node = target_node;
}
static unsigned long kscand_scan_mm_slot(void)
@@ -839,6 +983,7 @@ static unsigned long kscand_scan_mm_slot(void)
bool update_mmslot_info = false;
unsigned int mm_slot_scan_period;
+ int target_node, mm_slot_target_node, mm_target_node;
unsigned long now;
unsigned long mm_slot_next_scan;
unsigned long mm_slot_scan_size;
@@ -872,6 +1017,7 @@ static unsigned long kscand_scan_mm_slot(void)
mm_slot_next_scan = mm_slot->next_scan;
mm_slot_scan_period = mm_slot->scan_period;
mm_slot_scan_size = mm_slot->scan_size;
+ mm_slot_target_node = mm_slot->target_node;
spin_unlock(&kscand_mm_lock);
if (unlikely(!mmap_read_trylock(mm)))
@@ -882,6 +1028,9 @@ static unsigned long kscand_scan_mm_slot(void)
goto outerloop;
}
+ mm_target_node = READ_ONCE(mm->target_node);
+ if (mm_target_node != mm_slot_target_node)
+ WRITE_ONCE(mm->target_node, mm_slot_target_node);
now = jiffies;
if (mm_slot_next_scan && time_before(now, mm_slot_next_scan))
@@ -889,24 +1038,41 @@ static unsigned long kscand_scan_mm_slot(void)
VMA_ITERATOR(vmi, mm, address);
+ /* Either scan 25% of scan_size worth of pages or cover vma size of scan_size */
+ kscand_scanctrl.nr_to_scan = mm_slot_scan_size >> PAGE_SHIFT;
+ /* Reduce the actual number of pages scanned to 25% */
+ kscand_scanctrl.nr_to_scan >>= 2;
+
+ /* XXX: skip scanning to avoid duplicates until all migrations done? */
kmigrated_mm_slot = kmigrated_get_mm_slot(mm, false);
for_each_vma(vmi, vma) {
kscand_walk_page_vma(vma, &kscand_scanctrl);
vma_scanned_size += vma->vm_end - vma->vm_start;
- if (vma_scanned_size >= kscand_scan_size) {
+ if (vma_scanned_size >= mm_slot_scan_size ||
+ !kscand_scanctrl.nr_to_scan) {
next_mm = true;
if (!list_empty(&kscand_scanctrl.scan_list)) {
if (!kmigrated_mm_slot)
kmigrated_mm_slot = kmigrated_get_mm_slot(mm, true);
+ /* Add scanned folios to migration list */
spin_lock(&kmigrated_mm_slot->migrate_lock);
+
list_splice_tail_init(&kscand_scanctrl.scan_list,
&kmigrated_mm_slot->migrate_head);
spin_unlock(&kmigrated_mm_slot->migrate_lock);
+ break;
}
- break;
+ }
+ if (!list_empty(&kscand_scanctrl.scan_list)) {
+ if (!kmigrated_mm_slot)
+ kmigrated_mm_slot = kmigrated_get_mm_slot(mm, true);
+ spin_lock(&kmigrated_mm_slot->migrate_lock);
+ list_splice_tail_init(&kscand_scanctrl.scan_list,
+ &kmigrated_mm_slot->migrate_head);
+ spin_unlock(&kmigrated_mm_slot->migrate_lock);
}
}
@@ -917,9 +1083,19 @@ static unsigned long kscand_scan_mm_slot(void)
update_mmslot_info = true;
+ total = get_slowtier_accessed(&kscand_scanctrl);
+ target_node = get_target_node(&kscand_scanctrl);
+
+ mm_target_node = READ_ONCE(mm->target_node);
+
+ /* XXX: Do we need write lock? */
+ if (mm_target_node != target_node)
+ WRITE_ONCE(mm->target_node, target_node);
+ reset_scanctrl(&kscand_scanctrl);
+
if (update_mmslot_info) {
mm_slot->address = address;
- kscand_update_mmslot_info(mm_slot, total);
+ kscand_update_mmslot_info(mm_slot, total, target_node);
}
outerloop:
@@ -1113,6 +1289,7 @@ static int stop_kscand(void)
kthread_stop(kscand_thread);
kscand_thread = NULL;
}
+ free_scanctrl(&kscand_scanctrl);
return 0;
}
@@ -1168,6 +1345,7 @@ static inline void init_list(void)
spin_lock_init(&kscand_migrate_lock);
init_waitqueue_head(&kscand_wait);
init_waitqueue_head(&kmigrated_wait);
+ init_scanctrl(&kscand_scanctrl);
}
static int __init kscand_init(void)
--
2.34.1