On 14/08/25 07:18PM, Bharata B Rao wrote:
>This introduces a sub-system for collecting memory access
>information from different sources. It maintains the hotness
>information based on the access history and time of access.
>
>Additionally, it provides per-lowertier-node kernel threads
>(named kpromoted) that periodically promote the pages that
>are eligible for promotion.
>
>Sub-systems that generate hot page access info can report that
>using this API:
>
>int pghot_record_access(u64 pfn, int nid, int src,
>			unsigned long time)
>
>@pfn: The PFN of the memory accessed
>@nid: The accessing NUMA node ID
>@src: The temperature source (sub-system) that generated the
>      access info
>@time: The access time in jiffies
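
For readers following along, here is a minimal sketch of how a
temperature source might report an access through this API (the
wrapper function below is hypothetical, not part of the patch):

	/*
	 * Hypothetical caller: a hint-processing path that has decoded
	 * a (pfn, accessing nid) pair from a hardware hint source.
	 * nid may be NUMA_NO_NODE when the source cannot provide it.
	 */
	static void report_hw_hint(u64 pfn, int accessing_nid)
	{
		pghot_record_access(pfn, accessing_nid, PGHOT_HW_HINTS,
				    jiffies);
	}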
>
>Some temperature sources may not provide the nid from which
>the page was accessed. This is true for sources that use
>page table scanning for the PTE Accessed bit. For such sources,
>the default toptier node to which such pages should be promoted
>is hard-coded.
>
>Also, the access time provided by some sources may at best be
>considered approximate. This is especially true for hot pages
>detected by PTE A bit scanning.
>
>The hot PFN records are stored in hash lists hashed by PFN value.
>The PFN records that are categorized as hot enough to be promoted
>are maintained in a per-lowertier-node max heap from which
>kpromoted extracts and promotes them.
>
>Each record stores the following info:
>
>struct pghot_info {
>	unsigned long pfn;
>
>	unsigned long last_update;	/* Most recent access time */
>	int frequency;	/* Number of accesses within current window */
>	int nid;	/* Most recent access from this node */
>
>	struct hlist_node hnode;
>	size_t heap_idx;	/* Position in max heap for quick retrieval */
>};
>
>The way in which a page is categorized as hot enough to be
>promoted is pretty primitive now.
>
>Signed-off-by: Bharata B Rao
>---
> include/linux/mmzone.h        |  11 +
> include/linux/pghot.h         |  87 ++++++
> include/linux/vm_event_item.h |   9 +
> mm/Kconfig                    |  11 +
> mm/Makefile                   |   1 +
> mm/mm_init.c                  |  10 +
> mm/pghot.c                    | 501 ++++++++++++++++++++++++++++++++++
> mm/vmstat.c                   |   9 +
> 8 files changed, 639 insertions(+)
> create mode 100644 include/linux/pghot.h
> create mode 100644 mm/pghot.c
>
>diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>index 0c5da9141983..f7094babed10 100644
>--- a/include/linux/mmzone.h
>+++ b/include/linux/mmzone.h
>@@ -1349,6 +1349,10 @@ struct memory_failure_stats {
> };
> #endif
>
>+#ifdef CONFIG_PGHOT
>+#include <linux/pghot.h>
>+#endif
>+
> /*
>  * On NUMA machines, each NUMA node would have a pg_data_t to describe
>  * it's memory layout. On UMA machines there is a single pglist_data which
>@@ -1497,6 +1501,13 @@ typedef struct pglist_data {
> #ifdef CONFIG_MEMORY_FAILURE
> 	struct memory_failure_stats mf_stats;
> #endif
>+#ifdef CONFIG_PGHOT
>+	struct task_struct *kpromoted;
>+	wait_queue_head_t kpromoted_wait;
>+	struct pghot_info **phi_buf;
>+	struct max_heap heap;
>+	spinlock_t heap_lock;
>+#endif
> } pg_data_t;
>
> #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
>diff --git a/include/linux/pghot.h b/include/linux/pghot.h
>new file mode 100644
>index 000000000000..6b8496944e7f
>--- /dev/null
>+++ b/include/linux/pghot.h
>@@ -0,0 +1,87 @@
>+/* SPDX-License-Identifier: GPL-2.0 */
>+#ifndef _LINUX_KPROMOTED_H
>+#define _LINUX_KPROMOTED_H
>+
>+#include
>+#include
>+#include
>+
>+/* Page hotness temperature sources */
>+enum pghot_src {
>+	PGHOT_HW_HINTS,
>+	PGHOT_PGTABLE_SCAN,
>+};
>+
>+#ifdef CONFIG_PGHOT
>+
>+#define KPROMOTED_FREQ_WINDOW	(5 * MSEC_PER_SEC)
>+
>+/* 2 accesses within a window will make the page a promotion candidate */
>+#define KPRMOTED_FREQ_THRESHOLD	2
>+
>+/*
>+ * The following two defines control the number of hash lists
>+ * that are maintained for tracking PFN accesses.
>+ */
>+#define PGHOT_HASH_PCT		50	/* % of lower tier memory pages to track */
>+#define PGHOT_HASH_ENTRIES	1024	/* Number of entries per list, ideal case */
>+
>+/*
>+ * Percentage of hash entries that can reside in heap as migrate-ready
>+ * candidates
>+ */
>+#define PGHOT_HEAP_PCT		25
>+
>+#define KPRMOTED_MIGRATE_BATCH	1024
>+
>+/*
>+ * If target NID isn't available, kpromoted promotes to node 0
>+ * by default.
>+ *
>+ * TODO: Need checks to validate that the default node is indeed
>+ * present and is a toptier node.
>+ */
>+#define KPROMOTED_DEFAULT_NODE	0
>+
>+struct pghot_info {
>+	unsigned long pfn;
>+
>+	/*
>+	 * The following are the three fundamental parameters
>+	 * required to track the hotness of a page/PFN.
>+	 *
>+	 * TODO:
>+	 * Check if these three can fit into a u32.
>+	 * With 3 bits for frequency (8 most recent accesses),
>+	 * 10 bits for nid (1024 nodes), the remaining 19 bits
>+	 * are available for timestamp.
>+	 */
>+	unsigned long last_update;	/* Most recent access time */
>+	int frequency;	/* Number of accesses within current window */
>+	int nid;	/* Most recent access from this node */
>+
>+	struct hlist_node hnode;
>+	size_t heap_idx;	/* Position in max heap for quick retrieval */
>+};
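
On the TODO above: a packed layout along the lines the comment
describes could look like the sketch below (purely illustrative, not
part of the patch; how to derive and compare the truncated timestamp
would still need to be worked out):

	/* Illustrative packing per the TODO: 3 + 10 + 19 = 32 bits */
	struct pghot_packed {
		u32 time	: 19;	/* truncated jiffies snapshot */
		u32 nid		: 10;	/* up to 1024 nodes */
		u32 frequency	:  3;	/* saturating access count */
	};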
>+
>+struct max_heap {
>+	size_t nr;
>+	size_t size;
>+	struct pghot_info **data;
>+	DECLARE_FLEX_ARRAY(struct pghot_info *, preallocated);
>+};
>+
>+/*
>+ * The wakeup interval of kpromoted threads
>+ */
>+#define KPROMOTE_DELAY	20	/* 20ms */
>+
>+int pghot_record_access(u64 pfn, int nid, int src, unsigned long now);
>+#else
>+static inline int pghot_record_access(u64 pfn, int nid, int src,
>+				      unsigned long now)
>+{
>+	return 0;
>+}
>+#endif /* CONFIG_PGHOT */
>+#endif /* _LINUX_KPROMOTED_H */
>diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
>index 9e15a088ba38..9085e5c2d4aa 100644
>--- a/include/linux/vm_event_item.h
>+++ b/include/linux/vm_event_item.h
>@@ -186,6 +186,15 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
> 	KSTACK_REST,
> #endif
> #endif /* CONFIG_DEBUG_STACK_USAGE */
>+	PGHOT_RECORDED_ACCESSES,
>+	PGHOT_RECORD_HWHINTS,
>+	PGHOT_RECORD_PGTSCANS,
>+	PGHOT_RECORDS_HASH,
>+	PGHOT_RECORDS_HEAP,
>+	KPROMOTED_RIGHT_NODE,
>+	KPROMOTED_NON_LRU,
>+	KPROMOTED_COLD_OLD,
>+	KPROMOTED_DROPPED,
> 	NR_VM_EVENT_ITEMS
> };
>
>diff --git a/mm/Kconfig b/mm/Kconfig
>index e443fe8cd6cf..8b236eb874cf 100644
>--- a/mm/Kconfig
>+++ b/mm/Kconfig
>@@ -1381,6 +1381,17 @@ config PT_RECLAIM
>
> 	  Note: now only empty user PTE page table pages will be reclaimed.
>
>+config PGHOT
>+	bool "Hot page tracking and promotion"
>+	default y
>+	depends on NUMA && MIGRATION && MMU
>+	select MIN_HEAP
>+	help
>+	  A sub-system to track page accesses in lower tier memory and
>+	  maintain hot page information. Promotes hot pages from lower
>+	  tiers to the top tier by using the memory access information
>+	  provided by various sources. Asynchronous promotion is done
>+	  by per-node kernel threads.
>
> source "mm/damon/Kconfig"
>
>diff --git a/mm/Makefile b/mm/Makefile
>index ef54aa615d9d..8799bd0c68ed 100644
>--- a/mm/Makefile
>+++ b/mm/Makefile
>@@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
> obj-$(CONFIG_EXECMEM) += execmem.o
> obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
> obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
>+obj-$(CONFIG_PGHOT) += kpromoted.o

Looks like the older file name was used here by mistake; it should be
pghot.o.

Also, could you please provide the base commit? I am unable to apply
this patch cleanly using the b4 utility.
Regards,
Alok Rathore

>diff --git a/mm/mm_init.c b/mm/mm_init.c
>index 5c21b3af216b..f7992be3ff7f 100644
>--- a/mm/mm_init.c
>+++ b/mm/mm_init.c
>@@ -1402,6 +1402,15 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat)
> static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
> #endif
>
>+#ifdef CONFIG_PGHOT
>+static void pgdat_init_kpromoted(struct pglist_data *pgdat)
>+{
>+	init_waitqueue_head(&pgdat->kpromoted_wait);
>+}
>+#else
>+static void pgdat_init_kpromoted(struct pglist_data *pgdat) {}
>+#endif
>+
> static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
> {
> 	int i;
>@@ -1411,6 +1420,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
>
> 	pgdat_init_split_queue(pgdat);
> 	pgdat_init_kcompactd(pgdat);
>+	pgdat_init_kpromoted(pgdat);
>
> 	init_waitqueue_head(&pgdat->kswapd_wait);
> 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
>diff --git a/mm/pghot.c b/mm/pghot.c
>new file mode 100644
>index 000000000000..eadcf970c3ef
>--- /dev/null
>+++ b/mm/pghot.c
>@@ -0,0 +1,501 @@
>+// SPDX-License-Identifier: GPL-2.0
>+/*
>+ * Maintains information about hot pages from slower tier nodes and
>+ * promotes them.
>+ *
>+ * Info about accessed pages is stored in hash lists indexed by PFN.
>+ * Info about pages that are hot enough to be promoted is stored in
>+ * a per-toptier-node max_heap.
>+ *
>+ * kpromoted is a kernel thread that runs on each toptier node and
>+ * promotes pages from the max_heap.
>+ *
>+ * TODO:
>+ * - Compact pghot_info so that nid, time and frequency can fit in a u32
>+ * - Scalar hotness value as a function of frequency and recency
>+ * - Possibility of moving migration rate limiting to kpromoted
>+ */
>+#include
>+#include
>+#include
>+#include
>+#include
>+#include
>+#include
>+#include
>+#include
>+#include
>+
>+struct pghot_hash {
>+	struct hlist_head hash;
>+	spinlock_t lock;
>+};
>+
>+static struct pghot_hash *phi_hash;
>+static int phi_hash_order;
>+static int phi_heap_entries;
>+static struct kmem_cache *phi_cache __ro_after_init;
>+static bool kpromoted_started __ro_after_init;
>+
>+static bool phi_heap_less(const void *lhs, const void *rhs, void *args)
>+{
>+	return (*(struct pghot_info **)lhs)->frequency >
>+	       (*(struct pghot_info **)rhs)->frequency;
>+}
>+
>+static void phi_heap_swp(void *lhs, void *rhs, void *args)
>+{
>+	struct pghot_info **l = (struct pghot_info **)lhs;
>+	struct pghot_info **r = (struct pghot_info **)rhs;
>+	int lindex = l - (struct pghot_info **)args;
>+	int rindex = r - (struct pghot_info **)args;
>+	struct pghot_info *tmp = *l;
>+
>+	*l = *r;
>+	*r = tmp;
>+
>+	(*l)->heap_idx = lindex;
>+	(*r)->heap_idx = rindex;
>+}
>+
>+static const struct min_heap_callbacks phi_heap_cb = {
>+	.less = phi_heap_less,
>+	.swp = phi_heap_swp,
>+};
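
A small observation for other readers: phi_heap_less() inverts the
comparison so the kernel's min-heap machinery behaves as a max heap,
and phi_heap_swp() recomputes array indices from the heap base (passed
via args) so the heap_idx back-pointers stay valid across sift
operations. The hypothetical debug helper below captures the invariant
the update path relies on (not in the patch, just to illustrate):

	/*
	 * Hypothetical debug-only check: after any heap operation,
	 * data[i]->heap_idx == i must hold, otherwise
	 * phi_heap_update_entry() would sift from the wrong slot.
	 */
	static void phi_heap_check(struct max_heap *phi_heap)
	{
		size_t i;

		for (i = 0; i < phi_heap->nr; i++)
			WARN_ON(phi_heap->data[i]->heap_idx != i);
	}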
>+
>+static void phi_heap_update_entry(struct max_heap *phi_heap, struct pghot_info *phi)
>+{
>+	int orig_idx = phi->heap_idx;
>+
>+	min_heap_sift_up(phi_heap, phi->heap_idx, &phi_heap_cb,
>+			 phi_heap->data);
>+	if (phi_heap->data[phi->heap_idx]->heap_idx == orig_idx)
>+		min_heap_sift_down(phi_heap, phi->heap_idx,
>+				   &phi_heap_cb, phi_heap->data);
>+}
>+
>+static bool phi_heap_insert(struct max_heap *phi_heap, struct pghot_info *phi)
>+{
>+	if (phi_heap->nr >= phi_heap_entries)
>+		return false;
>+
>+	phi->heap_idx = phi_heap->nr;
>+	min_heap_push(phi_heap, &phi, &phi_heap_cb, phi_heap->data);
>+
>+	return true;
>+}
>+
>+static bool phi_is_pfn_hot(struct pghot_info *phi)
>+{
>+	struct page *page = pfn_to_online_page(phi->pfn);
>+	unsigned long now = jiffies;
>+	struct folio *folio;
>+
>+	if (!page || is_zone_device_page(page))
>+		return false;
>+
>+	folio = page_folio(page);
>+	if (!folio_test_lru(folio)) {
>+		count_vm_event(KPROMOTED_NON_LRU);
>+		return false;
>+	}
>+	if (folio_nid(folio) == phi->nid) {
>+		count_vm_event(KPROMOTED_RIGHT_NODE);
>+		return false;
>+	}
>+
>+	/* If the page was hot a while ago, don't promote */
>+	if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
>+		count_vm_event(KPROMOTED_COLD_OLD);
>+		return false;
>+	}
>+	return true;
>+}
>+
>+static struct folio *kpromoted_isolate_folio(struct pghot_info *phi)
>+{
>+	struct page *page = pfn_to_page(phi->pfn);
>+	struct folio *folio;
>+
>+	if (!page)
>+		return NULL;
>+
>+	folio = page_folio(page);
>+	if (migrate_misplaced_folio_prepare(folio, NULL, phi->nid))
>+		return NULL;
>+	else
>+		return folio;
>+}
>+
>+static struct pghot_info *phi_alloc(unsigned long pfn)
>+{
>+	struct pghot_info *phi;
>+
>+	phi = kmem_cache_zalloc(phi_cache, GFP_NOWAIT);
>+	if (!phi)
>+		return NULL;
>+
>+	phi->pfn = pfn;
>+	phi->heap_idx = -1;
>+	return phi;
>+}
>+
>+static inline void phi_free(struct pghot_info *phi)
>+{
>+	kmem_cache_free(phi_cache, phi);
>+}
>+
>+static int phi_heap_extract(pg_data_t *pgdat, int batch_count, int freq_th,
>+			    struct list_head *migrate_list, int *count)
>+{
>+	spinlock_t *phi_heap_lock = &pgdat->heap_lock;
>+	struct max_heap *phi_heap = &pgdat->heap;
>+	int max_retries = 10;
>+	int bkt, i = 0;
>+
>+	if (batch_count < 0 || !migrate_list || !count || freq_th < 1 ||
>+	    freq_th > KPRMOTED_FREQ_THRESHOLD)
>+		return -EINVAL;
>+
>+	*count = 0;
>+	for (i = 0; i < batch_count; i++) {
>+		struct pghot_info *top = NULL;
>+		bool should_continue = false;
>+		struct folio *folio;
>+		int retries = 0;
>+
>+		while (retries < max_retries) {
>+			spin_lock(phi_heap_lock);
>+			if (phi_heap->nr > 0 && phi_heap->data[0]->frequency >= freq_th) {
>+				should_continue = true;
>+				bkt = hash_min(phi_heap->data[0]->pfn, phi_hash_order);
>+				top = phi_heap->data[0];
>+			}
>+			spin_unlock(phi_heap_lock);
>+
>+			if (!should_continue)
>+				goto done;
>+
>+			spin_lock(&phi_hash[bkt].lock);
>+			spin_lock(phi_heap_lock);
>+			if (phi_heap->nr == 0 || phi_heap->data[0] != top ||
>+			    phi_heap->data[0]->frequency < freq_th) {
>+				spin_unlock(phi_heap_lock);
>+				spin_unlock(&phi_hash[bkt].lock);
>+				retries++;
>+				continue;
>+			}
>+
>+			top = phi_heap->data[0];
>+			hlist_del_init(&top->hnode);
>+
>+			phi_heap->nr--;
>+			if (phi_heap->nr > 0) {
>+				phi_heap->data[0] = phi_heap->data[phi_heap->nr];
>+				phi_heap->data[0]->heap_idx = 0;
>+				min_heap_sift_down(phi_heap, 0, &phi_heap_cb,
>+						   phi_heap->data);
>+			}
>+
>+			spin_unlock(phi_heap_lock);
>+			spin_unlock(&phi_hash[bkt].lock);
>+
>+			if (!phi_is_pfn_hot(top)) {
>+				count_vm_event(KPROMOTED_DROPPED);
>+				goto skip;
>+			}
>+
>+			folio = kpromoted_isolate_folio(top);
>+			if (folio) {
>+				list_add(&folio->lru, migrate_list);
>+				(*count)++;
>+			}
>+skip:
>+			phi_free(top);
>+			break;
>+		}
>+		if (retries >= max_retries) {
>+			pr_warn("%s: Too many retries\n", __func__);
>+			break;
>+		}
>+	}
>+done:
>+	return 0;
>+}
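
The locking dance in phi_heap_extract() took me a moment to follow, so
here is its shape in pseudocode (peek_top(), unlink() etc. are
shorthand, not real helpers): the bucket lock must be taken before the
heap lock, but the bucket cannot be known without peeking at the heap,
hence the optimistic peek followed by revalidation:

	/* Condensed, illustrative shape of one extraction */
	lock(heap); top = peek_top(); bkt = hash(top->pfn); unlock(heap);
	lock(bucket[bkt]);		/* bucket lock first ... */
	lock(heap);			/* ... then heap lock */
	if (peek_top() != top)		/* heap changed meanwhile */
		unlock both and retry;
	unlink top from hash and heap;
	unlock(heap); unlock(bucket[bkt]);
	isolate and queue top for migration, or drop it;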
>+
>+static void phi_heap_add_or_adjust(struct pghot_info *phi)
>+{
>+	pg_data_t *pgdat = NODE_DATA(phi->nid);
>+	struct max_heap *phi_heap = &pgdat->heap;
>+
>+	spin_lock(&pgdat->heap_lock);
>+	if (phi->heap_idx >= 0 && phi->heap_idx < phi_heap->nr &&
>+	    phi_heap->data[phi->heap_idx] == phi) {
>+		/* Entry exists in heap */
>+		if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
>+			/* Below threshold, remove from the heap */
>+			phi_heap->nr--;
>+			if (phi->heap_idx < phi_heap->nr) {
>+				phi_heap->data[phi->heap_idx] =
>+						phi_heap->data[phi_heap->nr];
>+				phi_heap->data[phi->heap_idx]->heap_idx =
>+						phi->heap_idx;
>+				min_heap_sift_down(phi_heap, phi->heap_idx,
>+						   &phi_heap_cb, phi_heap->data);
>+			}
>+			phi->heap_idx = -1;
>+
>+		} else {
>+			/* Update position in heap */
>+			phi_heap_update_entry(phi_heap, phi);
>+		}
>+	} else if (phi->frequency >= KPRMOTED_FREQ_THRESHOLD) {
>+		/* Add to the heap */
>+		if (phi_heap_insert(phi_heap, phi))
>+			count_vm_event(PGHOT_RECORDS_HEAP);
>+	}
>+	spin_unlock(&pgdat->heap_lock);
>+}
>+
>+static struct pghot_info *phi_lookup(unsigned long pfn, int bkt)
>+{
>+	struct pghot_info *phi;
>+
>+	hlist_for_each_entry(phi, &phi_hash[bkt].hash, hnode) {
>+		if (phi->pfn == pfn)
>+			return phi;
>+	}
>+	return NULL;
>+}
>+
>+/*
>+ * Called by subsystems that generate page hotness/access information.
>+ *
>+ * @pfn: The PFN of the memory accessed
>+ * @nid: The accessing NUMA node ID
>+ * @src: The temperature source (sub-system) that generated the
>+ *	 access info
>+ * @time: The access time in jiffies
>+ *
>+ * Maintains the access records per PFN, classifies them as
>+ * hot based on subsequent accesses and finally hands them
>+ * over to kpromoted for migration.
>+ */
>+int pghot_record_access(u64 pfn, int nid, int src, unsigned long now)
>+{
>+	struct pghot_info *phi;
>+	struct page *page;
>+	struct folio *folio;
>+	int bkt;
>+	bool new_entry = false, new_window = false;
>+
>+	if (!kpromoted_started)
>+		return -EINVAL;
>+
>+	count_vm_event(PGHOT_RECORDED_ACCESSES);
>+
>+	switch (src) {
>+	case PGHOT_HW_HINTS:
>+		count_vm_event(PGHOT_RECORD_HWHINTS);
>+		break;
>+	case PGHOT_PGTABLE_SCAN:
>+		count_vm_event(PGHOT_RECORD_PGTSCANS);
>+		break;
>+	default:
>+		return -EINVAL;
>+	}
>+
>+	/*
>+	 * Record only accesses from lower tiers.
>+	 */
>+	if (node_is_toptier(pfn_to_nid(pfn)))
>+		return 0;
>+
>+	/*
>+	 * Reject the non-migratable pages right away.
>+	 */
>+	page = pfn_to_online_page(pfn);
>+	if (!page || is_zone_device_page(page))
>+		return 0;
>+
>+	folio = page_folio(page);
>+	if (!folio_test_lru(folio))
>+		return 0;
>+
>+	bkt = hash_min(pfn, phi_hash_order);
>+	spin_lock(&phi_hash[bkt].lock);
>+	phi = phi_lookup(pfn, bkt);
>+	if (!phi) {
>+		phi = phi_alloc(pfn);
>+		if (!phi)
>+			goto out;
>+		new_entry = true;
>+	}
>+
>+	if (((now - phi->last_update) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) ||
>+	    (nid != NUMA_NO_NODE && phi->nid != nid))
>+		new_window = true;
>+
>+	if (new_entry || new_window) {
>+		/* New window */
>+		phi->frequency = 1;	/* TODO: Factor in the history */
>+	} else
>+		phi->frequency++;
>+	phi->last_update = now;
>+	phi->nid = (nid == NUMA_NO_NODE) ? KPROMOTED_DEFAULT_NODE : nid;
>+
>+	if (new_entry) {
>+		/* Insert the new entry into hash table */
>+		hlist_add_head(&phi->hnode, &phi_hash[bkt].hash);
>+		count_vm_event(PGHOT_RECORDS_HASH);
>+	} else {
>+		/* Add/update the position in heap */
>+		phi_heap_add_or_adjust(phi);
>+	}
>+out:
>+	spin_unlock(&phi_hash[bkt].lock);
>+	return 0;
>+}
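
To make the window semantics concrete, here is a worked timeline under
the defaults (KPROMOTED_FREQ_WINDOW = 5s, KPRMOTED_FREQ_THRESHOLD = 2)
for repeated accesses to one PFN from the same node, as I read the
code:

	t = 0s   first access  -> new hash entry, frequency = 1
	t = 1s   second access -> same window, frequency = 2,
	                          entry added to the heap
	t = 9s   third access  -> more than 5s since last_update,
	                          new window, frequency = 1,
	                          entry removed from the heap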
>+
>+/*
>+ * Extract the hot page records and batch-migrate the
>+ * hot pages.
>+ */
>+static void kpromoted_migrate(pg_data_t *pgdat)
>+{
>+	int count, ret;
>+	LIST_HEAD(migrate_list);
>+
>+	/*
>+	 * Extract the top N elements from the heap that match
>+	 * the requested hotness threshold.
>+	 *
>+	 * PFNs ineligible from the migration standpoint are removed
>+	 * from the heap and hash.
>+	 *
>+	 * Folios eligible for migration are isolated and returned
>+	 * in @migrate_list.
>+	 */
>+	ret = phi_heap_extract(pgdat, KPRMOTED_MIGRATE_BATCH,
>+			       KPRMOTED_FREQ_THRESHOLD, &migrate_list, &count);
>+	if (ret)
>+		return;
>+
>+	if (!list_empty(&migrate_list))
>+		migrate_misplaced_folios_batch(&migrate_list, pgdat->node_id);
>+}
>+
>+static int kpromoted(void *p)
>+{
>+	pg_data_t *pgdat = (pg_data_t *)p;
>+
>+	while (!kthread_should_stop()) {
>+		wait_event_timeout(pgdat->kpromoted_wait, false,
>+				   msecs_to_jiffies(KPROMOTE_DELAY));
>+		kpromoted_migrate(pgdat);
>+	}
>+	return 0;
>+}
>+
>+static int kpromoted_run(int nid)
>+{
>+	pg_data_t *pgdat = NODE_DATA(nid);
>+	int ret = 0;
>+
>+	if (!node_is_toptier(nid))
>+		return 0;
>+
>+	if (!pgdat->phi_buf) {
>+		pgdat->phi_buf = vzalloc_node(phi_heap_entries * sizeof(struct pghot_info *),
>+					      nid);
>+		if (!pgdat->phi_buf)
>+			return -ENOMEM;
>+
>+		min_heap_init(&pgdat->heap, pgdat->phi_buf, phi_heap_entries);
>+		spin_lock_init(&pgdat->heap_lock);
>+	}
>+
>+	if (!pgdat->kpromoted)
>+		pgdat->kpromoted = kthread_create_on_node(kpromoted, pgdat, nid,
>+							  "kpromoted%d", nid);
>+	if (IS_ERR(pgdat->kpromoted)) {
>+		ret = PTR_ERR(pgdat->kpromoted);
>+		pgdat->kpromoted = NULL;
>+		pr_info("Failed to start kpromoted%d, ret %d\n", nid, ret);
>+	} else {
>+		wake_up_process(pgdat->kpromoted);
>+	}
>+	return ret;
>+}
>+
>+static int __init pghot_init(void)
>+{
>+	unsigned int hash_size;
>+	size_t hash_entries;
>+	size_t nr_pages = 0;
>+	pg_data_t *pgdat;
>+	int i, nid, ret;
>+
>+	/*
>+	 * Arrive at the hash and heap sizes based on the
>+	 * number of pages present in the lower tier nodes.
>+	 */
>+	for_each_node_state(nid, N_MEMORY) {
>+		if (!node_is_toptier(nid))
>+			nr_pages += NODE_DATA(nid)->node_present_pages;
>+	}
>+
>+	if (!nr_pages)
>+		return 0;
>+
>+	hash_entries = nr_pages * PGHOT_HASH_PCT / 100;
>+	hash_size = hash_entries / PGHOT_HASH_ENTRIES;
>+	phi_hash_order = ilog2(hash_size);
>+
>+	phi_hash = vmalloc(sizeof(struct pghot_hash) * hash_size);
>+	if (!phi_hash) {
>+		ret = -ENOMEM;
>+		goto out;
>+	}
>+
>+	for (i = 0; i < hash_size; i++) {
>+		INIT_HLIST_HEAD(&phi_hash[i].hash);
>+		spin_lock_init(&phi_hash[i].lock);
>+	}
>+
>+	phi_cache = KMEM_CACHE(pghot_info, 0);
>+	if (unlikely(!phi_cache)) {
>+		ret = -ENOMEM;
>+		goto out;
>+	}
>+
>+	phi_heap_entries = hash_entries * PGHOT_HEAP_PCT / 100;
>+	for_each_node_state(nid, N_CPU) {
>+		ret = kpromoted_run(nid);
>+		if (ret)
>+			goto out_stop_kthread;
>+	}
>+
>+	kpromoted_started = true;
>+	pr_info("pghot: Started page hotness monitoring and promotion thread\n");
>+	pr_info("pghot: nr_pages %zu hash_size %u hash_entries %zu hash_order %d heap_entries %d\n",
>+		nr_pages, hash_size, hash_entries, phi_hash_order, phi_heap_entries);
>+	return 0;
>+
>+out_stop_kthread:
>+	for_each_node_state(nid, N_CPU) {
>+		pgdat = NODE_DATA(nid);
>+		if (pgdat->kpromoted) {
>+			kthread_stop(pgdat->kpromoted);
>+			pgdat->kpromoted = NULL;
>+			vfree(pgdat->phi_buf);
>+		}
>+	}
>+out:
>+	kmem_cache_destroy(phi_cache);
>+	vfree(phi_hash);
>+	return ret;
>+}
>+
>+late_initcall(pghot_init)
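
For anyone else sizing this in their head, the arithmetic in
pghot_init() works out as follows for, say, 64 GiB of lower-tier
memory with 4 KiB pages (illustrative numbers only):

	nr_pages     = 64 GiB / 4 KiB          = 16,777,216
	hash_entries = nr_pages * 50 / 100     =  8,388,608
	hash_size    = hash_entries / 1024     =      8,192 buckets
	hash_order   = ilog2(8192)             =         13
	heap_entries = hash_entries * 25 / 100 =  2,097,152 per toptier node
	               (~16 MiB of pghot_info pointers per node for phi_buf)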
>diff --git a/mm/vmstat.c b/mm/vmstat.c
>index 71cd1ceba191..9edbdd71c6f7 100644
>--- a/mm/vmstat.c
>+++ b/mm/vmstat.c
>@@ -1496,6 +1496,15 @@ const char * const vmstat_text[] = {
> #endif
> #undef I
> #endif /* CONFIG_VM_EVENT_COUNTERS */
>+	"pghot_recorded_accesses",
>+	"pghot_recorded_hwhints",
>+	"pghot_recorded_pgtscans",
>+	"pghot_records_hash",
>+	"pghot_records_heap",
>+	"kpromoted_right_node",
>+	"kpromoted_non_lru",
>+	"kpromoted_cold_old",
>+	"kpromoted_dropped",
> };
> #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
>
>--
>2.34.1
>