On 06/12/25 03:44PM, Bharata B Rao wrote: >Currently hot page promotion (NUMA_BALANCING_MEMORY_TIERING >mode of NUMA Balancing) does hot page detection (via hint faults), >hot page classification and eventual promotion, all by itself and >sits within the scheduler. > >With the new hot page tracking and promotion mechanism being >available, NUMA Balancing can limit itself to detection of >hot pages (via hint faults) and off-load rest of the >functionality to the common hot page tracking system. > >pghot_record_access(PGHOT_HINT_FAULT) API is used to feed the >hot page info. In addition, the migration rate limiting and >dynamic threshold logic are moved to kmigrated so that the same >can be used for hot pages reported by other sources too. > >Signed-off-by: Bharata B Rao >--- a/mm/pghot.c >+++ b/mm/pghot.c >@@ -12,6 +12,9 @@ > * the hot pages. kmigrated runs for each lower tier node. It iterates > * over the node's PFNs and migrates pages marked for migration into > * their targeted nodes. >+ * >+ * Migration rate-limiting and dynamic threshold logic implementations >+ * were moved from NUMA Balancing mode 2. > */ > #include > #include >@@ -25,6 +28,8 @@ static unsigned int pghot_freq_threshold = PGHOT_DEFAULT_FREQ_THRESHOLD; > static unsigned int kmigrated_sleep_ms = KMIGRATED_DEFAULT_SLEEP_MS; > static unsigned int kmigrated_batch_nr = KMIGRATED_DEFAULT_BATCH_NR; > >+/* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ >+static unsigned int sysctl_pghot_promote_rate_limit = 65536; > static unsigned int sysctl_pghot_freq_window = PGHOT_DEFAULT_FREQ_WINDOW; > > static DEFINE_STATIC_KEY_FALSE(pghot_src_hwhints); >@@ -43,6 +48,14 @@ static const struct ctl_table pghot_sysctls[] = { > .proc_handler = proc_dointvec_minmax, > .extra1 = SYSCTL_ZERO, > }, >+ { >+ .procname = "pghot_promote_rate_limit_MBps", >+ .data = &sysctl_pghot_promote_rate_limit, >+ .maxlen = sizeof(unsigned int), >+ .mode = 0644, >+ .proc_handler = proc_dointvec_minmax, >+ .extra1 = SYSCTL_ZERO, >+ }, > }; > #endif > >@@ -137,8 +150,13 @@ int pghot_record_access(unsigned long pfn, int nid, int src, unsigned long now) > old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK; > old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK; > >- if (((time - old_time) > msecs_to_jiffies(sysctl_pghot_freq_window)) >- || (nid != NUMA_NO_NODE && old_nid != nid)) >+ /* >+ * Bypass the new window logic for NUMA hint fault source >+ * as it is too slow in reporting accesses. >+ * TODO: Fix this. >+ */ >+ if ((((time - old_time) > msecs_to_jiffies(sysctl_pghot_freq_window)) >+ && (src != PGHOT_HINT_FAULT)) || (nid != NUMA_NO_NODE && old_nid != nid)) > new_window = true; > > if (new_window) >@@ -166,6 +184,110 @@ int pghot_record_access(unsigned long pfn, int nid, int src, unsigned long now) > return 0; > } > >+/* >+ * For memory tiering mode, if there are enough free pages (more than >+ * enough watermark defined here) in fast memory node, to take full >+ * advantage of fast memory capacity, all recently accessed slow >+ * memory pages will be migrated to fast memory node without >+ * considering hot threshold. 
>+ */ >+static bool pgdat_free_space_enough(struct pglist_data *pgdat) >+{ >+ int z; >+ unsigned long enough_wmark; >+ >+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, >+ pgdat->node_present_pages >> 4); >+ for (z = pgdat->nr_zones - 1; z >= 0; z--) { >+ struct zone *zone = pgdat->node_zones + z; >+ >+ if (!populated_zone(zone)) >+ continue; >+ >+ if (zone_watermark_ok(zone, 0, >+ promo_wmark_pages(zone) + enough_wmark, >+ ZONE_MOVABLE, 0)) >+ return true; >+ } >+ return false; >+} >+ >+/* >+ * For memory tiering mode, too high promotion/demotion throughput may >+ * hurt application latency. So we provide a mechanism to rate limit >+ * the number of pages that are tried to be promoted. >+ */ >+static bool kmigrated_promotion_rate_limit(struct pglist_data *pgdat, unsigned long rate_limit, >+ int nr, unsigned long now_ms) >+{ >+ unsigned long nr_cand; >+ unsigned int start; >+ >+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); >+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); >+ start = pgdat->nbp_rl_start; >+ if (now_ms - start > MSEC_PER_SEC && >+ cmpxchg(&pgdat->nbp_rl_start, start, now_ms) == start) >+ pgdat->nbp_rl_nr_cand = nr_cand; >+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) >+ return true; >+ return false; >+} >+ >+static void kmigrated_promotion_adjust_threshold(struct pglist_data *pgdat, >+ unsigned long rate_limit, unsigned int ref_th, >+ unsigned long now_ms) >+{ >+ unsigned int start, th_period, unit_th, th; >+ unsigned long nr_cand, ref_cand, diff_cand; >+ >+ th_period = KMIGRATED_PROMOTION_THRESHOLD_WINDOW; >+ start = pgdat->nbp_th_start; >+ if (now_ms - start > th_period && >+ cmpxchg(&pgdat->nbp_th_start, start, now_ms) == start) { >+ ref_cand = rate_limit * >+ KMIGRATED_PROMOTION_THRESHOLD_WINDOW / MSEC_PER_SEC; >+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); >+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand; >+ unit_th = ref_th * 2 / KMIGRATED_MIGRATION_ADJUST_STEPS; >+ th = pgdat->nbp_threshold ? 
: ref_th; >+ if (diff_cand > ref_cand * 11 / 10) >+ th = max(th - unit_th, unit_th); >+ else if (diff_cand < ref_cand * 9 / 10) >+ th = min(th + unit_th, ref_th * 2); >+ pgdat->nbp_th_nr_cand = nr_cand; >+ pgdat->nbp_threshold = th; >+ } >+} >+ >+static bool kmigrated_should_migrate_memory(unsigned long nr_pages, unsigned long nid, >+ unsigned long time) >+{ >+ struct pglist_data *pgdat; >+ unsigned long rate_limit; >+ unsigned int th, def_th; >+ unsigned long now = jiffies; now = jiffies & PGHOT_TIME_MASK; >+ unsigned long now_ms = jiffies_to_msecs(now); >+ >+ pgdat = NODE_DATA(nid); >+ if (pgdat_free_space_enough(pgdat)) { >+ /* workload changed, reset hot threshold */ >+ pgdat->nbp_threshold = 0; >+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr_pages); >+ return true; >+ } >+ >+ def_th = sysctl_pghot_freq_window; >+ rate_limit = MB_TO_PAGES(sysctl_pghot_promote_rate_limit); >+ kmigrated_promotion_adjust_threshold(pgdat, rate_limit, def_th, now_ms); >+ >+ th = pgdat->nbp_threshold ? : def_th; >+ if (jiffies_to_msecs(now - time) >= th) The time in the pfn hotness is set using PGHOT_TIME_MASK in pghot_record_access(). Therefore, `now` should also be calculated using PGHOT_TIME_MASK here, so that the comparison against the stored time is correct. Regards, Alok Rathore