From: Alok Rathore <alok.rathore@samsung.com>
To: Bharata B Rao <bharata@amd.com>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Jonathan.Cameron@huawei.com, dave.hansen@intel.com,
gourry@gourry.net, mgorman@techsingularity.net, mingo@redhat.com,
peterz@infradead.org, raghavendra.kt@amd.com, riel@surriel.com,
rientjes@google.com, sj@kernel.org, weixugc@google.com,
willy@infradead.org, ying.huang@linux.alibaba.com,
ziy@nvidia.com, dave@stgolabs.net, nifan.cxl@gmail.com,
xuezhengchu@huawei.com, yiannis@zptcorp.com,
akpm@linux-foundation.org, david@redhat.com, byungchul@sk.com,
kinseyho@google.com, joshua.hahnjy@gmail.com, yuanchu@google.com,
balbirs@nvidia.com, shivankg@amd.com, alokrathore20@gmail.com,
gost.dev@samsung.com, cpgs@samsung.com
Subject: Re: [RFC PATCH v4 8/9] mm: sched: Move hot page promotion from NUMAB=2 to pghot tracking
Date: Mon, 22 Dec 2025 15:56:55 +0530
Message-ID: <1983025922.01766400002783.JavaMail.epsvc@epcpadp1new>
In-Reply-To: <20251206101423.5004-9-bharata@amd.com>
On 06/12/25 03:44PM, Bharata B Rao wrote:
>Currently, hot page promotion (the NUMA_BALANCING_MEMORY_TIERING
>mode of NUMA Balancing) does hot page detection (via hint faults),
>hot page classification and eventual promotion all by itself, and
>sits within the scheduler.
>
>With the new hot page tracking and promotion mechanism being
>available, NUMA Balancing can limit itself to detection of
>hot pages (via hint faults) and off-load the rest of the
>functionality to the common hot page tracking system.
>
>pghot_record_access(PGHOT_HINT_FAULT) API is used to feed the
>hot page info. In addition, the migration rate limiting and
>dynamic threshold logic are moved to kmigrated so that they
>can be used for hot pages reported by other sources too.
>
>Signed-off-by: Bharata B Rao <bharata@amd.com>
<snip>
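Just to confirm I understand the new flow: the hint fault path now only
reports the access, and the rest (classification, rate limiting,
thresholding, migration) happens in pghot/kmigrated. So on the scheduler
side the call reduces to something like this sketch (untested, arguments
per the pghot_record_access() signature below):

	/* in the NUMA hint fault path, once the folio's pfn is known */
	pghot_record_access(pfn, target_nid, PGHOT_HINT_FAULT, jiffies);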
>--- a/mm/pghot.c
>+++ b/mm/pghot.c
>@@ -12,6 +12,9 @@
> * the hot pages. kmigrated runs for each lower tier node. It iterates
> * over the node's PFNs and migrates pages marked for migration into
> * their targeted nodes.
>+ *
>+ * Migration rate-limiting and dynamic threshold logic implementations
>+ * were moved from NUMA Balancing mode 2.
> */
> #include <linux/mm.h>
> #include <linux/migrate.h>
>@@ -25,6 +28,8 @@ static unsigned int pghot_freq_threshold = PGHOT_DEFAULT_FREQ_THRESHOLD;
> static unsigned int kmigrated_sleep_ms = KMIGRATED_DEFAULT_SLEEP_MS;
> static unsigned int kmigrated_batch_nr = KMIGRATED_DEFAULT_BATCH_NR;
>
>+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
>+static unsigned int sysctl_pghot_promote_rate_limit = 65536;
> static unsigned int sysctl_pghot_freq_window = PGHOT_DEFAULT_FREQ_WINDOW;
>
> static DEFINE_STATIC_KEY_FALSE(pghot_src_hwhints);
>@@ -43,6 +48,14 @@ static const struct ctl_table pghot_sysctls[] = {
> .proc_handler = proc_dointvec_minmax,
> .extra1 = SYSCTL_ZERO,
> },
>+ {
>+ .procname = "pghot_promote_rate_limit_MBps",
>+ .data = &sysctl_pghot_promote_rate_limit,
>+ .maxlen = sizeof(unsigned int),
>+ .mode = 0644,
>+ .proc_handler = proc_dointvec_minmax,
>+ .extra1 = SYSCTL_ZERO,
>+ },
> };
> #endif
>
>@@ -137,8 +150,13 @@ int pghot_record_access(unsigned long pfn, int nid, int src, unsigned long now)
> old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
> old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
>
>- if (((time - old_time) > msecs_to_jiffies(sysctl_pghot_freq_window))
>- || (nid != NUMA_NO_NODE && old_nid != nid))
>+ /*
>+ * Bypass the new window logic for NUMA hint fault source
>+ * as it is too slow in reporting accesses.
>+ * TODO: Fix this.
>+ */
>+ if ((((time - old_time) > msecs_to_jiffies(sysctl_pghot_freq_window))
>+ && (src != PGHOT_HINT_FAULT)) || (nid != NUMA_NO_NODE && old_nid != nid))
> new_window = true;
>
> if (new_window)
>@@ -166,6 +184,110 @@ int pghot_record_access(unsigned long pfn, int nid, int src, unsigned long now)
> return 0;
> }
>
>+/*
>+ * For memory tiering mode, if there are enough free pages (more than
>+ * enough watermark defined here) in fast memory node, to take full
>+ * advantage of fast memory capacity, all recently accessed slow
>+ * memory pages will be migrated to fast memory node without
>+ * considering hot threshold.
>+ */
>+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
>+{
>+ int z;
>+ unsigned long enough_wmark;
>+
>+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
>+ pgdat->node_present_pages >> 4);
>+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
>+ struct zone *zone = pgdat->node_zones + z;
>+
>+ if (!populated_zone(zone))
>+ continue;
>+
>+ if (zone_watermark_ok(zone, 0,
>+ promo_wmark_pages(zone) + enough_wmark,
>+ ZONE_MOVABLE, 0))
>+ return true;
>+ }
>+ return false;
>+}
>+
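IIUC, enough_wmark works out to max(1GiB, node_present_pages / 16) in
pages, so e.g. on a 64GiB toptier node the hot threshold is bypassed as
long as some zone still has about 4GiB of free space above its promo
watermark.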
>+/*
>+ * For memory tiering mode, too high promotion/demotion throughput may
>+ * hurt application latency. So we provide a mechanism to rate limit
>+ * the number of pages that are tried to be promoted.
>+ */
>+static bool kmigrated_promotion_rate_limit(struct pglist_data *pgdat, unsigned long rate_limit,
>+ int nr, unsigned long now_ms)
>+{
>+ unsigned long nr_cand;
>+ unsigned int start;
>+
>+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
>+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
>+ start = pgdat->nbp_rl_start;
>+ if (now_ms - start > MSEC_PER_SEC &&
>+ cmpxchg(&pgdat->nbp_rl_start, start, now_ms) == start)
>+ pgdat->nbp_rl_nr_cand = nr_cand;
>+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
>+ return true;
>+ return false;
>+}
>+
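For reference, with the default pghot_promote_rate_limit_MBps of 65536
and 4KiB pages, rate_limit = MB_TO_PAGES(65536) = 65536 * 256 = 16777216
candidate pages/sec per target node. PGPROMOTE_CANDIDATE grows
monotonically and nbp_rl_nr_cand is re-snapshotted roughly once per
second via the cmpxchg() on nbp_rl_start, so candidates beyond the limit
within the current one-second window are skipped.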
>+static void kmigrated_promotion_adjust_threshold(struct pglist_data *pgdat,
>+ unsigned long rate_limit, unsigned int ref_th,
>+ unsigned long now_ms)
>+{
>+ unsigned int start, th_period, unit_th, th;
>+ unsigned long nr_cand, ref_cand, diff_cand;
>+
>+ th_period = KMIGRATED_PROMOTION_THRESHOLD_WINDOW;
>+ start = pgdat->nbp_th_start;
>+ if (now_ms - start > th_period &&
>+ cmpxchg(&pgdat->nbp_th_start, start, now_ms) == start) {
>+ ref_cand = rate_limit *
>+ KMIGRATED_PROMOTION_THRESHOLD_WINDOW / MSEC_PER_SEC;
>+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
>+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
>+ unit_th = ref_th * 2 / KMIGRATED_MIGRATION_ADJUST_STEPS;
>+ th = pgdat->nbp_threshold ? : ref_th;
>+ if (diff_cand > ref_cand * 11 / 10)
>+ th = max(th - unit_th, unit_th);
>+ else if (diff_cand < ref_cand * 9 / 10)
>+ th = min(th + unit_th, ref_th * 2);
>+ pgdat->nbp_th_nr_cand = nr_cand;
>+ pgdat->nbp_threshold = th;
>+ }
>+}
>+
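And to check my understanding of the threshold adjustment: assuming
KMIGRATED_MIGRATION_ADJUST_STEPS is 16, like NUMA_MIGRATION_ADJUST_STEPS
was, unit_th = ref_th * 2 / 16 = ref_th / 8. Once per
KMIGRATED_PROMOTION_THRESHOLD_WINDOW the candidates seen are compared
with what the rate limit allows: more than 10% above lowers the threshold
by unit_th (floored at unit_th), more than 10% below raises it (capped at
2 * ref_th), so the threshold converges to where the candidate rate
matches the configured promotion rate.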
>+static bool kmigrated_should_migrate_memory(unsigned long nr_pages, unsigned long nid,
>+ unsigned long time)
>+{
>+ struct pglist_data *pgdat;
>+ unsigned long rate_limit;
>+ unsigned int th, def_th;
>+ unsigned long now = jiffies;
Shouldn't this be

	now = jiffies & PGHOT_TIME_MASK;

to match how the access time is stored? More on this below at the
threshold comparison.
>+ unsigned long now_ms = jiffies_to_msecs(now);
>+
>+ pgdat = NODE_DATA(nid);
>+ if (pgdat_free_space_enough(pgdat)) {
>+ /* workload changed, reset hot threshold */
>+ pgdat->nbp_threshold = 0;
>+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr_pages);
>+ return true;
>+ }
>+
>+ def_th = sysctl_pghot_freq_window;
>+ rate_limit = MB_TO_PAGES(sysctl_pghot_promote_rate_limit);
>+ kmigrated_promotion_adjust_threshold(pgdat, rate_limit, def_th, now_ms);
>+
>+ th = pgdat->nbp_threshold ? : def_th;
>+ if (jiffies_to_msecs(now - time) >= th)
The time recorded in the pfn hotness value is masked with PGHOT_TIME_MASK
in pghot_record_access(), so now should be computed with PGHOT_TIME_MASK
here as well; otherwise the comparison mixes a full jiffies value with a
truncated one and the computed elapsed time can be wrong.
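Untested, but something along these lines is what I mean (assuming the
real elapsed time always fits within what PGHOT_TIME_MASK can represent):

	unsigned long now = jiffies & PGHOT_TIME_MASK;
	...
	if (jiffies_to_msecs((now - time) & PGHOT_TIME_MASK) >= th)

Masking the delta as well keeps the elapsed time correct even when the
masked jiffies value wraps between the store and this check.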
Regards,
Alok Rathore