linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH] mm: vmscan: wakeup kswapd during node_reclaim
@ 2025-10-11  6:20 Wupeng Ma
  2025-10-15  9:08 ` mawupeng
  0 siblings, 1 reply; 2+ messages in thread
From: Wupeng Ma @ 2025-10-11  6:20 UTC (permalink / raw)
  To: akpm, david, jackmanb, hannes, zhengqi.arch, shakeel.butt
  Cc: mawupeng1, linux-mm, linux-kernel

During testing, we observed that memory allocation with node_reclaim_mode
enabled becomes extremely slow when a large allocation is attempted on a
node whose free memory is mostly occupied by clean page cache.

The slowness arises because during node reclaim, only direct reclaim-like
behavior is triggered — recycling only 32 pages at a time — without
waking kswapd, even when the watermark levels and alloc_flags already
satisfy the condition to activate kswapd.

This patch wakes kswapd during node reclaim, allowing background reclaim
to bring free memory up to the high watermark and avoid excessive node
reclaim overhead.

Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
---
 mm/internal.h   | 14 ++++++++------
 mm/page_alloc.c |  6 +++++-
 mm/vmscan.c     | 19 +++++++++++++++++--
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..5303123dd0a8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1196,21 +1196,23 @@ static inline void mminit_verify_zonelist(void)
 }
 #endif /* CONFIG_DEBUG_MEMORY_INIT */
 
-#define NODE_RECLAIM_NOSCAN	-2
-#define NODE_RECLAIM_FULL	-1
-#define NODE_RECLAIM_SOME	0
-#define NODE_RECLAIM_SUCCESS	1
+#define NODE_RECLAIM_NOSCAN		-2
+#define NODE_RECLAIM_FULL		-1
+#define NODE_RECLAIM_SOME		0
+#define NODE_RECLAIM_SUCCESS		1
+#define NODE_RECLAIM_KSWAPD_SUCCESS	2
 
 #ifdef CONFIG_NUMA
 extern int node_reclaim_mode;
 
-extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order,
+		 int alloc_flags, struct zone *zone);
 extern int find_next_best_node(int node, nodemask_t *used_node_mask);
 #else
 #define node_reclaim_mode 0
 
 static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
-				unsigned int order)
+		unsigned int order, int alloc_flags, struct zone *zone)
 {
 	return NODE_RECLAIM_NOSCAN;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 600d9e981c23..2472000cab78 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3859,7 +3859,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			    !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
 				continue;
 
-			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
+			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order,
+					   alloc_flags, zone);
 			switch (ret) {
 			case NODE_RECLAIM_NOSCAN:
 				/* did not scan */
@@ -3867,6 +3868,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			case NODE_RECLAIM_FULL:
 				/* scanned but unreclaimable */
 				continue;
+			case NODE_RECLAIM_KSWAPD_SUCCESS:
+				/* kswapd reclaim enough */
+				goto try_this_zone;
 			default:
 				/* did we reclaim enough */
 				if (zone_watermark_ok(zone, order, mark,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2fc8b626d3d..ebee8b6330a8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7680,9 +7680,11 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
 	return sc->nr_reclaimed;
 }
 
-int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order,
+		 int alloc_flags, struct zone *zone)
 {
 	int ret;
+	enum zone_type highest_zoneidx = gfp_zone(gfp_mask);
 	/* Minimum pages needed in order to stay on node */
 	const unsigned long nr_pages = 1 << order;
 	struct scan_control sc = {
@@ -7693,7 +7695,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
 		.may_swap = 1,
-		.reclaim_idx = gfp_zone(gfp_mask),
+		.reclaim_idx = highest_zoneidx,
 	};
 
 	/*
@@ -7729,6 +7731,19 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 	if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
 		return NODE_RECLAIM_NOSCAN;
 
+	if (alloc_flags & ALLOC_KSWAPD) {
+		unsigned long mark;
+
+		wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
+
+		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+		if (zone_watermark_ok(zone, order, mark, highest_zoneidx,
+					alloc_flags)) {
+			clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
+			return NODE_RECLAIM_KSWAPD_SUCCESS;
+		}
+	}
+
 	ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
 	clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
 
-- 
2.43.0



^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [RFC PATCH] mm: vmscan: wakeup kswapd during node_reclaim
  2025-10-11  6:20 [RFC PATCH] mm: vmscan: wakeup kswapd during node_reclaim Wupeng Ma
@ 2025-10-15  9:08 ` mawupeng
  0 siblings, 0 replies; 2+ messages in thread
From: mawupeng @ 2025-10-15  9:08 UTC (permalink / raw)
  To: akpm, david, jackmanb, hannes, zhengqi.arch, shakeel.butt
  Cc: mawupeng1, linux-mm, linux-kernel, lorenzo.stoakes, Liam.Howlett,
	vbabka, rppt, surenb, mhocko, ziy, axelrasmussen, yuanchu,
	weixugc

Hi Reviewers:
	Kindly ping; CC'ing more reviewers.
	
On 2025/10/11 14:20, Wupeng Ma wrote:
> During testing, we observed that memory allocation with node_reclaim_mode
> enabled becomes extremely slow when a large allocation is attempted on a
> node whose free memory is mostly occupied by clean page cache.
> 
> The slowness arises because during node reclaim, only direct reclaim-like
> behavior is triggered — recycling only 32 pages at a time — without
> waking kswapd, even when the watermark levels and alloc_flags already
> satisfy the condition to activate kswapd.
> 
> This patch wakes kswapd during node reclaim, allowing background reclaim
> to bring free memory up to the high watermark and avoid excessive node
> reclaim overhead.
> 
> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
> ---
>  mm/internal.h   | 14 ++++++++------
>  mm/page_alloc.c |  6 +++++-
>  mm/vmscan.c     | 19 +++++++++++++++++--
>  3 files changed, 30 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/internal.h b/mm/internal.h
> index 1561fc2ff5b8..5303123dd0a8 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1196,21 +1196,23 @@ static inline void mminit_verify_zonelist(void)
>  }
>  #endif /* CONFIG_DEBUG_MEMORY_INIT */
>  
> -#define NODE_RECLAIM_NOSCAN	-2
> -#define NODE_RECLAIM_FULL	-1
> -#define NODE_RECLAIM_SOME	0
> -#define NODE_RECLAIM_SUCCESS	1
> +#define NODE_RECLAIM_NOSCAN		-2
> +#define NODE_RECLAIM_FULL		-1
> +#define NODE_RECLAIM_SOME		0
> +#define NODE_RECLAIM_SUCCESS		1
> +#define NODE_RECLAIM_KSWAPD_SUCCESS	2
>  
>  #ifdef CONFIG_NUMA
>  extern int node_reclaim_mode;
>  
> -extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
> +int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order,
> +		 int alloc_flags, struct zone *zone);
>  extern int find_next_best_node(int node, nodemask_t *used_node_mask);
>  #else
>  #define node_reclaim_mode 0
>  
>  static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
> -				unsigned int order)
> +		unsigned int order, int alloc_flags, struct zone *zone)
>  {
>  	return NODE_RECLAIM_NOSCAN;
>  }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 600d9e981c23..2472000cab78 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3859,7 +3859,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>  			    !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
>  				continue;
>  
> -			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
> +			ret = node_reclaim(zone->zone_pgdat, gfp_mask, order,
> +					   alloc_flags, zone);
>  			switch (ret) {
>  			case NODE_RECLAIM_NOSCAN:
>  				/* did not scan */
> @@ -3867,6 +3868,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>  			case NODE_RECLAIM_FULL:
>  				/* scanned but unreclaimable */
>  				continue;
> +			case NODE_RECLAIM_KSWAPD_SUCCESS:
> +				/* kswapd reclaim enough */
> +				goto try_this_zone;
>  			default:
>  				/* did we reclaim enough */
>  				if (zone_watermark_ok(zone, order, mark,
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b2fc8b626d3d..ebee8b6330a8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -7680,9 +7680,11 @@ static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
>  	return sc->nr_reclaimed;
>  }
>  
> -int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
> +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order,
> +		 int alloc_flags, struct zone *zone)
>  {
>  	int ret;
> +	enum zone_type highest_zoneidx = gfp_zone(gfp_mask);
>  	/* Minimum pages needed in order to stay on node */
>  	const unsigned long nr_pages = 1 << order;
>  	struct scan_control sc = {
> @@ -7693,7 +7695,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
>  		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
>  		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
>  		.may_swap = 1,
> -		.reclaim_idx = gfp_zone(gfp_mask),
> +		.reclaim_idx = highest_zoneidx,
>  	};
>  
>  	/*
> @@ -7729,6 +7731,19 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
>  	if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
>  		return NODE_RECLAIM_NOSCAN;
>  
> +	if (alloc_flags & ALLOC_KSWAPD) {
> +		unsigned long mark;
> +
> +		wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
> +
> +		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
> +		if (zone_watermark_ok(zone, order, mark, highest_zoneidx,
> +					alloc_flags)) {
> +			clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
> +			return NODE_RECLAIM_KSWAPD_SUCCESS;
> +		}
> +	}
> +
>  	ret = __node_reclaim(pgdat, gfp_mask, nr_pages, &sc) >= nr_pages;
>  	clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
>  



^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-10-15  9:08 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-11  6:20 [RFC PATCH] mm: vmscan: wakeup kswapd during node_reclaim Wupeng Ma
2025-10-15  9:08 ` mawupeng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox