On 12/22/25 15:58, Tianyou Li wrote:
When move_pfn_range_to_zone() or remove_pfn_range_from_zone() is invoked, it
updates zone->contiguous by checking the new zone's pfn range from beginning
to end, regardless of the previous state of the old zone. When the zone's pfn
range is large, the cost of traversing that range to update zone->contiguous
can be significant.
Add fast paths to quickly determine the zone's contiguous state without
scanning the new zone where possible. The cases are: if the new range does not
overlap the previous range, contiguous must be false; if the new range is
adjacent to the previous range, only the new range needs to be checked; if the
newly added pages cannot fill the hole of the previous zone, contiguous must
be false.
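For reference, the tri-state used by these fast paths looks roughly like the
following (a sketch inferred from this patch; the exact definition and where
it lives are not shown in this excerpt):

enum zone_contig_state {
        ZONE_CONTIG_NO,         /* certainly not contiguous */
        ZONE_CONTIG_YES,        /* certainly contiguous */
        ZONE_CONTIG_MAYBE,      /* unknown, requires the full scan */
};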
The following test cases of memory hotplug for a VM [1], tested in the
environment [2], show that this optimization can significantly reduce the
memory hotplug time [3].
+----------------+------+---------------+--------------+----------------+
|                | Size | Time (before) | Time (after) | Time Reduction |
|                +------+---------------+--------------+----------------+
| Plug Memory    | 256G | 10s           | 2s           | 80%            |
|                +------+---------------+--------------+----------------+
|                | 512G | 33s           | 6s           | 81%            |
+----------------+------+---------------+--------------+----------------+
+----------------+------+---------------+--------------+----------------+
|                | Size | Time (before) | Time (after) | Time Reduction |
|                +------+---------------+--------------+----------------+
| Unplug Memory  | 256G | 10s           | 2s           | 80%            |
|                +------+---------------+--------------+----------------+
|                | 512G | 34s           | 6s           | 82%            |
+----------------+------+---------------+--------------+----------------+
Again, very nice results.
[...]
Thanks David.
+static enum zone_contig_state zone_contig_state_after_shrinking(struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ const unsigned long end_pfn = start_pfn + nr_pages;
+
+ /*
+ * If the removed pfn range inside the original zone span, the contiguous
+ * property is surely false.
/*
* If we cut a hole into the zone span, then the zone is
* certainly not contiguous.
*/
Will change accordingly. Thanks.
+ */
+ if (start_pfn > zone->zone_start_pfn && end_pfn < zone_end_pfn(zone))
+ return ZONE_CONTIG_NO;
+
+ /* If the removed pfn range is at the beginning or end of the
+ * original zone span, the contiguous property is preserved when
+ * the original zone is contiguous.
/* Removing from the start/end of the zone will not change anything. */
Will change accordingly. Thanks.
+ */
+ if (start_pfn == zone->zone_start_pfn || end_pfn == zone_end_pfn(zone))
+ return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_MAYBE;
+
+ return ZONE_CONTIG_MAYBE;
+}
+
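To make the shrink cases concrete (hypothetical pfn values, assuming a
hole-free zone spanning [0x1000, 0x2000) whose contiguous flag is true):

        zone_contig_state_after_shrinking(zone, 0x1800, 0x100); /* cuts a hole     -> ZONE_CONTIG_NO  */
        zone_contig_state_after_shrinking(zone, 0x1000, 0x100); /* trims the start -> ZONE_CONTIG_YES */

A zone that was not contiguous and is trimmed at its start or end returns
ZONE_CONTIG_MAYBE instead, since shrinking the span may remove the hole.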
void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages)
@@ -551,6 +573,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
const unsigned long end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long pfn, cur_nr_pages;
+ enum zone_contig_state new_contiguous_state = ZONE_CONTIG_MAYBE;
No need to initialize, given that you overwrite the value below.
Will change accordingly. Thanks.
/* Poison struct pages because they are now uninitialized again. */
for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
@@ -571,12 +594,14 @@ void remove_pfn_range_from_zone(struct zone *zone,
if (zone_is_zone_device(zone))
return;
+ new_contiguous_state = zone_contig_state_after_shrinking(zone, start_pfn,
+ nr_pages);
clear_zone_contiguous(zone);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
update_pgdat_span(pgdat);
- set_zone_contiguous(zone);
+ set_zone_contiguous(zone, new_contiguous_state);
}
/**
@@ -736,6 +761,39 @@ static inline void section_taint_zone_device(unsigned long pfn)
}
#endif
+static enum zone_contig_state zone_contig_state_after_growing(struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ const unsigned long end_pfn = start_pfn + nr_pages;
+
+ if (zone_is_empty(zone))
+ return ZONE_CONTIG_YES;
+
+ /*
+ * If the moved pfn range does not intersect with the original zone spa
s/spa/span/
My mistake:(
Will change accordingly. Thanks.
+ * the contiguous property is surely false.
"the zone is surely not contiguous."
Will change accordingly. Thanks.
+ */
+ if (end_pfn < zone->zone_start_pfn || start_pfn > zone_end_pfn(zone))
+ return ZONE_CONTIG_NO;
+
+ /*
+ * If the moved pfn range is adjacent to the original zone span, given
+ * the moved pfn range's contiguous property is always true, the zone's
+ * contiguous property inherited from the original value.
+ */
/* Adding to the start/end of the zone will not change anything. */
Will change accordingly. Thanks.
+ if (end_pfn == zone->zone_start_pfn || start_pfn == zone_end_pfn(zone))
+ return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_NO;
+
+ /*
+ * If the original zone's hole larger than the moved pages in the range
+ * the contiguous property is surely false.
+ */
/* If we cannot fill the hole, the zone stays not contiguous. */
Will change accordingly. Thanks.
+ if (nr_pages < (zone->spanned_pages - zone->present_pages))
+ return ZONE_CONTIG_NO;
+
+ return ZONE_CONTIG_MAYBE;
+}
+
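Similarly for the grow cases (hypothetical pfn values, assuming a hole-free
zone spanning [0x1000, 0x2000) whose contiguous flag is true):

        zone_contig_state_after_growing(zone, 0x3000, 0x100); /* disjoint from the span -> ZONE_CONTIG_NO  */
        zone_contig_state_after_growing(zone, 0x2000, 0x100); /* adjacent at the end    -> ZONE_CONTIG_YES */

If the zone instead had a 0x200-page hole, moving 0x100 pages into the span
could never fill it, so the fast path returns ZONE_CONTIG_NO; only a moved
range at least as large as the hole falls through to ZONE_CONTIG_MAYBE and the
full scan.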
/*
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
@@ -1090,11 +1148,20 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
+ enum zone_contig_state new_contiguous_state = ZONE_CONTIG_NO;
ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
if (ret)
return ret;
+ /*
+ * If the allocated memmap pages are not in a full section, keep the
+ * contiguous state as ZONE_CONTIG_NO.
+ */
+ if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
+ new_contiguous_state = zone_contig_state_after_growing(zone,
+ pfn, nr_pages);
+
This is nasty. I would wish we could just leave that code path alone.
In particular: I am 99% sure that we never ever run into this case in practice.
E.g., on x86, we can have up to 2 GiB memory blocks. But the memmap of that is 64/4096*2GiB == 32 MB ... and a memory section is 128 MiB.
As commented on patch #1, we should drop the set_zone_contiguous() in this function either way and let online_pages() deal with it.
We just have to make sure that we don't create some inconsistencies by doing that.
Can you double-check?
Agreed, this is a very narrow corner case: only when end_pfn is aligned to
PAGES_PER_SECTION are all the memory sections onlined, so that
pfn_to_online_page() returns a non-NULL page and the zone contiguous flag has
a chance to become true; otherwise the zone contiguous flag always stays
false. In practice this path is rarely hit.

However, this optimization relies on the zone's previous contiguous state. In
the corner case the zone could actually be contiguous, but without the
set_zone_contiguous() here zone->contiguous would remain false, and the fast
path in online_pages() could then be incorrect. I agree that in patch #1 we
can remove the set_zone_contiguous(), because at that point online_pages()
does not depend on the previous contiguous state. After this optimization,
though, this piece of code seems necessary to keep here.

One solution is to simplify it to set_zone_contiguous(zone, ZONE_CONTIG_MAYBE)
once IS_ALIGNED(end_pfn, PAGES_PER_SECTION), but that code would still live in
mhp_init_memmap_on_memory(). If we want online_pages() to deal with this
corner case instead, we would need to either pass the information to
online_pages() through an additional parameter, or add code in online_pages()
to look up the memory block for the pfn and check its altmap, which does not
seem like a good approach for the common code path just to handle a very rare
corner case on every call of online_pages().
Solution1:

mhp_init_memmap_on_memory(...)
{
        ...
        /*
         * It's a corner case where all the memory sections will be onlined,
         * thus the zone contiguous state needs to be populated.
         */
        if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
                set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
}

Solution2.1:

online_pages(..., nr_vmemmap_pages)
{
        ...
        if (nr_vmemmap_pages > 0)
                set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
        ...
}

Solution2.2:

online_pages(...)
{
        ...
        struct memory_block *mem = pfn_to_memory_block(start_pfn);

        if (mem->altmap && mem->altmap->free)
                set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
}
Looking forward to hearing your advice. Thanks.
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
false);
@@ -1113,7 +1180,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (nr_pages >= PAGES_PER_SECTION)
online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
- set_zone_contiguous(zone);
+ set_zone_contiguous(zone, new_contiguous_state);
return ret;
}
@@ -1153,6 +1220,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
const int nid = zone_to_nid(zone);
int need_zonelists_rebuild = 0;
unsigned long flags;
+ enum zone_contig_state new_contiguous_state = ZONE_CONTIG_NO;
int ret;
/*
@@ -1166,6 +1234,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
+ new_contiguous_state = zone_contig_state_after_growing(zone, pfn, nr_pages);
/* associate pfn range with the zone */
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
@@ -1204,7 +1273,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
}
online_pages_range(pfn, nr_pages);
- set_zone_contiguous(zone);
+ set_zone_contiguous(zone, new_contiguous_state);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
if (node_arg.nid >= 0)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fc2a6f1e518f..0c41f1004847 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2263,11 +2263,19 @@ void __init init_cma_pageblock(struct page *page)
}
#endif
-void set_zone_contiguous(struct zone *zone)
+void set_zone_contiguous(struct zone *zone, enum zone_contig_state state)
{
unsigned long block_start_pfn = zone->zone_start_pfn;
unsigned long block_end_pfn;
+ if (state == ZONE_CONTIG_YES) {
+ zone->contiguous = true;
+ return;
+ }
+
Maybe add a comment like
/* We expect an earlier call to clear_zone_contig(). */
And maybe move that comment all the way up in the function and add
VM_WARN_ON_ONCE(zone->contiguous);
Will change accordingly; a rough sketch of the updated function is at the end of this mail. Thanks.
+ if (state == ZONE_CONTIG_NO)
+ return;
+
block_end_pfn = pageblock_end_pfn(block_start_pfn);
for (; block_start_pfn < zone_end_pfn(zone);
block_start_pfn = block_end_pfn,
@@ -2348,7 +2356,7 @@ void __init page_alloc_init_late(void)
shuffle_free_memory(NODE_DATA(nid));
for_each_populated_zone(zone)
- set_zone_contiguous(zone);
+ set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
/* Initialize page ext after all struct pages are initialized. */
if (deferred_struct_pages)
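As mentioned above, here is roughly how set_zone_contiguous() would look with
the suggested comment and VM_WARN_ON_ONCE() folded in (a sketch only, not the
final version):

void set_zone_contiguous(struct zone *zone, enum zone_contig_state state)
{
        /* We expect an earlier call to clear_zone_contiguous(). */
        VM_WARN_ON_ONCE(zone->contiguous);

        if (state == ZONE_CONTIG_YES) {
                zone->contiguous = true;
                return;
        }

        if (state == ZONE_CONTIG_NO)
                return;

        /* ZONE_CONTIG_MAYBE: fall through to the existing pageblock scan. */
        ...
}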