linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Johannes Weiner <hannes@cmpxchg.org>
To: linux-mm@kvack.org
Cc: Vlastimil Babka <vbabka@suse.cz>, Zi Yan <ziy@nvidia.com>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <ljs@kernel.org>,
	"Liam R. Howlett" <Liam.Howlett@oracle.com>,
	Rik van Riel <riel@surriel.com>,
	linux-kernel@vger.kernel.org, Johannes Weiner <jweiner@meta.com>
Subject: [RFC 1/2] mm: page_alloc: replace pageblock_flags bitmap with struct pageblock_data
Date: Fri,  3 Apr 2026 15:40:34 -0400	[thread overview]
Message-ID: <20260403194526.477775-2-hannes@cmpxchg.org> (raw)
In-Reply-To: <20260403194526.477775-1-hannes@cmpxchg.org>

From: Johannes Weiner <jweiner@meta.com>

Replace the packed pageblock_flags bitmap with a per-pageblock struct
containing its own flags word. This changes the storage from
NR_PAGEBLOCK_BITS bits per pageblock packed into shared unsigned longs,
to a dedicated unsigned long per pageblock.

The free path looks up migratetype (from pageblock flags) immediately
followed by looking up pageblock ownership. Colocating them in a struct
means this hot path touches one cache line instead of two.

The per-pageblock struct also eliminates all the bit-packing indexing
(pfn_to_bitidx, word selection, intra-word shifts), simplifying the
accessor code.

Memory overhead: 8 bytes per pageblock (one unsigned long). With 2MB
pageblocks on x86_64, that's 4KB per GB -- up from 256-512 bytes per GB
(~0.5-1 bytes per pageblock) with the packed bitmap, but still
negligible in absolute terms.

No functional change.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/mmzone.h | 15 ++++----
 mm/internal.h          | 17 +++++++++
 mm/mm_init.c           | 25 ++++++-------
 mm/page_alloc.c        | 81 ++++++------------------------------------
 mm/sparse.c            |  3 +-
 5 files changed, 48 insertions(+), 93 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e51190a55e4..2f202bda5ec6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -916,7 +916,7 @@ struct zone {
 	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
 	 * In SPARSEMEM, this map is stored in struct mem_section
 	 */
-	unsigned long		*pageblock_flags;
+	struct pageblock_data	*pageblock_data;
 #endif /* CONFIG_SPARSEMEM */
 
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
@@ -1866,9 +1866,6 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
 #define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
 #define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))
 
-#define SECTION_BLOCKFLAGS_BITS \
-	((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
-
 #if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
 #error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
 #endif
@@ -1901,13 +1898,17 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
 #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
 #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
 
+struct pageblock_data {
+	unsigned long flags;
+};
+
 struct mem_section_usage {
 	struct rcu_head rcu;
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 	DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
 #endif
 	/* See declaration of similar field in struct zone */
-	unsigned long pageblock_flags[0];
+	struct pageblock_data pageblock_data[];
 };
 
 void subsection_map_init(unsigned long pfn, unsigned long nr_pages);
@@ -1960,9 +1961,9 @@ extern struct mem_section **mem_section;
 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
 #endif
 
-static inline unsigned long *section_to_usemap(struct mem_section *ms)
+static inline struct pageblock_data *section_to_usemap(struct mem_section *ms)
 {
-	return ms->usage->pageblock_flags;
+	return ms->usage->pageblock_data;
 }
 
 static inline struct mem_section *__nr_to_section(unsigned long nr)
diff --git a/mm/internal.h b/mm/internal.h
index cb0af847d7d9..bb0e0b8a4495 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -787,6 +787,23 @@ static inline struct page *find_buddy_page_pfn(struct page *page,
 	return NULL;
 }
 
+static inline struct pageblock_data *pfn_to_pageblock(const struct page *page,
+						      unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	struct mem_section *ms = __pfn_to_section(pfn);
+	unsigned long idx = (pfn & (PAGES_PER_SECTION - 1)) >> pageblock_order;
+
+	return &section_to_usemap(ms)[idx];
+#else
+	struct zone *zone = page_zone(page);
+	unsigned long idx;
+
+	idx = (pfn - pageblock_start_pfn(zone->zone_start_pfn)) >> pageblock_order;
+	return &zone->pageblock_data[idx];
+#endif
+}
+
 extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
 				unsigned long end_pfn, struct zone *zone);
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index df34797691bd..f3751fe6e5c3 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1467,36 +1467,31 @@ void __meminit init_currently_empty_zone(struct zone *zone,
 
 #ifndef CONFIG_SPARSEMEM
 /*
- * Calculate the size of the zone->pageblock_flags rounded to an unsigned long
- * Start by making sure zonesize is a multiple of pageblock_order by rounding
- * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
- * round what is now in bits to nearest long in bits, then return it in
- * bytes.
+ * Calculate the size of the zone->pageblock_data array.
+ * Round up the zone size to a pageblock boundary to get the
+ * number of pageblocks, then multiply by the struct size.
  */
 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
 {
-	unsigned long usemapsize;
+	unsigned long nr_pageblocks;
 
 	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
-	usemapsize = round_up(zonesize, pageblock_nr_pages);
-	usemapsize = usemapsize >> pageblock_order;
-	usemapsize *= NR_PAGEBLOCK_BITS;
-	usemapsize = round_up(usemapsize, BITS_PER_LONG);
+	nr_pageblocks = round_up(zonesize, pageblock_nr_pages) >> pageblock_order;
 
-	return usemapsize / BITS_PER_BYTE;
+	return nr_pageblocks * sizeof(struct pageblock_data);
 }
 
 static void __ref setup_usemap(struct zone *zone)
 {
 	unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
 					       zone->spanned_pages);
-	zone->pageblock_flags = NULL;
+	zone->pageblock_data = NULL;
 	if (usemapsize) {
-		zone->pageblock_flags =
+		zone->pageblock_data =
 			memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
 					    zone_to_nid(zone));
-		if (!zone->pageblock_flags)
-			panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+		if (!zone->pageblock_data)
+			panic("Failed to allocate %ld bytes for zone %s pageblock data on node %d\n",
 			      usemapsize, zone->name, zone_to_nid(zone));
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d4b6f1a554e..900a9da2cbeb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -359,52 +359,18 @@ static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(const struct page *page,
-							unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-	return section_to_usemap(__pfn_to_section(pfn));
-#else
-	return page_zone(page)->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-	pfn &= (PAGES_PER_SECTION-1);
-#else
-	pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
-#endif /* CONFIG_SPARSEMEM */
-	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-}
-
 static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
 {
 	return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
 }
 
-static __always_inline void
-get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
-			   unsigned long **bitmap_word, unsigned long *bitidx)
+static __always_inline unsigned long *
+get_pfnblock_flags_word(const struct page *page, unsigned long pfn)
 {
-	unsigned long *bitmap;
-	unsigned long word_bitidx;
-
-#ifdef CONFIG_MEMORY_ISOLATION
-	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
-#else
-	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-#endif
 	BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
 	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
 
-	bitmap = get_pageblock_bitmap(page, pfn);
-	*bitidx = pfn_to_bitidx(page, pfn);
-	word_bitidx = *bitidx / BITS_PER_LONG;
-	*bitidx &= (BITS_PER_LONG - 1);
-	*bitmap_word = &bitmap[word_bitidx];
+	return &pfn_to_pageblock(page, pfn)->flags;
 }
 
 
@@ -421,18 +387,14 @@ static unsigned long __get_pfnblock_flags_mask(const struct page *page,
 					       unsigned long pfn,
 					       unsigned long mask)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-	unsigned long word;
+	unsigned long *flags_word = get_pfnblock_flags_word(page, pfn);
 
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
 	/*
 	 * This races, without locks, with set_pfnblock_migratetype(). Ensure
 	 * a consistent read of the memory array, so that results, even though
 	 * racy, are not corrupted.
 	 */
-	word = READ_ONCE(*bitmap_word);
-	return (word >> bitidx) & mask;
+	return READ_ONCE(*flags_word) & mask;
 }
 
 /**
@@ -446,15 +408,10 @@ static unsigned long __get_pfnblock_flags_mask(const struct page *page,
 bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
 		      enum pageblock_bits pb_bit)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-
 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
 		return false;
 
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
-	return test_bit(bitidx + pb_bit, bitmap_word);
+	return test_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
 
 /**
@@ -493,18 +450,12 @@ get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
 static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
 				      unsigned long flags, unsigned long mask)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
+	unsigned long *flags_word = get_pfnblock_flags_word(page, pfn);
 	unsigned long word;
 
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
-	mask <<= bitidx;
-	flags <<= bitidx;
-
-	word = READ_ONCE(*bitmap_word);
+	word = READ_ONCE(*flags_word);
 	do {
-	} while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
+	} while (!try_cmpxchg(flags_word, &word, (word & ~mask) | flags));
 }
 
 /**
@@ -516,15 +467,10 @@ static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
 void set_pfnblock_bit(const struct page *page, unsigned long pfn,
 		      enum pageblock_bits pb_bit)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-
 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
 		return;
 
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
-	set_bit(bitidx + pb_bit, bitmap_word);
+	set_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
 
 /**
@@ -536,15 +482,10 @@ void set_pfnblock_bit(const struct page *page, unsigned long pfn,
 void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
 			enum pageblock_bits pb_bit)
 {
-	unsigned long *bitmap_word;
-	unsigned long bitidx;
-
 	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
 		return;
 
-	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
-
-	clear_bit(bitidx + pb_bit, bitmap_word);
+	clear_bit(pb_bit, get_pfnblock_flags_word(page, pfn));
 }
 
 /**
diff --git a/mm/sparse.c b/mm/sparse.c
index b5b2b6f7041b..c9473b9a5c24 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -298,7 +298,8 @@ static void __meminit sparse_init_one_section(struct mem_section *ms,
 
 static unsigned long usemap_size(void)
 {
-	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
+	return (1UL << (PFN_SECTION_SHIFT - pageblock_order)) *
+		sizeof(struct pageblock_data);
 }
 
 size_t mem_section_usage_size(void)
-- 
2.53.0



  reply	other threads:[~2026-04-03 19:45 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-03 19:40 [RFC 0/2] mm: page_alloc: pcp buddy allocator Johannes Weiner
2026-04-03 19:40 ` Johannes Weiner [this message]
2026-04-04  1:43   ` [RFC 1/2] mm: page_alloc: replace pageblock_flags bitmap with struct pageblock_data Rik van Riel
2026-04-03 19:40 ` [RFC 2/2] mm: page_alloc: per-cpu pageblock buddy allocator Johannes Weiner
2026-04-04  1:42   ` Rik van Riel
2026-04-06 16:12     ` Johannes Weiner
2026-04-06 17:31   ` Frank van der Linden
2026-04-06 21:58     ` Johannes Weiner
2026-04-04  2:27 ` [RFC 0/2] mm: page_alloc: pcp " Zi Yan
2026-04-06 15:24   ` Johannes Weiner
2026-04-07  2:42     ` Zi Yan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260403194526.477775-2-hannes@cmpxchg.org \
    --to=hannes@cmpxchg.org \
    --cc=Liam.Howlett@oracle.com \
    --cc=david@kernel.org \
    --cc=jweiner@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=riel@surriel.com \
    --cc=vbabka@suse.cz \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox