linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory
@ 2026-03-05  7:30 zenghongling
  2026-03-27  5:16 ` Andrew Morton
  0 siblings, 1 reply; 5+ messages in thread
From: zenghongling @ 2026-03-05  7:30 UTC (permalink / raw)
  To: dennis, tj, cl, akpm; +Cc: linux-mm, zhongling0719, zenghongling

struct pcpu_chunk has a 4-byte hole after end_offset, and nr_empty_pop_pages
spills over into a third cacheline.  Moving nr_empty_pop_pages up into that
hole allows the struct to use only two cachelines.

Before:
struct pcpu_chunk {
        struct list_head           list;                 /*     0    16 */
        int                        free_bytes;           /*    16     4 */
        struct pcpu_block_md       chunk_md;             /*    20    32 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int *        bound_map;            /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        void *                     base_addr __attribute__((__aligned__(64))); /*    64     8 */
        long unsigned int *        alloc_map;            /*    72     8 */
        struct pcpu_block_md *     md_blocks;            /*    80     8 */
        void *                     data;                 /*    88     8 */
        bool                       immutable;            /*    96     1 */
        bool                       isolated;             /*    97     1 */

        /* XXX 2 bytes hole, try to pack */

        int                        start_offset;         /*   100     4 */
        int                        end_offset;           /*   104     4 */

        /* XXX 4 bytes hole, try to pack */

        struct obj_cgroup * *      obj_cgroups;          /*   112     8 */
        int                        nr_pages;             /*   120     4 */
        int                        nr_populated;         /*   124     4 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        int                        nr_empty_pop_pages;   /*   128     4 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int          populated[];          /*   136     0 */

        /* size: 192, cachelines: 3, members: 17 */
        /* sum members: 122, holes: 4, sum holes: 14 */
        /* padding: 56 */
        /* forced alignments: 1 */
} __attribute__((__aligned__(64)));

After:
struct pcpu_chunk {
        struct list_head           list;                 /*     0    16 */
        int                        free_bytes;           /*    16     4 */
        struct pcpu_block_md       chunk_md;             /*    20    32 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int *        bound_map;            /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        void *                     base_addr __attribute__((__aligned__(64))); /*    64     8 */
        long unsigned int *        alloc_map;            /*    72     8 */
        struct pcpu_block_md *     md_blocks;            /*    80     8 */
        void *                     data;                 /*    88     8 */
        bool                       immutable;            /*    96     1 */
        bool                       isolated;             /*    97     1 */

        /* XXX 2 bytes hole, try to pack */

        int                        start_offset;         /*   100     4 */
        int                        end_offset;           /*   104     4 */
        int                        nr_empty_pop_pages;   /*   108     4 */
        struct obj_cgroup * *      obj_cgroups;          /*   112     8 */
        int                        nr_pages;             /*   120     4 */
        int                        nr_populated;         /*   124     4 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        long unsigned int          populated[];          /*   128     0 */

        /* size: 128, cachelines: 2, members: 17 */
        /* sum members: 122, holes: 2, sum holes: 6 */
        /* forced alignments: 1 */
} __attribute__((__aligned__(64)));

Signed-off-by: zenghongling <zenghongling@kylinos.cn>
---
 mm/percpu-internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 4b3d6ec43703..26f3ac39f8c3 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -77,13 +77,13 @@ struct pcpu_chunk {
 	int			end_offset;	/* additional area required to
 						   have the region end page
 						   aligned */
+	int                     nr_empty_pop_pages; /* # of empty populated pages */
 #ifdef NEED_PCPUOBJ_EXT
 	struct pcpuobj_ext	*obj_exts;	/* vector of object cgroups */
 #endif
 
 	int			nr_pages;	/* # of pages served by this chunk */
 	int			nr_populated;	/* # of populated pages */
-	int                     nr_empty_pop_pages; /* # of empty populated pages */
 	unsigned long		populated[];	/* populated bitmap */
 };
 
-- 
2.25.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory
  2026-03-05  7:30 [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory zenghongling
@ 2026-03-27  5:16 ` Andrew Morton
  2026-03-27  7:08   ` dd
  2026-03-27  7:32   ` Dennis Zhou
  0 siblings, 2 replies; 5+ messages in thread
From: Andrew Morton @ 2026-03-27  5:16 UTC (permalink / raw)
  To: zenghongling; +Cc: dennis, tj, cl, linux-mm, zhongling0719

On Thu,  5 Mar 2026 15:30:43 +0800 zenghongling <zenghongling@kylinos.cn> wrote:

> Subject: [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory

There is no pcpu_chunk_struct,  I'll change this to "struct pcpu_chunk".

> Date: Thu,  5 Mar 2026 15:30:43 +0800
> Sender: owner-linux-mm@kvack.org
> X-Mailer: git-send-email 2.25.1
> 
> The pcpu_chunk_struct has a hole of 4 bytes and pushes the struct to three
> cachelines.  Relocating the three booleans upwards allows for the struct
> to only use two cachelines.

before:

(gdb) p sizeof(struct pcpu_chunk)
$2 = 256

after:

(gdb) p sizeof(struct pcpu_chunk)
$1 = 192

That's remarkable.  It was an allmodconfig build which tends to add
bloat, but the only source-level alteration was this patch.


Another consideration here is that moving members around can have a
performance impact - it can cause more (or less) cacheline
invalidations.  I worry that because someone has carefully commented
all the member offsets, this might have been a consideration.

Also I think your patch may have made those comments incorrect?


Dennis, Tejun, Christoph: I think we want this space saving.  Can
you please advise?

Thanks.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re:Re: [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory
  2026-03-27  5:16 ` Andrew Morton
@ 2026-03-27  7:08   ` dd
  2026-03-27  7:32   ` Dennis Zhou
  1 sibling, 0 replies; 5+ messages in thread
From: dd @ 2026-03-27  7:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: zenghongling, dennis, tj, cl, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1616 bytes --]

Hi:
      Please refer to the pahole results before and after the modification. The offsets of the other members remain unchanged. Originally, nr_empty_pop_pages was placed alone at the beginning of the third cache line.
After the modification, it has been moved into the second cache line (offset 108).


Thanks.
At 2026-03-27 13:16:41, "Andrew Morton" <akpm@linux-foundation.org> wrote:
>On Thu,  5 Mar 2026 15:30:43 +0800 zenghongling <zenghongling@kylinos.cn> wrote:
>
>> Subject: [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory
>
>There is no pcpu_chunk_struct,  I'll change this to "struct pcpu_chunk".
>
>> Date: Thu,  5 Mar 2026 15:30:43 +0800
>> Sender: owner-linux-mm@kvack.org
>> X-Mailer: git-send-email 2.25.1
>> 
>> The pcpu_chunk_struct has a hole of 4 bytes and pushes the struct to three
>> cachelines.  Relocating the three booleans upwards allows for the struct
>> to only use two cachelines.
>
>before:
>
>(gdb) p sizeof(struct pcpu_chunk)
>$2 = 256
>
>after:
>
>(gdb) p sizeof(struct pcpu_chunk)
>$1 = 192
>
>That's remarkable.  It was an allmodconfig build which tends to add
>bloat, but the only source-level alteration was this patch.
>
>
>Another consideration here is that moving members around can have a
>performance impact - it can cause more (or less) cacheline
>invalidations.  I worry that because someone has carefully commented
>all the member offsets, this might have been a consideration.
>
>Also I think your patch may have made those comments incorrect?
>
>
>Dennis, Tejun, Christoph: I think we want this space saving.  Can
>you please advise?
>
>Thanks.

[-- Attachment #2: Type: text/html, Size: 2153 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory
  2026-03-27  5:16 ` Andrew Morton
  2026-03-27  7:08   ` dd
@ 2026-03-27  7:32   ` Dennis Zhou
  1 sibling, 0 replies; 5+ messages in thread
From: Dennis Zhou @ 2026-03-27  7:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: zenghongling, dennis, tj, cl, linux-mm, zhongling0719

Hello,

On Thu, Mar 26, 2026 at 10:16:41PM -0700, Andrew Morton wrote:
> On Thu,  5 Mar 2026 15:30:43 +0800 zenghongling <zenghongling@kylinos.cn> wrote:
> 
> > Subject: [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory
> 
> There is no pcpu_chunk_struct,  I'll change this to "struct pcpu_chunk".
> 
> > Date: Thu,  5 Mar 2026 15:30:43 +0800
> > Sender: owner-linux-mm@kvack.org
> > X-Mailer: git-send-email 2.25.1
> > 
> > The pcpu_chunk_struct has a hole of 4 bytes and pushes the struct to three
> > cachelines.  Relocating the three booleans upwards allows for the struct
> > to only use two cachelines.
> 

Wait, this patch moves `int nr_empty_pop_pages` not 3 booleans?

> before:
> 
> (gdb) p sizeof(struct pcpu_chunk)
> $2 = 256
> 
> after:
> 
> (gdb) p sizeof(struct pcpu_chunk)
> $1 = 192
> 
> That's remarkable.  It was an allmodconfig build which tends to add
> bloat, but the only source-level alteration was this patch.
> 

There is also still PERCPU_STATS at the beginning of the chunk depending
on config, defaults no.

struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
        int                     nr_alloc;       /* # of allocations */
        size_t                  max_alloc_size; /* largest allocation size */
#endif

In my experience there are on the order of tens to ~100 chunks, so this
would save roughly 4k in total.

> 
> Another consideration here is that moving members around can have a
> performance impact - it can cause more (or less) cacheline
> invalidations.  I worry that because someone has carefully commented
> all the member offsets, this might have been a consideration.
> 

I'd probably move nr_pages rather than nr_empty_pop_pages: if you're touching
the populated[] array, you're more likely to also touch nr_empty_pop_pages.

> Also I think your patch may have made those comments incorrect?
> 
> 
> Dennis, Tejun, Christoph: I think we want this space saving.  Can
> you please advise?
> 

I don't think it hurts, but I'd move nr_pages instead of nr_empty_pop_pages.

Thanks,
Dennis


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory
@ 2026-03-05  7:26 zenghongling
  0 siblings, 0 replies; 5+ messages in thread
From: zenghongling @ 2026-03-05  7:26 UTC (permalink / raw)
  To: dennis, tj, cl, akpm; +Cc: linux-mm, kernel, zhongling0719, zenghongling

struct pcpu_chunk has a 4-byte hole after end_offset, and nr_empty_pop_pages
spills over into a third cacheline.  Moving nr_empty_pop_pages up into that
hole allows the struct to use only two cachelines.

Before:
struct pcpu_chunk {
        struct list_head           list;                 /*     0    16 */
        int                        free_bytes;           /*    16     4 */
        struct pcpu_block_md       chunk_md;             /*    20    32 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int *        bound_map;            /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        void *                     base_addr __attribute__((__aligned__(64))); /*    64     8 */
        long unsigned int *        alloc_map;            /*    72     8 */
        struct pcpu_block_md *     md_blocks;            /*    80     8 */
        void *                     data;                 /*    88     8 */
        bool                       immutable;            /*    96     1 */
        bool                       isolated;             /*    97     1 */

        /* XXX 2 bytes hole, try to pack */

        int                        start_offset;         /*   100     4 */
        int                        end_offset;           /*   104     4 */

        /* XXX 4 bytes hole, try to pack */

        struct obj_cgroup * *      obj_cgroups;          /*   112     8 */
        int                        nr_pages;             /*   120     4 */
        int                        nr_populated;         /*   124     4 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        int                        nr_empty_pop_pages;   /*   128     4 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int          populated[];          /*   136     0 */

        /* size: 192, cachelines: 3, members: 17 */
        /* sum members: 122, holes: 4, sum holes: 14 */
        /* padding: 56 */
        /* forced alignments: 1 */
} __attribute__((__aligned__(64)));

After:
struct pcpu_chunk {
        struct list_head           list;                 /*     0    16 */
        int                        free_bytes;           /*    16     4 */
        struct pcpu_block_md       chunk_md;             /*    20    32 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int *        bound_map;            /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        void *                     base_addr __attribute__((__aligned__(64))); /*    64     8 */
        long unsigned int *        alloc_map;            /*    72     8 */
        struct pcpu_block_md *     md_blocks;            /*    80     8 */
        void *                     data;                 /*    88     8 */
        bool                       immutable;            /*    96     1 */
        bool                       isolated;             /*    97     1 */

        /* XXX 2 bytes hole, try to pack */

        int                        start_offset;         /*   100     4 */
        int                        end_offset;           /*   104     4 */
        int                        nr_empty_pop_pages;   /*   108     4 */
        struct obj_cgroup * *      obj_cgroups;          /*   112     8 */
        int                        nr_pages;             /*   120     4 */
        int                        nr_populated;         /*   124     4 */
        /* --- cacheline 2 boundary (128 bytes) --- */
        long unsigned int          populated[];          /*   128     0 */

        /* size: 128, cachelines: 2, members: 17 */
        /* sum members: 122, holes: 2, sum holes: 6 */
        /* forced alignments: 1 */
} __attribute__((__aligned__(64)));

Signed-off-by: zenghongling <zenghongling@kylinos.cn>
---
 mm/percpu-internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 4b3d6ec43703..26f3ac39f8c3 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -77,13 +77,13 @@ struct pcpu_chunk {
 	int			end_offset;	/* additional area required to
 						   have the region end page
 						   aligned */
+	int                     nr_empty_pop_pages; /* # of empty populated pages */
 #ifdef NEED_PCPUOBJ_EXT
 	struct pcpuobj_ext	*obj_exts;	/* vector of object cgroups */
 #endif
 
 	int			nr_pages;	/* # of pages served by this chunk */
 	int			nr_populated;	/* # of populated pages */
-	int                     nr_empty_pop_pages; /* # of empty populated pages */
 	unsigned long		populated[];	/* populated bitmap */
 };
 
-- 
2.25.1



^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2026-03-27  7:32 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-05  7:30 [PATCH] mm/percpu-internal.h: optimise pcpu_chunk_struct to save memory zenghongling
2026-03-27  5:16 ` Andrew Morton
2026-03-27  7:08   ` dd
2026-03-27  7:32   ` Dennis Zhou
  -- strict thread matches above, loose matches on Subject: below --
2026-03-05  7:26 zenghongling

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox