* [PATCH] mm: memcontrol: account kernel stack per node
From: Shakeel Butt @ 2020-06-30 0:44 UTC
To: Johannes Weiner, Roman Gushchin, Michal Hocko
Cc: Andrew Morton, linux-mm, linux-kernel, Shakeel Butt
Currently the kernel stack is being accounted per-zone. There is no need
to do that. In addition, due to being per-zone, memcg has to maintain a
separate MEMCG_KERNEL_STACK_KB stat. Make the stat per-node and deprecate
MEMCG_KERNEL_STACK_KB, as memcg_stat_item is an extension of
node_stat_item.
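For reference, the relationship that makes this possible, sketched below
(see the mmzone.h and memcontrol.h hunks for the real definitions):

	/*
	 * memcg_stat_item continues the node_stat_item index space, so a
	 * node stat index such as NR_KERNEL_STACK_KB is also a valid
	 * memcg stat index and no separate MEMCG_* counter is needed.
	 */
	enum node_stat_item {
		/* ... */
		NR_KERNEL_STACK_KB,	/* measured in KiB */
		NR_VM_NODE_STAT_ITEMS
	};

	enum memcg_stat_item {
		MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, /* starts where node stats end */
		MEMCG_SOCK,
		MEMCG_PERCPU_B,
		MEMCG_NR_STAT,
	};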
Signed-off-by: Shakeel Butt <shakeelb@google.com>
---
drivers/base/node.c | 4 ++--
fs/proc/meminfo.c | 4 ++--
include/linux/memcontrol.h | 2 --
include/linux/mmzone.h | 8 ++++----
kernel/fork.c | 29 ++++++++++-------------------
kernel/scs.c | 2 +-
mm/memcontrol.c | 2 +-
mm/page_alloc.c | 16 ++++++++--------
mm/vmstat.c | 8 ++++----
9 files changed, 32 insertions(+), 43 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 0cf13e31603c..508b80f6329b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -440,9 +440,9 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
nid, K(i.sharedram),
- nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
+ nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
#ifdef CONFIG_SHADOW_CALL_STACK
- nid, sum_zone_node_page_state(nid, NR_KERNEL_SCS_KB),
+ nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
nid, 0UL,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index f262bff3ca31..887a5532e449 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -101,10 +101,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SReclaimable: ", sreclaimable);
show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
- global_zone_page_state(NR_KERNEL_STACK_KB));
+ global_node_page_state(NR_KERNEL_STACK_KB));
#ifdef CONFIG_SHADOW_CALL_STACK
seq_printf(m, "ShadowCallStack:%8lu kB\n",
- global_zone_page_state(NR_KERNEL_SCS_KB));
+ global_node_page_state(NR_KERNEL_SCS_KB));
#endif
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ba1e42715ecf..a3ddb236898e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -33,8 +33,6 @@ enum memcg_stat_item {
MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
MEMCG_SOCK,
MEMCG_PERCPU_B,
- /* XXX: why are these zone and not node counters? */
- MEMCG_KERNEL_STACK_KB,
MEMCG_NR_STAT,
};
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8e859444927a..b79f73ce8b57 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -153,10 +153,6 @@ enum zone_stat_item {
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
NR_PAGETABLE, /* used for pagetables */
- NR_KERNEL_STACK_KB, /* measured in KiB */
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- NR_KERNEL_SCS_KB, /* measured in KiB */
-#endif
/* Second 128 byte cacheline */
NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
@@ -201,6 +197,10 @@ enum node_stat_item {
NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */
NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */
+ NR_KERNEL_STACK_KB, /* measured in KiB */
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ NR_KERNEL_SCS_KB, /* measured in KiB */
+#endif
NR_VM_NODE_STAT_ITEMS
};
diff --git a/kernel/fork.c b/kernel/fork.c
index 73fdfa9674b5..ee5393350ef7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -278,7 +278,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
+ NR_KERNEL_STACK_KB,
-(int)(PAGE_SIZE / 1024));
memcg_kmem_uncharge_page(vm->pages[i], 0);
@@ -381,32 +381,23 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
{
void *stack = task_stack_page(tsk);
struct vm_struct *vm = task_stack_vm_area(tsk);
+ struct page *page;
BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
if (vm) {
- int i;
-
BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+ page = vm->pages[0];
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- mod_zone_page_state(page_zone(vm->pages[i]),
- NR_KERNEL_STACK_KB,
- PAGE_SIZE / 1024 * account);
- }
} else {
- /*
- * All stack pages are in the same zone and belong to the
- * same memcg.
- */
- struct page *first_page = virt_to_page(stack);
-
- mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
- THREAD_SIZE / 1024 * account);
-
- mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
+ page = virt_to_page(stack);
+ mod_memcg_obj_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
+
+ /* All stack pages are in the same node. */
+ mod_node_page_state(page_pgdat(page), NR_KERNEL_STACK_KB,
+ THREAD_SIZE / 1024 * account);
}
static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -431,7 +422,7 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
return ret;
mod_memcg_page_state(vm->pages[i],
- MEMCG_KERNEL_STACK_KB,
+ NR_KERNEL_STACK_KB,
PAGE_SIZE / 1024);
}
}
diff --git a/kernel/scs.c b/kernel/scs.c
index 5d4d9bbdec36..4ff4a7ba0094 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -17,7 +17,7 @@ static void __scs_account(void *s, int account)
{
struct page *scs_page = virt_to_page(s);
- mod_zone_page_state(page_zone(scs_page), NR_KERNEL_SCS_KB,
+ mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
account * (SCS_SIZE / SZ_1K));
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b1a644224383..06de63901f81 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1485,7 +1485,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
(u64)memcg_page_state(memcg, NR_FILE_PAGES) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
1024);
seq_buf_printf(&s, "slab %llu\n",
(u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 174c849ba9f2..0568b126f719 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5402,6 +5402,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -5423,6 +5427,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -5454,10 +5462,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
- " shadow_call_stack:%lukB"
-#endif
" pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
@@ -5479,10 +5483,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- zone_page_state(zone, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
- zone_page_state(zone, NR_KERNEL_SCS_KB),
-#endif
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 2c5a96694490..96bf8bfffd1d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1158,10 +1158,6 @@ const char * const vmstat_text[] = {
"nr_zone_write_pending",
"nr_mlock",
"nr_page_table_pages",
- "nr_kernel_stack",
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- "nr_shadow_call_stack",
-#endif
"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
@@ -1212,6 +1208,10 @@ const char * const vmstat_text[] = {
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",
+ "nr_kernel_stack",
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ "nr_shadow_call_stack",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
--
2.27.0.212.ge8ba1cc988-goog
* Re: [PATCH] mm: memcontrol: account kernel stack per node
From: Roman Gushchin @ 2020-06-30 3:23 UTC
To: Shakeel Butt
Cc: Johannes Weiner, Michal Hocko, Andrew Morton, linux-mm, linux-kernel
On Mon, Jun 29, 2020 at 05:44:13PM -0700, Shakeel Butt wrote:
> Currently the kernel stack is being accounted per-zone. There is no need
> to do that. In addition, due to being per-zone, memcg has to maintain a
> separate MEMCG_KERNEL_STACK_KB stat. Make the stat per-node and deprecate
> MEMCG_KERNEL_STACK_KB, as memcg_stat_item is an extension of
> node_stat_item.
>
> Signed-off-by: Shakeel Butt <shakeelb@google.com>
> [...]
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 73fdfa9674b5..ee5393350ef7 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -278,7 +278,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
>
> for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
> mod_memcg_page_state(vm->pages[i],
> - MEMCG_KERNEL_STACK_KB,
> + NR_KERNEL_STACK_KB,
> -(int)(PAGE_SIZE / 1024));
Hello, Shakeel!
Thank you for the cleanup; it makes total sense to me.
However, I have some concerns: mod_memcg_page_state() changes only the
memcg's counters, not the lruvec counters. So to make the stat per-node
per-memcg (aka lruvec), we need to use mod_lruvec_state(); otherwise we
won't have the global per-node values.
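To illustrate the difference, a rough sketch (assuming the helpers as
they exist in this tree; delta is the KiB adjustment):

	struct page *page = vm->pages[i];

	/* What the patch does: updates only the page's memcg counter. */
	mod_memcg_page_state(page, NR_KERNEL_STACK_KB, delta);

	/* What I'm suggesting: updates the node counter, the memcg
	 * counter, and the per-memcg per-node (lruvec) counter at once. */
	mod_lruvec_page_state(page, NR_KERNEL_STACK_KB, delta);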
>
> memcg_kmem_uncharge_page(vm->pages[i], 0);
> @@ -381,32 +381,23 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
> {
> void *stack = task_stack_page(tsk);
> struct vm_struct *vm = task_stack_vm_area(tsk);
> + struct page *page;
>
> BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
>
> if (vm) {
> - int i;
> -
> BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
> + page = vm->pages[0];
>
> - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
> - mod_zone_page_state(page_zone(vm->pages[i]),
> - NR_KERNEL_STACK_KB,
> - PAGE_SIZE / 1024 * account);
> - }
> } else {
> - /*
> - * All stack pages are in the same zone and belong to the
> - * same memcg.
> - */
> - struct page *first_page = virt_to_page(stack);
> -
> - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
> - THREAD_SIZE / 1024 * account);
> -
> - mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
> + page = virt_to_page(stack);
> + mod_memcg_obj_state(stack, NR_KERNEL_STACK_KB,
> account * (THREAD_SIZE / 1024));
> }
> +
> + /* All stack pages are in the same node. */
> + mod_node_page_state(page_pgdat(page), NR_KERNEL_STACK_KB,
> + THREAD_SIZE / 1024 * account);
> }
And then we probably don't need separate updates for the memcg and
per-node counters.
Thanks!
* Re: [PATCH] mm: memcontrol: account kernel stack per node
From: Shakeel Butt @ 2020-06-30 14:34 UTC
To: Roman Gushchin
Cc: Johannes Weiner, Michal Hocko, Andrew Morton, Linux MM, LKML
On Mon, Jun 29, 2020 at 8:24 PM Roman Gushchin <guro@fb.com> wrote:
>
> On Mon, Jun 29, 2020 at 05:44:13PM -0700, Shakeel Butt wrote:
> > [...]
>
> Hello, Shakeel!
>
> Thank you for the cleanup; it makes total sense to me.
>
> However, I have some concerns: mod_memcg_page_state() changes only the
> memcg's counters, not the lruvec counters. So to make the stat per-node
> per-memcg (aka lruvec), we need to use mod_lruvec_state(); otherwise we
> won't have the global per-node values.
>
> > [...]
>
> And then we probably don't need separate updates for the memcg and
> per-node counters.
>
Yes, I thought about combining the memcg and per-node counters but got
worried that the cached stacks for CONFIG_VMAP_STACK would not be
accounted in the global per-node counters. However, I see that we
already don't account for them in either counter, so I will simplify
this further. Thanks for the suggestion.
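For illustration, the simplified account_kernel_stack() could end up
looking roughly like this (a sketch only, assuming the
mod_lruvec_page_state()/mod_lruvec_slab_state() helpers; not the actual
v2 yet):

	static void account_kernel_stack(struct task_struct *tsk, int account)
	{
		void *stack = task_stack_page(tsk);
		struct vm_struct *vm = task_stack_vm_area(tsk);

		/* All stack pages are in the same node and memcg. */
		if (vm)
			mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
					      account * (THREAD_SIZE / 1024));
		else
			mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
					      account * (THREAD_SIZE / 1024));
	}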
Shakeel