linux-mm.kvack.org archive mirror
* [PATCH v3] sched/numa: add statistics of numa balance task migration
@ 2025-04-30 10:36 Chen Yu
  2025-05-01  7:00 ` Libo Chen
  2025-05-05  6:43 ` Jain, Ayush
  0 siblings, 2 replies; 15+ messages in thread
From: Chen Yu @ 2025-04-30 10:36 UTC (permalink / raw)
  To: Peter Zijlstra, Andrew Morton
  Cc: Ingo Molnar, Tejun Heo, Johannes Weiner, Jonathan Corbet,
	Mel Gorman, Michal Hocko, Michal Koutny, Muchun Song,
	Roman Gushchin, Shakeel Butt, Chen, Tim C, Aubrey Li, Libo Chen,
	cgroups, linux-doc, linux-mm, linux-kernel, Chen Yu,
	K Prateek Nayak, Madadi Vineeth Reddy

On systems with NUMA balancing enabled, it has proven helpful to track
task activity caused by NUMA balancing. NUMA balancing has two
mechanisms for task migration: one is to migrate a task to an idle CPU
on its preferred node, the other is to swap tasks running on different
nodes when each is on the other's preferred node.

The kernel already has NUMA page migration statistics in
/sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched,
but does not have statistics for task migrations/swaps.
Add task migration and swap counters accordingly.

The following two new fields:

numa_task_migrated
numa_task_swapped

will be displayed in both
/sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched

Introducing both per-task and per-memcg NUMA balancing statistics helps
to quickly evaluate the performance and resource usage of the target
workload. For example, the user can first identify the container with
high NUMA balancing activity, then narrow down to a specific task
within that group, and tune that task's memory policy.
In summary, it is possible to iterate over /proc/$pid/sched to find the
offending task, but aggregating the NUMA balancing activity of a
memcg's tasks can further help users identify the task in a
divide-and-conquer way.
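
As a rough sketch of that workflow (the cgroup path and PID below are
only illustrative), a small userspace reader of the new counters could
look like:

	/* numa_stat.c: minimal sketch; cgroup path and PID are made up */
	#include <stdio.h>
	#include <string.h>

	static void dump_numa_task_stats(const char *path)
	{
		char line[256];
		FILE *f = fopen(path, "r");

		if (!f)
			return;
		/* both files expose one counter per line */
		while (fgets(line, sizeof(line), f))
			if (strstr(line, "numa_task_"))
				printf("%s: %s", path, line);
		fclose(f);
	}

	int main(void)
	{
		/* step 1: find the cgroup with high NUMA balance activity */
		dump_numa_task_stats("/sys/fs/cgroup/mytest/memory.stat");
		/* step 2: narrow down to one task within that group */
		dump_numa_task_stats("/proc/1234/sched");
		return 0;
	}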

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
---
v2->v3:
Remove unnecessary p->mm check because kernel threads are
not supported by Numa Balancing. (Libo Chen)
v1->v2:
Update the Documentation/admin-guide/cgroup-v2.rst. (Michal)
---
 Documentation/admin-guide/cgroup-v2.rst | 6 ++++++
 include/linux/sched.h                   | 4 ++++
 include/linux/vm_event_item.h           | 2 ++
 kernel/sched/core.c                     | 7 +++++--
 kernel/sched/debug.c                    | 4 ++++
 mm/memcontrol.c                         | 2 ++
 mm/vmstat.c                             | 2 ++
 7 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 1a16ce68a4d7..d346f3235945 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1670,6 +1670,12 @@ The following nested keys are defined.
 	  numa_hint_faults (npn)
 		Number of NUMA hinting faults.
 
+	  numa_task_migrated (npn)
+		Number of task migration by NUMA balancing.
+
+	  numa_task_swapped (npn)
+		Number of task swap by NUMA balancing.
+
 	  pgdemote_kswapd
 		Number of pages demoted by kswapd.
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f96ac1982893..1c50e30b5c01 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -549,6 +549,10 @@ struct sched_statistics {
 	u64				nr_failed_migrations_running;
 	u64				nr_failed_migrations_hot;
 	u64				nr_forced_migrations;
+#ifdef CONFIG_NUMA_BALANCING
+	u64				numa_task_migrated;
+	u64				numa_task_swapped;
+#endif
 
 	u64				nr_wakeups;
 	u64				nr_wakeups_sync;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9e15a088ba38..91a3ce9a2687 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		NUMA_HINT_FAULTS,
 		NUMA_HINT_FAULTS_LOCAL,
 		NUMA_PAGE_MIGRATE,
+		NUMA_TASK_MIGRATE,
+		NUMA_TASK_SWAP,
 #endif
 #ifdef CONFIG_MIGRATION
 		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c81cf642dba0..25a92f2abda4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3352,6 +3352,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #ifdef CONFIG_NUMA_BALANCING
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
+	__schedstat_inc(p->stats.numa_task_swapped);
+	count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1);
+
 	if (task_on_rq_queued(p)) {
 		struct rq *src_rq, *dst_rq;
 		struct rq_flags srf, drf;
@@ -7953,8 +7956,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
 		return -EINVAL;
 
-	/* TODO: This is not properly updating schedstats */
-
+	__schedstat_inc(p->stats.numa_task_migrated);
+	count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1);
 	trace_sched_move_numa(p, curr_cpu, target_cpu);
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 56ae54e0ce6a..f971c2af7912 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		P_SCHEDSTAT(nr_failed_migrations_running);
 		P_SCHEDSTAT(nr_failed_migrations_hot);
 		P_SCHEDSTAT(nr_forced_migrations);
+#ifdef CONFIG_NUMA_BALANCING
+		P_SCHEDSTAT(numa_task_migrated);
+		P_SCHEDSTAT(numa_task_swapped);
+#endif
 		P_SCHEDSTAT(nr_wakeups);
 		P_SCHEDSTAT(nr_wakeups_sync);
 		P_SCHEDSTAT(nr_wakeups_migrate);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c96c1f2b9cf5..cdaab8a957f3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -463,6 +463,8 @@ static const unsigned int memcg_vm_event_stat[] = {
 	NUMA_PAGE_MIGRATE,
 	NUMA_PTE_UPDATES,
 	NUMA_HINT_FAULTS,
+	NUMA_TASK_MIGRATE,
+	NUMA_TASK_SWAP,
 #endif
 };
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4c268ce39ff2..ed08bb384ae4 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = {
 	"numa_hint_faults",
 	"numa_hint_faults_local",
 	"numa_pages_migrated",
+	"numa_task_migrated",
+	"numa_task_swapped",
 #endif
 #ifdef CONFIG_MIGRATION
 	"pgmigrate_success",
-- 
2.25.1



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-04-30 10:36 [PATCH v3] sched/numa: add statistics of numa balance task migration Chen Yu
@ 2025-05-01  7:00 ` Libo Chen
  2025-05-02  9:30   ` Chen, Yu C
  2025-05-05  6:43 ` Jain, Ayush
  1 sibling, 1 reply; 15+ messages in thread
From: Libo Chen @ 2025-05-01  7:00 UTC (permalink / raw)
  To: Chen Yu, Peter Zijlstra, Andrew Morton
  Cc: Ingo Molnar, Tejun Heo, Johannes Weiner, Jonathan Corbet,
	Mel Gorman, Michal Hocko, Michal Koutny, Muchun Song,
	Roman Gushchin, Shakeel Butt, Chen, Tim C, Aubrey Li, cgroups,
	linux-doc, linux-mm, linux-kernel, K Prateek Nayak,
	Madadi Vineeth Reddy

Hi Chen Yu

On 4/30/25 03:36, Chen Yu wrote:
> On systems with NUMA balancing enabled, it has proven helpful to track
> task activity caused by NUMA balancing. NUMA balancing has two
> mechanisms for task migration: one is to migrate a task to an idle CPU
> on its preferred node, the other is to swap tasks running on different
> nodes when each is on the other's preferred node.
> 
> The kernel already has NUMA page migration statistics in
> /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched,
> but does not have statistics for task migrations/swaps.
> Add task migration and swap counters accordingly.
> 
> The following two new fields:
> 
> numa_task_migrated
> numa_task_swapped
> 
> will be displayed in both
> /sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched
> 

Both stats show up in expected places, but I notice they are also in
/proc/vmstat and are always 0. 

I think you may have to add count_vm_numa_event() in migrate_task_to()
and __migrate_swap_task() unless there is a way to not show both stats
in /proc/vmstat.
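
Something like the untested sketch below is what I mean;
count_vm_numa_event() is the same helper the existing numa_* vmstat
counters use, and migrate_task_to() would get the NUMA_TASK_MIGRATE
equivalent:

@@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
 	__schedstat_inc(p->stats.numa_task_swapped);
+	count_vm_numa_event(NUMA_TASK_SWAP);	/* feed /proc/vmstat too */
 	count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1);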


> Introducing both per-task and per-memcg NUMA balancing statistics helps
> to quickly evaluate the performance and resource usage of the target
> workload. For example, the user can first identify the container with
> high NUMA balancing activity, then narrow down to a specific task
> within that group, and tune that task's memory policy.
> In summary, it is possible to iterate over /proc/$pid/sched to find the
> offending task, but aggregating the NUMA balancing activity of a
> memcg's tasks can further help users identify the task in a
> divide-and-conquer way.
> 
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
> v2->v3:
> Remove unnecessary p->mm check because kernel threads are
> not supported by Numa Balancing. (Libo Chen)
> v1->v2:
> Update the Documentation/admin-guide/cgroup-v2.rst. (Michal)
> ---
>  Documentation/admin-guide/cgroup-v2.rst | 6 ++++++
>  include/linux/sched.h                   | 4 ++++
>  include/linux/vm_event_item.h           | 2 ++
>  kernel/sched/core.c                     | 7 +++++--
>  kernel/sched/debug.c                    | 4 ++++
>  mm/memcontrol.c                         | 2 ++
>  mm/vmstat.c                             | 2 ++
>  7 files changed, 25 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 1a16ce68a4d7..d346f3235945 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1670,6 +1670,12 @@ The following nested keys are defined.
>  	  numa_hint_faults (npn)
>  		Number of NUMA hinting faults.
>  
> +	  numa_task_migrated (npn)
> +		Number of task migration by NUMA balancing.
> +
> +	  numa_task_swapped (npn)
> +		Number of task swap by NUMA balancing.
> +
>  	  pgdemote_kswapd
>  		Number of pages demoted by kswapd.
>  
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f96ac1982893..1c50e30b5c01 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -549,6 +549,10 @@ struct sched_statistics {
>  	u64				nr_failed_migrations_running;
>  	u64				nr_failed_migrations_hot;
>  	u64				nr_forced_migrations;
> +#ifdef CONFIG_NUMA_BALANCING
> +	u64				numa_task_migrated;
> +	u64				numa_task_swapped;
> +#endif
>  

This one is more of personal preference. I understand they show up only if
you turn on schedstats, but will it be better to put them in sched_show_numa()
so they will be printed out next to other numa stats such as numa_pages_migrated?

@@ -1153,6 +1153,10 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
        if (p->mm)
                P(mm->numa_scan_seq);

+       if (schedstat_enabled()) {
+               P_SCHEDSTAT(numa_task_migrated);
+               P_SCHEDSTAT(numa_task_swapped);
+       }
        P(numa_pages_migrated);
        P(numa_preferred_nid);
        P(total_numa_faults);


Thanks,
Libo
>  	u64				nr_wakeups;
>  	u64				nr_wakeups_sync;
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 9e15a088ba38..91a3ce9a2687 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>  		NUMA_HINT_FAULTS,
>  		NUMA_HINT_FAULTS_LOCAL,
>  		NUMA_PAGE_MIGRATE,
> +		NUMA_TASK_MIGRATE,
> +		NUMA_TASK_SWAP,
>  #endif
>  #ifdef CONFIG_MIGRATION
>  		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c81cf642dba0..25a92f2abda4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3352,6 +3352,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
>  #ifdef CONFIG_NUMA_BALANCING
>  static void __migrate_swap_task(struct task_struct *p, int cpu)
>  {
> +	__schedstat_inc(p->stats.numa_task_swapped);
> +	count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1);
> +
>  	if (task_on_rq_queued(p)) {
>  		struct rq *src_rq, *dst_rq;
>  		struct rq_flags srf, drf;
> @@ -7953,8 +7956,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
>  	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
>  		return -EINVAL;
>  
> -	/* TODO: This is not properly updating schedstats */
> -
> +	__schedstat_inc(p->stats.numa_task_migrated);
> +	count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1);
>  	trace_sched_move_numa(p, curr_cpu, target_cpu);
>  	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
>  }
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 56ae54e0ce6a..f971c2af7912 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>  		P_SCHEDSTAT(nr_failed_migrations_running);
>  		P_SCHEDSTAT(nr_failed_migrations_hot);
>  		P_SCHEDSTAT(nr_forced_migrations);
> +#ifdef CONFIG_NUMA_BALANCING
> +		P_SCHEDSTAT(numa_task_migrated);
> +		P_SCHEDSTAT(numa_task_swapped);
> +#endif
>  		P_SCHEDSTAT(nr_wakeups);
>  		P_SCHEDSTAT(nr_wakeups_sync);
>  		P_SCHEDSTAT(nr_wakeups_migrate);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c96c1f2b9cf5..cdaab8a957f3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -463,6 +463,8 @@ static const unsigned int memcg_vm_event_stat[] = {
>  	NUMA_PAGE_MIGRATE,
>  	NUMA_PTE_UPDATES,
>  	NUMA_HINT_FAULTS,
> +	NUMA_TASK_MIGRATE,
> +	NUMA_TASK_SWAP,
>  #endif
>  };
>  
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 4c268ce39ff2..ed08bb384ae4 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = {
>  	"numa_hint_faults",
>  	"numa_hint_faults_local",
>  	"numa_pages_migrated",
> +	"numa_task_migrated",
> +	"numa_task_swapped",
>  #endif
>  #ifdef CONFIG_MIGRATION
>  	"pgmigrate_success",



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-01  7:00 ` Libo Chen
@ 2025-05-02  9:30   ` Chen, Yu C
  0 siblings, 0 replies; 15+ messages in thread
From: Chen, Yu C @ 2025-05-02  9:30 UTC (permalink / raw)
  To: Libo Chen
  Cc: Ingo Molnar, Tejun Heo, Johannes Weiner, Jonathan Corbet,
	Mel Gorman, Michal Hocko, Michal Koutny, Muchun Song,
	Roman Gushchin, Shakeel Butt, Chen, Tim C, Aubrey Li, cgroups,
	linux-doc, linux-mm, linux-kernel, K Prateek Nayak,
	Madadi Vineeth Reddy, Peter Zijlstra, Andrew Morton

Hi Libo,

On 5/1/2025 3:00 PM, Libo Chen wrote:
> Hi Chen Yu
> 
> On 4/30/25 03:36, Chen Yu wrote:
>> On systems with NUMA balancing enabled, it has proven helpful to track
>> task activity caused by NUMA balancing. NUMA balancing has two
>> mechanisms for task migration: one is to migrate a task to an idle CPU
>> on its preferred node, the other is to swap tasks running on different
>> nodes when each is on the other's preferred node.
>>
>> The kernel already has NUMA page migration statistics in
>> /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched,
>> but does not have statistics for task migrations/swaps.
>> Add task migration and swap counters accordingly.
>>
>> The following two new fields:
>>
>> numa_task_migrated
>> numa_task_swapped
>>
>> will be displayed in both
>> /sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched
>>
> 
> Both stats show up in expected places, but I notice they are also in
> /proc/vmstat and are always 0.
> 
> I think you may have to add count_vm_numa_event() in migrate_task_to()
> and __migrate_swap_task() unless there is a way to not show both stats
> in /proc/vmstat.
> 
> 

Thanks for catching this. For some reason, I added these
"placeholders" to vmstat but didn't populate them. Let me
remove these items from vmstat, because we mainly care
about task activity rather than page activity (not a vm
event, I suppose). Let me have a try at this.

>> Introducing both per-task and per-memcg NUMA balancing statistics helps
>> to quickly evaluate the performance and resource usage of the target
>> workload. For example, the user can first identify the container with
>> high NUMA balancing activity, then narrow down to a specific task
>> within that group, and tune that task's memory policy.
>> In summary, it is possible to iterate over /proc/$pid/sched to find the
>> offending task, but aggregating the NUMA balancing activity of a
>> memcg's tasks can further help users identify the task in a
>> divide-and-conquer way.
>>
>> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
>> Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
>> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
>> ---
>> v2->v3:
>> Remove unnecessary p->mm check because kernel threads are
>> not supported by Numa Balancing. (Libo Chen)
>> v1->v2:
>> Update the Documentation/admin-guide/cgroup-v2.rst. (Michal)
>> ---
>>   Documentation/admin-guide/cgroup-v2.rst | 6 ++++++
>>   include/linux/sched.h                   | 4 ++++
>>   include/linux/vm_event_item.h           | 2 ++
>>   kernel/sched/core.c                     | 7 +++++--
>>   kernel/sched/debug.c                    | 4 ++++
>>   mm/memcontrol.c                         | 2 ++
>>   mm/vmstat.c                             | 2 ++
>>   7 files changed, 25 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
>> index 1a16ce68a4d7..d346f3235945 100644
>> --- a/Documentation/admin-guide/cgroup-v2.rst
>> +++ b/Documentation/admin-guide/cgroup-v2.rst
>> @@ -1670,6 +1670,12 @@ The following nested keys are defined.
>>   	  numa_hint_faults (npn)
>>   		Number of NUMA hinting faults.
>>   
>> +	  numa_task_migrated (npn)
>> +		Number of task migration by NUMA balancing.
>> +
>> +	  numa_task_swapped (npn)
>> +		Number of task swap by NUMA balancing.
>> +
>>   	  pgdemote_kswapd
>>   		Number of pages demoted by kswapd.
>>   
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index f96ac1982893..1c50e30b5c01 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -549,6 +549,10 @@ struct sched_statistics {
>>   	u64				nr_failed_migrations_running;
>>   	u64				nr_failed_migrations_hot;
>>   	u64				nr_forced_migrations;
>> +#ifdef CONFIG_NUMA_BALANCING
>> +	u64				numa_task_migrated;
>> +	u64				numa_task_swapped;
>> +#endif
>>   
> 
> This one is more of personal preference. I understand they show up only if
> you turn on schedstats, but will it be better to put them in sched_show_numa()
> so they will be printed out next to other numa stats such as numa_pages_migrated?
> 
> @@ -1153,6 +1153,10 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
>          if (p->mm)
>                  P(mm->numa_scan_seq);
> 
> +       if (schedstat_enabled()) {
> +               P_SCHEDSTAT(numa_task_migrated);
> +               P_SCHEDSTAT(numa_task_swapped);
> +       }
>          P(numa_pages_migrated);
>          P(numa_preferred_nid);
>          P(total_numa_faults);
> 
> 

In this patch, numa_task_migrated and numa_task_swapped are
already placed under the scope of schedstat_enabled() in
proc_sched_show_task().

We mainly care about task migration activity, so they are put near
nr_forced_migrations. By the time sched_show_numa() is reached,
P_SCHEDSTAT has already been #undef'd. It is just simpler to do this
directly in proc_sched_show_task(), IMO.
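
The pattern is roughly the following (simplified, not the exact
layout of kernel/sched/debug.c):

	#define P_SCHEDSTAT(F)	__PS(#F, schedstat_val(p->stats.F))

	/* ... the P_SCHEDSTAT() printing block in proc_sched_show_task() ... */

	#undef P_SCHEDSTAT	/* no longer usable past this point */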

Thanks,
Chenyu


> Thanks,
> Libo
>>   	u64				nr_wakeups;
>>   	u64				nr_wakeups_sync;
>> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
>> index 9e15a088ba38..91a3ce9a2687 100644
>> --- a/include/linux/vm_event_item.h
>> +++ b/include/linux/vm_event_item.h
>> @@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>>   		NUMA_HINT_FAULTS,
>>   		NUMA_HINT_FAULTS_LOCAL,
>>   		NUMA_PAGE_MIGRATE,
>> +		NUMA_TASK_MIGRATE,
>> +		NUMA_TASK_SWAP,
>>   #endif
>>   #ifdef CONFIG_MIGRATION
>>   		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index c81cf642dba0..25a92f2abda4 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -3352,6 +3352,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
>>   #ifdef CONFIG_NUMA_BALANCING
>>   static void __migrate_swap_task(struct task_struct *p, int cpu)
>>   {
>> +	__schedstat_inc(p->stats.numa_task_swapped);
>> +	count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1);
>> +
>>   	if (task_on_rq_queued(p)) {
>>   		struct rq *src_rq, *dst_rq;
>>   		struct rq_flags srf, drf;
>> @@ -7953,8 +7956,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
>>   	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
>>   		return -EINVAL;
>>   
>> -	/* TODO: This is not properly updating schedstats */
>> -
>> +	__schedstat_inc(p->stats.numa_task_migrated);
>> +	count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1);
>>   	trace_sched_move_numa(p, curr_cpu, target_cpu);
>>   	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
>>   }
>> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
>> index 56ae54e0ce6a..f971c2af7912 100644
>> --- a/kernel/sched/debug.c
>> +++ b/kernel/sched/debug.c
>> @@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>>   		P_SCHEDSTAT(nr_failed_migrations_running);
>>   		P_SCHEDSTAT(nr_failed_migrations_hot);
>>   		P_SCHEDSTAT(nr_forced_migrations);
>> +#ifdef CONFIG_NUMA_BALANCING
>> +		P_SCHEDSTAT(numa_task_migrated);
>> +		P_SCHEDSTAT(numa_task_swapped);
>> +#endif
>>   		P_SCHEDSTAT(nr_wakeups);
>>   		P_SCHEDSTAT(nr_wakeups_sync);
>>   		P_SCHEDSTAT(nr_wakeups_migrate);
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index c96c1f2b9cf5..cdaab8a957f3 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -463,6 +463,8 @@ static const unsigned int memcg_vm_event_stat[] = {
>>   	NUMA_PAGE_MIGRATE,
>>   	NUMA_PTE_UPDATES,
>>   	NUMA_HINT_FAULTS,
>> +	NUMA_TASK_MIGRATE,
>> +	NUMA_TASK_SWAP,
>>   #endif
>>   };
>>   
>> diff --git a/mm/vmstat.c b/mm/vmstat.c
>> index 4c268ce39ff2..ed08bb384ae4 100644
>> --- a/mm/vmstat.c
>> +++ b/mm/vmstat.c
>> @@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = {
>>   	"numa_hint_faults",
>>   	"numa_hint_faults_local",
>>   	"numa_pages_migrated",
>> +	"numa_task_migrated",
>> +	"numa_task_swapped",
>>   #endif
>>   #ifdef CONFIG_MIGRATION
>>   	"pgmigrate_success",
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-04-30 10:36 [PATCH v3] sched/numa: add statistics of numa balance task migration Chen Yu
  2025-05-01  7:00 ` Libo Chen
@ 2025-05-05  6:43 ` Jain, Ayush
  2025-05-05 15:03   ` Chen, Yu C
  1 sibling, 1 reply; 15+ messages in thread
From: Jain, Ayush @ 2025-05-05  6:43 UTC (permalink / raw)
  To: Chen Yu, Peter Zijlstra, Andrew Morton
  Cc: Ingo Molnar, Tejun Heo, Johannes Weiner, Jonathan Corbet,
	Mel Gorman, Michal Hocko, Michal Koutny, Muchun Song,
	Roman Gushchin, Shakeel Butt, Chen, Tim C, Aubrey Li, Libo Chen,
	cgroups, linux-doc, linux-mm, linux-kernel, K Prateek Nayak,
	Madadi Vineeth Reddy, Neeraj.Upadhyay


Hello,

Hitting Kernel Panic on latest-next while running rcutorture tests

37ff6e9a2ce3 ("Add linux-next specific files for 20250502")

reverting this patch fixes it
3b2339eeb032 ("sched-numa-add-statistics-of-numa-balance-task-migration-v3")
https://web.git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/kernel/sched/core.c?id=3b2339eeb032627e9329daf70a4ba8cd62c9cc8d

by looking at RIP pointer

$ ./scripts/faddr2line vmlinux __migrate_swap_task+0x2e/0x180
__migrate_swap_task+0x2e/0x180:
count_memcg_events_mm at include/linux/memcontrol.h:987
(inlined by) count_memcg_events_mm at include/linux/memcontrol.h:978
(inlined by) __migrate_swap_task at kernel/sched/core.c:3356

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
mm->owner -> NULL
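
For context, count_memcg_events_mm() does roughly the following
(paraphrased from include/linux/memcontrol.h), so a task with a NULL
mm faults on the mm->owner load:

	static inline void count_memcg_events_mm(struct mm_struct *mm,
						 enum vm_event_item idx,
						 unsigned long count)
	{
		struct mem_cgroup *memcg;

		if (mem_cgroup_disabled())
			return;

		rcu_read_lock();
		/* oopses here when mm == NULL, e.g. for a kernel thread */
		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (likely(memcg))
			count_memcg_events(memcg, idx, count);
		rcu_read_unlock();
	}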

Attaching kernel logs below:

[ 1070.635450] rcu-torture: rcu_torture_read_exit: End of episode
[ 1074.047617] BUG: kernel NULL pointer dereference, address:
0000000000000498
[ 1074.054577] #PF: supervisor read access in kernel mode
[ 1074.059718] #PF: error_code(0x0000) - not-present page
[ 1074.064856] PGD 0 P4D 0
[ 1074.067395] Oops: Oops: 0000 [#1] SMP NOPTI
[ 1074.071583] CPU: 48 UID: 0 PID: 307 Comm: migration/48 Not tainted
6.15.0-rc4-next-20250502-37ff6e9a2ce3-1746413815614 #1 PREEMPT(voluntary)
[ 1074.084258] Hardware name: Dell Inc. PowerEdge R6515/0R4CNN, BIOS
2.16.0 07/09/2024
[ 1074.091913] Stopper: multi_cpu_stop+0x0/0x130 <- migrate_swap+0xad/0x120
[ 1074.098619] RIP: 0010:__migrate_swap_task+0x2e/0x180
[ 1074.103585] Code: 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53
48 63 de 48 83 87 a0 03 00 00 01 66 90 4c 8b af 50 09 00 00 e8 c2 47 07
00 <49> 8b bd 98 04 00 00 e8 26 11 36 00 48 89 c7 48 85 c0 74 0a be 3b
[ 1074.122332] RSP: 0018:ffffa4bc4d54bdb0 EFLAGS: 00010002
[ 1074.127557] RAX: 0000000000000001 RBX: 0000000000000007 RCX:
0000000000000000
[ 1074.134688] RDX: ffff8d80c01fcec0 RSI: 0000000000000007 RDI:
ffff8d2153c93480
[ 1074.141822] RBP: ffffa4bc4d54bdd8 R08: 000000fa1239fb41 R09:
ffff8d9f3e832380
[ 1074.148955] R10: 0000000000000004 R11: 0000000000000001 R12:
ffff8d2153c93480
[ 1074.156088] R13: 0000000000000000 R14: ffff8d60dc9ac14c R15:
ffff8d2153c9414c
[ 1074.163218] FS:  0000000000000000(0000) GS:ffff8d9f8a626000(0000)
knlGS:0000000000000000
[ 1074.171306] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1074.177051] CR2: 000000000000049op+0x10/0x10
[ 1074.203665]  cpu_stopper_thread+0xa6/0x160
[ 1074.207767]  smpboot_thread_fn+0x122/0x280
[ 1074.211866]  kthread+0x11a/0x230
[ 1074.215098]  ? __pfx_smpboot_thread_fn+0x10/0x10
[ 1074.219717]  ? _raw_spin_unlock_irq+0x28/0x50
[ 1074.224076]  ? __pfx_kthread+0x10/0x10
[ 1074.227829]  ret_from_fork+0x40/0x60
[ 1074.231407]  ? __pfx_kthread+0x10/0x10
[ 1074.235161]  ret_from_fork_asm+0x1a/0x30
[ 1074.239089]  </TASK>
[ 1074.241279] Modules linked in: rcutorture torture xt_tcpudp
nft_compat nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6
nf_defrag_ipv4 nf_tables nfnetlink binfmt_misc ipmi_ssif nls_iso8859_1
intel_rapl_msr intel_rapl_common amd64_edac edac_mce_amd kvm_amd
dell_smbios wmi_bmof kvm dell_wmi_descriptor dcdbas rapl ccp k10temp
acpi_power_meter ptdma wmi ipmi_si acpi_ipmi ipmi_devintf
ipmi_msghandler mac_hid sch_fq_codel dm_multipath scsi_dh_rdac
scsi_dh_emc scsi_dh_alua msr fuse efi_pstore ip_tables x_tables autofs4
btrfs blake2b_generic raid10 raid456 async_raid6_recov async_memcpy
async_pq async_xor async_tx xor raid6_pq raid1 raid0 mgag200
drm_client_lib i2c_algo_bit drm_shmem_helper drm_kms_helper nvme
ghash_clmulni_intel drm tg3 mpt3sas nvme_core ahci bnxt_en i2c_piix4
raid_class libahci i2c_smbus scsi_transport_sas aesni_intel [last
unloaded: torture]
[ 1074.316817] CR2: 0000000000000498
[ 1074.320135] ---[ end trace 0000000000000000 ]---
[ 1074.418846] pstore: backend (erst) writing error (-28)
[ 1074.423983] RIP: 0010:__migrate_swap_task+0x2e/0x180
[ 1074.428949] Code: 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53
48 63 de 48 83 87 a0 03 00 00 01 66 90 4c 8b af 50 09 00 00 e8 c2 47 07
00 <49> 8b bd 98 04 00 00 e8 26 11 36 00 48 89 c7 48 85 c0 74 0a be 3b
[ 1074.447694] RSP: 0018:ffffa4bc4d54bdb0 EFLAGS: 00010002
[ 1074.452919] RAX: 0000000000000001 RBX: 0000000000000007 RCX:
0000000000000000
[ 1074.460051] RDX: ffff8d80c01fcec0 RSI: 0000000000000007 RDI:
ffff8d2153c93480
[ 1074.467184] RBP: ffffa4bc4d54bdd8 R08: 000000fa1239fb41 R09:
ffff8d9f3e832380
[ 1074.474317] R10: 0000000000000004 R11: 0000000000000001 R12:
ffff8d2153c93480
[ 1074.481450] R13: 0000000000000000 R14: ffff8d60dc9ac14c R15:
ffff8d2153c9414c
[ 1074.488581] FS:  0000000000000000(0000) GS:ffff8d9f8a626000(0000)
knlGS:0000000000000000
[ 1074.496666] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1074.502414] CR2: 0000000000000498 CR3: 000000409341a002 CR4:
0000000000770ef0
[ 1074.509547] PKRU: 55555554
[ 1074.512258] note: migration/48[307] exited with irqs disabled
[ 1084.683268] watchdog: CPU6: Watchdog detected hard LOCKUP on cpu 6
[ 1084.683274] Modules linked in: rcutorture torture xt_tcpudp
nft_compat nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6
nf_defrag_ipv4 nf_tables nfnetlink binfmt_misc ipmi_ssif nls_iso8859_1
intel_rapl_msr intel_rapl_common amd64_edac edac_mce_amd kvm_amd
dell_smbios wmi_bmof kvm dell_wmi_descriptor dcdbas rapl ccp k10temp
acpi_power_meter ptdma wmi ipmi_si acpi_ipmi ipmi_devintf
ipmi_msghandler mac_hid sch_fq_codel dm_multipath scsi_dh_rdac
scsi_dh_emc scsi_dh_alua msr fuse efi_pstore ip_tables x_tables autofs4
btrfs blake2b_generic raid10 raid456 async_raid6_recov async_memcpy
async_pq async_xor async_tx xor raid6_pq raid1 raid0 mgag200
drm_client_lib i2c_algo_bit drm_shmem_helper drm_kms_helper nvme
ghash_clmulni_intel drm tg3 mpt3sas nvme_core ahci bnxt_en i2c_piix4
raid_class libahci i2c_smbus scsi_transport_sas aesni_intel [last
unloaded: torture]
[ 1084.683352] CPU: 6 UID: 0 PID: 83659 Comm: rcu_torture_rea Tainted: G
     D             6.15.0-rc4-next-20250502-37ff6e9a2ce3-1746413815614
#1 PREEMPT(voluntary)
[ 1084.683357] Tainted: [D]=DIE
[ 1084.683358] Hardware name: Dell Inc. PowerEdge R6515/0R4CNN, BIOS
2.16.0 07/09/2024
[ 1084.683360] RIP: 0010:native_queued_spin_lock_slowpath+0x2b4/0x300
[ 1084.683368] Code: 63 ff 4c 8d a8 c0 d1 20 b4 49 81 ff ff 1f 00 00 77
46 4e 03 2c fd e0 5e f7 b2 49 89 5d 00 8b 43 08 85 c0 75 09 f3 90 8b 43
08 <85> c0 74 f7 48 8b 13 48 85 d2 0f 84 5e ff ff ff 0f 0d 0a e9 56 ff
[ 1084.683370] RSP: 0018:ffffa4bc6b503a28 EFLAGS: 00000046
[ 1084.683373] RAX: 0000000000000000 RBX: ffff8d403f9b31c0 RCX:
0000000000000008
[ 1084.683375] RDX: 0000000000000047 RSI: 00000000011c0100 RDI:
ffff8d403f9f2280
[ 1084.683376] RBP: ffffa4bc6b503a50 R08: 0000000000000080 R09:
ffffffffffffff00
[ 1084.683377] R10: 0000000000000000 R11: 0000000000000080 R12:
ffff8d403f9f2280
[ 1084.683379] R13: ffff8d403fdb31c0 R14: 00000000001c0000 R15:
0000000000000046
[ 1084.683380] FS:  0000000000000000(0000) GS:ffff8d408b7a6000(0000)
knlGS:0000000000000000
[ 1084.683382] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1084.683384] CR2: 00007f54f32a3010 CR3: 000000209f547008 CR4:
0000000000770ef0
[ 1084.683385] PKRU: 55555554
[ 1084.683387] Call Trace:
[ 1084.683388]  <TASK>
[ 1084.683395]  _raw_spin_lock+0x3c/0x50
[ 1084.683399]  raw_spin_rq_lock_nested+0x28/0xa0
[ 1084.683404]  _raw_spin_rq_lock_irqsave+0x29/0x60
[ 1084.683408]  sched_balance_rq+0x6c8/0x1430
[ 1084.683412]  ? srso_alias_return_thunk+0x5/0xfbef5
[ 1084.683422]  sched_balance_newidle+0x1ba/0x450
[ 1084.683426]  pick_next_task_fair+0x39/0x500
[ 1084.683429]  ? srso_alias_return_thunk+0x5/0xfbef5
[ 1084.683431]  ? dequeue_task_fair+0xb1/0x1b0
[ 1084.683433]  ? srso_alias_return_thunk+0x5/0xfbef5
[ 1084.683436]  __pick_next_task+0x43/0x1b0
[ 1084.683440]  __schedule+0x20c/0x15b0
[ 1084.683443]  ? trace_preempt_on+0x1f/0x70
[ 1084.683447]  ? srso_alias_return_thunk+0x5/0xfbef5
[ 1084.683450]  ? preempt_count_sub+0x50/0x80
[ 1084.683452]  ? srso_alias_return_thunk+0x5/0xfbef5
[ 1084.683455]  ? hrtimer_start_range_ns+0x137/0x4b0
[ 1084.683459]  ? srso_alias_return_thunk+0x5/0xfbef5
[ 1084.683463]  schedule+0x_us+0x23/0x30 [torture]
[ 1084.683489]  rcu_torture_reader+0x138/0x200 [rcutorture]
[ 1084.683496]  ? __pfx_rcu_torture_timer+0x10/0x10 [rcutorture]
[ 1084.683503]  kthread+0x11a/0x230
[ 1084.683507]  ? __pfx_rcu_torture_reader+0x10/0x10 [rcutorture]
[ 1084.683512]  ? _raw_spin_unlock_irq+0x28/0x50
[ 1084.683516]  ? __pfx_kthread+0x10/0x10
[ 1084.683519]  ret_from_fork+0x40/0x60
[ 1084.683524]  ? __pfx_kthread+0x10/0x10
[ 1084.683527]  ret_from_fork_asm+0x1a/0x30
[ 1084.683535]  </TASK>
[ 1084.683537] Kernel panic - not syncing: Hard LOCKUP
[ 1086.154471] Shutting down cpus with NMI
[ 1086.169269] Kernel Offset: 0x30200000 from 0xffffffff81000000
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[ 1086.583614] ---[ end Kernel panic - not syncing: Hard LOCKUP ]---


Test recreate steps:
1. Load the rcutorture module on the machine.
2. Toggle CPU status (online/offline); see the sketch after the link below.

https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/generic/rcutorture.py
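
A minimal sketch of the CPU toggling in step 2 (assumes root and
CONFIG_HOTPLUG_CPU; the CPU number is only illustrative):

	/* toggle_cpu.c: write 0/1 to /sys/devices/system/cpu/cpuN/online */
	#include <stdio.h>

	static int set_cpu_online(int cpu, int online)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/online", cpu);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%d\n", online);
		return fclose(f);
	}

	int main(void)
	{
		set_cpu_online(48, 0);	/* offline one CPU ... */
		set_cpu_online(48, 1);	/* ... and bring it back */
		return 0;
	}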


Reported same at:
https://lore.kernel.org/linux-next/8f746aa3-9ee6-45a8-84b1-da335be17c2e@amd.com/T/#mc98b701dcd3667ff8a18de8581936ee257238884

Reported-by: Ayush Jain <Ayush.jain3@amd.com>
Let me know if more details are needed from my end.

Thanks and Regards,
Ayush Jain

On 4/30/2025 4:06 PM, Chen Yu wrote:
> On systems with NUMA balancing enabled, it has proven helpful to track
> task activity caused by NUMA balancing. NUMA balancing has two
> mechanisms for task migration: one is to migrate a task to an idle CPU
> on its preferred node, the other is to swap tasks running on different
> nodes when each is on the other's preferred node.
> 
> The kernel already has NUMA page migration statistics in
> /sys/fs/cgroup/mytest/memory.stat and /proc/{PID}/sched,
> but does not have statistics for task migrations/swaps.
> Add task migration and swap counters accordingly.
> 
> The following two new fields:
> 
> numa_task_migrated
> numa_task_swapped
> 
> will be displayed in both
> /sys/fs/cgroup/{GROUP}/memory.stat and /proc/{PID}/sched
> 
> Introducing both per-task and per-memcg NUMA balancing statistics helps
> to quickly evaluate the performance and resource usage of the target
> workload. For example, the user can first identify the container with
> high NUMA balancing activity, then narrow down to a specific task
> within that group, and tune that task's memory policy.
> In summary, it is possible to iterate over /proc/$pid/sched to find the
> offending task, but aggregating the NUMA balancing activity of a
> memcg's tasks can further help users identify the task in a
> divide-and-conquer way.
> 
> Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
> Tested-by: Madadi Vineeth Reddy <vineethr@linux.ibm.com>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Chen Yu <yu.c.chen@intel.com>
> ---
> v2->v3:
> Remove unnecessary p->mm check because kernel threads are
> not supported by Numa Balancing. (Libo Chen)
> v1->v2:
> Update the Documentation/admin-guide/cgroup-v2.rst. (Michal)
> ---
>  Documentation/admin-guide/cgroup-v2.rst | 6 ++++++
>  include/linux/sched.h                   | 4 ++++
>  include/linux/vm_event_item.h           | 2 ++
>  kernel/sched/core.c                     | 7 +++++--
>  kernel/sched/debug.c                    | 4 ++++
>  mm/memcontrol.c                         | 2 ++
>  mm/vmstat.c                             | 2 ++
>  7 files changed, 25 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 1a16ce68a4d7..d346f3235945 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1670,6 +1670,12 @@ The following nested keys are defined.
>  	  numa_hint_faults (npn)
>  		Number of NUMA hinting faults.
>  
> +	  numa_task_migrated (npn)
> +		Number of task migration by NUMA balancing.
> +
> +	  numa_task_swapped (npn)
> +		Number of task swap by NUMA balancing.
> +
>  	  pgdemote_kswapd
>  		Number of pages demoted by kswapd.
>  
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f96ac1982893..1c50e30b5c01 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -549,6 +549,10 @@ struct sched_statistics {
>  	u64				nr_failed_migrations_running;
>  	u64				nr_failed_migrations_hot;
>  	u64				nr_forced_migrations;
> +#ifdef CONFIG_NUMA_BALANCING
> +	u64				numa_task_migrated;
> +	u64				numa_task_swapped;
> +#endif
>  
>  	u64				nr_wakeups;
>  	u64				nr_wakeups_sync;
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 9e15a088ba38..91a3ce9a2687 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -66,6 +66,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>  		NUMA_HINT_FAULTS,
>  		NUMA_HINT_FAULTS_LOCAL,
>  		NUMA_PAGE_MIGRATE,
> +		NUMA_TASK_MIGRATE,
> +		NUMA_TASK_SWAP,
>  #endif
>  #ifdef CONFIG_MIGRATION
>  		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c81cf642dba0..25a92f2abda4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3352,6 +3352,9 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
>  #ifdef CONFIG_NUMA_BALANCING
>  static void __migrate_swap_task(struct task_struct *p, int cpu)
>  {
> +	__schedstat_inc(p->stats.numa_task_swapped);
> +	count_memcg_events_mm(p->mm, NUMA_TASK_SWAP, 1);
> +
>  	if (task_on_rq_queued(p)) {
>  		struct rq *src_rq, *dst_rq;
>  		struct rq_flags srf, drf;
> @@ -7953,8 +7956,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
>  	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
>  		return -EINVAL;
>  
> -	/* TODO: This is not properly updating schedstats */
> -
> +	__schedstat_inc(p->stats.numa_task_migrated);
> +	count_memcg_events_mm(p->mm, NUMA_TASK_MIGRATE, 1);
>  	trace_sched_move_numa(p, curr_cpu, target_cpu);
>  	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
>  }
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 56ae54e0ce6a..f971c2af7912 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -1206,6 +1206,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>  		P_SCHEDSTAT(nr_failed_migrations_running);
>  		P_SCHEDSTAT(nr_failed_migrations_hot);
>  		P_SCHEDSTAT(nr_forced_migrations);
> +#ifdef CONFIG_NUMA_BALANCING
> +		P_SCHEDSTAT(numa_task_migrated);
> +		P_SCHEDSTAT(numa_task_swapped);
> +#endif
>  		P_SCHEDSTAT(nr_wakeups);
>  		P_SCHEDSTAT(nr_wakeups_sync);
>  		P_SCHEDSTAT(nr_wakeups_migrate);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c96c1f2b9cf5..cdaab8a957f3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -463,6 +463,8 @@ static const unsigned int memcg_vm_event_stat[] = {
>  	NUMA_PAGE_MIGRATE,
>  	NUMA_PTE_UPDATES,
>  	NUMA_HINT_FAULTS,
> +	NUMA_TASK_MIGRATE,
> +	NUMA_TASK_SWAP,
>  #endif
>  };
>  
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 4c268ce39ff2..ed08bb384ae4 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1347,6 +1347,8 @@ const char * const vmstat_text[] = {
>  	"numa_hint_faults",
>  	"numa_hint_faults_local",
>  	"numa_pages_migrated",
> +	"numa_task_migrated",
> +	"numa_task_swapped",
>  #endif
>  #ifdef CONFIG_MIGRATION
>  	"pgmigrate_success",



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05  6:43 ` Jain, Ayush
@ 2025-05-05 15:03   ` Chen, Yu C
  2025-05-05 17:25     ` Venkat Rao Bagalkote
  2025-05-05 17:46     ` Michal Koutný
  0 siblings, 2 replies; 15+ messages in thread
From: Chen, Yu C @ 2025-05-05 15:03 UTC (permalink / raw)
  To: Jain, Ayush, Andrew Morton
  Cc: Ingo Molnar, Tejun Heo, Johannes Weiner, Jonathan Corbet,
	Mel Gorman, Michal Hocko, Michal Koutny, Muchun Song,
	Roman Gushchin, Shakeel Butt, Chen, Tim C, Aubrey Li, Libo Chen,
	cgroups, linux-doc, linux-mm, linux-kernel, K Prateek Nayak,
	Madadi Vineeth Reddy, Neeraj.Upadhyay, Peter Zijlstra

On 5/5/2025 2:43 PM, Jain, Ayush wrote:
> 
> Hello,
> 
> Hitting Kernel Panic on latest-next while running rcutorture tests
> 
> 37ff6e9a2ce3 ("Add linux-next specific files for 20250502")
> 
> reverting this patch fixes it
> 3b2339eeb032 ("sched-numa-add-statistics-of-numa-balance-task-migration-v3")
> https://web.git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/kernel/sched/core.c?id=3b2339eeb032627e9329daf70a4ba8cd62c9cc8d
> 
> by looking at RIP pointer
> 
> $ ./scripts/faddr2line vmlinux __migrate_swap_task+0x2e/0x180
> __migrate_swap_task+0x2e/0x180:
> count_memcg_events_mm at include/linux/memcontrol.h:987
> (inlined by) count_memcg_events_mm at include/linux/memcontrol.h:978
> (inlined by) __migrate_swap_task at kernel/sched/core.c:3356
> 
> memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
> mm->owner -> NULL
> 
> Attaching kernel logs below:
> 
> [ 1070.635450] rcu-torture: rcu_torture_read_exit: End of episode
> [ 1074.047617] BUG: kernel NULL pointer dereference, address:
> 0000000000000498

Thanks Ayush,

According to this address,
    4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
    49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
It seems that this task to be swapped has NULL mm_struct.

Does the following help?

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 96db6947bc92..0cb8cc4d551d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3353,7 +3353,8 @@ void set_task_cpu(struct task_struct *p, unsigned 
int new_cpu)
  static void __migrate_swap_task(struct task_struct *p, int cpu)
  {
         __schedstat_inc(p->stats.numa_task_swapped);
-       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
+       if (p->mm)
+               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);

         if (task_on_rq_queued(p)) {
                 struct rq *src_rq, *dst_rq;



Hi Andrew,
May I know if we can hold this patch and not merge it for now?
Besides this regression, Libo has another comment related to
this patch, which I'll address in the next version. Sorry for
the inconvenience.

thanks,
Chenyu


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 15:03   ` Chen, Yu C
@ 2025-05-05 17:25     ` Venkat Rao Bagalkote
  2025-05-07 11:36       ` Chen, Yu C
  2025-05-05 17:46     ` Michal Koutný
  1 sibling, 1 reply; 15+ messages in thread
From: Venkat Rao Bagalkote @ 2025-05-05 17:25 UTC (permalink / raw)
  To: Chen, Yu C, Jain, Ayush, Andrew Morton
  Cc: Ingo Molnar, Tejun Heo, Johannes Weiner, Jonathan Corbet,
	Mel Gorman, Michal Hocko, Michal Koutny, Muchun Song,
	Roman Gushchin, Shakeel Butt, Chen, Tim C, Aubrey Li, Libo Chen,
	cgroups, linux-doc, linux-mm, linux-kernel, K Prateek Nayak,
	Madadi Vineeth Reddy, Neeraj.Upadhyay, Peter Zijlstra,
	Madhavan Srinivasan


On 05/05/25 8:33 pm, Chen, Yu C wrote:
> On 5/5/2025 2:43 PM, Jain, Ayush wrote:
>>
>> Hello,
>>
>> Hitting Kernel Panic on latest-next while running rcutorture tests
>>
>> 37ff6e9a2ce3 ("Add linux-next specific files for 20250502")
>>
>> reverting this patch fixes it
>> 3b2339eeb032 
>> ("sched-numa-add-statistics-of-numa-balance-task-migration-v3")
>> https://web.git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/kernel/sched/core.c?id=3b2339eeb032627e9329daf70a4ba8cd62c9cc8d 
>>
>>
>> by looking at RIP pointer
>>
>> $ ./scripts/faddr2line vmlinux __migrate_swap_task+0x2e/0x180
>> __migrate_swap_task+0x2e/0x180:
>> count_memcg_events_mm at include/linux/memcontrol.h:987
>> (inlined by) count_memcg_events_mm at include/linux/memcontrol.h:978
>> (inlined by) __migrate_swap_task at kernel/sched/core.c:3356
>>
>> memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
>> mm->owner -> NULL
>>
>> Attaching kernel logs below:
>>
>> [ 1070.635450] rcu-torture: rcu_torture_read_exit: End of episode
>> [ 1074.047617] BUG: kernel NULL pointer dereference, address:
>> 0000000000000498
>
> Thanks Ayush,
>
> According to this address,
>    4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>    49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
> It seems that this task to be swapped has NULL mm_struct.
>
> Does the following help?
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 96db6947bc92..0cb8cc4d551d 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3353,7 +3353,8 @@ void set_task_cpu(struct task_struct *p, 
> unsigned int new_cpu)
>  static void __migrate_swap_task(struct task_struct *p, int cpu)
>  {
>         __schedstat_inc(p->stats.numa_task_swapped);
> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
> +       if (p->mm)
> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>
>         if (task_on_rq_queued(p)) {
>                 struct rq *src_rq, *dst_rq;
>

Hello Chenyu,


This issue is reproducible on IBM Power servers as well.

The proposed fix works fine. Hence,


Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>


Regards,

Venkat.

>
>
> Hi Andrew,
> May I know if we can hold this patch and not merge it for now?
> Besides this regression, Libo has another comment related to
> this patch, which I'll address in the next version. Sorry for
> the inconvenience.
>
> thanks,
> Chenyu
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 15:03   ` Chen, Yu C
  2025-05-05 17:25     ` Venkat Rao Bagalkote
@ 2025-05-05 17:46     ` Michal Koutný
  2025-05-05 18:27       ` Chen, Yu C
  1 sibling, 1 reply; 15+ messages in thread
From: Michal Koutný @ 2025-05-05 17:46 UTC (permalink / raw)
  To: Chen, Yu C
  Cc: Jain, Ayush, Andrew Morton, Ingo Molnar, Tejun Heo,
	Johannes Weiner, Jonathan Corbet, Mel Gorman, Michal Hocko,
	Muchun Song, Roman Gushchin, Shakeel Butt, Chen, Tim C,
	Aubrey Li, Libo Chen, cgroups, linux-doc, linux-mm, linux-kernel,
	K Prateek Nayak, Madadi Vineeth Reddy, Neeraj.Upadhyay,
	Peter Zijlstra

[-- Attachment #1: Type: text/plain, Size: 858 bytes --]

On Mon, May 05, 2025 at 11:03:10PM +0800, "Chen, Yu C" <yu.c.chen@intel.com> wrote:
> According to this address,
>    4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>    49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
> It seems that this task to be swapped has NULL mm_struct.

So it's likely a kernel thread. Does it make sense to NUMA balance
those? (I naïvely think it doesn't, please correct me.) ...

>  static void __migrate_swap_task(struct task_struct *p, int cpu)
>  {
>         __schedstat_inc(p->stats.numa_task_swapped);
> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
> +       if (p->mm)
> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);

... proper fix should likely guard this earlier, like the guard in
task_numa_fault() but for the other swapped task.
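
For reference, that guard looks roughly like this (quoting
kernel/sched/fair.c from memory):

	/* for example, ksmd faulting in a user's mm */
	if (!p->mm)
		return;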

Michal

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 17:46     ` Michal Koutný
@ 2025-05-05 18:27       ` Chen, Yu C
  2025-05-05 18:49         ` Libo Chen
  0 siblings, 1 reply; 15+ messages in thread
From: Chen, Yu C @ 2025-05-05 18:27 UTC (permalink / raw)
  To: Michal Koutný
  Cc: Jain, Ayush, Andrew Morton, Ingo Molnar, Tejun Heo,
	Johannes Weiner, Jonathan Corbet, Mel Gorman, Michal Hocko,
	Muchun Song, Roman Gushchin, Shakeel Butt, Chen, Tim C,
	Aubrey Li, Libo Chen, cgroups, linux-doc, linux-mm, linux-kernel,
	K Prateek Nayak, Madadi Vineeth Reddy, Neeraj.Upadhyay,
	Peter Zijlstra

Hi Michal,

On 5/6/2025 1:46 AM, Michal Koutný wrote:
> On Mon, May 05, 2025 at 11:03:10PM +0800, "Chen, Yu C" <yu.c.chen@intel.com> wrote:
>> According to this address,
>>     4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>>     49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
>> It seems that this task to be swapped has NULL mm_struct.
> 
> So it's likely a kernel thread. Does it make sense to NUMA balance
> those? (I naïvely think it doesn't, please correct me.) ...
> 

I agree that kernel threads are not supposed to be covered by
NUMA balancing, because currently NUMA balancing only considers
user pages via VMAs. One question below:

>>   static void __migrate_swap_task(struct task_struct *p, int cpu)
>>   {
>>          __schedstat_inc(p->stats.numa_task_swapped);
>> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>> +       if (p->mm)
>> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
> 
> ... proper fix should likely guard this earlier, like the guard in
> task_numa_fault() but for the other swapped task.
I see. Task swapping in task_numa_compare() is triggered
when there are no idle CPUs in task A's
preferred node.
In this case, we choose a task B on A's preferred node
and swap B with A. This helps improve A's NUMA locality
without introducing load imbalance between nodes.

But B's NUMA node preference is not mandatory in
the current implementation IIUC, because B's load is mainly
considered. That is to say, is it legitimate to swap a
NUMA-sensitive task A with a non-NUMA-sensitive kernel
thread B? If not, I think we can add a kernel-thread
check to task swapping, like the guard in
task_tick_numa()/task_numa_fault().

thanks,
Chenyu

> 
> Michal


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 18:27       ` Chen, Yu C
@ 2025-05-05 18:49         ` Libo Chen
  2025-05-05 21:32           ` Libo Chen
  0 siblings, 1 reply; 15+ messages in thread
From: Libo Chen @ 2025-05-05 18:49 UTC (permalink / raw)
  To: Chen, Yu C, Michal Koutný
  Cc: Jain, Ayush, Andrew Morton, Ingo Molnar, Tejun Heo,
	Johannes Weiner, Jonathan Corbet, Mel Gorman, Michal Hocko,
	Muchun Song, Roman Gushchin, Shakeel Butt, Chen, Tim C,
	Aubrey Li, cgroups, linux-doc, linux-mm, linux-kernel,
	K Prateek Nayak, Madadi Vineeth Reddy, Neeraj.Upadhyay,
	Peter Zijlstra



On 5/5/25 11:27, Chen, Yu C wrote:
> Hi Michal,
> 
> On 5/6/2025 1:46 AM, Michal Koutný wrote:
>> On Mon, May 05, 2025 at 11:03:10PM +0800, "Chen, Yu C" <yu.c.chen@intel.com> wrote:
>>> According to this address,
>>>     4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>>>     49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
>>> It seems that this task to be swapped has NULL mm_struct.
>>
>> So it's likely a kernel thread. Does it make sense to NUMA balance
>> those? (I naïvely think it doesn't, please correct me.) ...
>>
> 
> I agree that kernel threads are not supposed to be covered by
> NUMA balancing, because currently NUMA balancing only considers
> user pages via VMAs. One question below:
> 
>>>   static void __migrate_swap_task(struct task_struct *p, int cpu)
>>>   {
>>>          __schedstat_inc(p->stats.numa_task_swapped);
>>> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>> +       if (p->mm)
>>> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>
>> ... proper fix should likely guard this earlier, like the guard in
>> task_numa_fault() but for the other swapped task.
> I see. Task swapping in task_numa_compare() is triggered
> when there are no idle CPUs in task A's
> preferred node.
> In this case, we choose a task B on A's preferred node
> and swap B with A. This helps improve A's NUMA locality
> without introducing load imbalance between nodes.
> 
> But B's NUMA node preference is not mandatory in
> the current implementation IIUC, because B's load is mainly

hmm, that doesn't seem right. Can we choose a B that
is not a kthread from A's preferred node?

> considered. That is to say, is it legitimate to swap a
> NUMA-sensitive task A with a non-NUMA-sensitive kernel
> thread B? If not, I think we can add a kernel-thread
> check to task swapping, like the guard in
> task_tick_numa()/task_numa_fault().
> 


> thanks,
> Chenyu
> 
>>
>> Michal
> 



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 18:49         ` Libo Chen
@ 2025-05-05 21:32           ` Libo Chen
  2025-05-05 21:57             ` Libo Chen
  0 siblings, 1 reply; 15+ messages in thread
From: Libo Chen @ 2025-05-05 21:32 UTC (permalink / raw)
  To: Chen, Yu C, Michal Koutný
  Cc: Jain, Ayush, Andrew Morton, Ingo Molnar, Tejun Heo,
	Johannes Weiner, Jonathan Corbet, Mel Gorman, Michal Hocko,
	Muchun Song, Roman Gushchin, Shakeel Butt, Chen, Tim C,
	Aubrey Li, cgroups, linux-doc, linux-mm, linux-kernel,
	K Prateek Nayak, Madadi Vineeth Reddy, Neeraj.Upadhyay,
	Peter Zijlstra



On 5/5/25 11:49, Libo Chen wrote:
> 
> 
> On 5/5/25 11:27, Chen, Yu C wrote:
>> Hi Michal,
>>
>> On 5/6/2025 1:46 AM, Michal Koutný wrote:
>>> On Mon, May 05, 2025 at 11:03:10PM +0800, "Chen, Yu C" <yu.c.chen@intel.com> wrote:
>>>> According to this address,
>>>>     4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>>>>     49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
>>>> It seems that this task to be swapped has NULL mm_struct.
>>>
>>> So it's likely a kernel thread. Does it make sense to NUMA balance
>>> those? (I naïvely think it doesn't, please correct me.) ...
>>>
>>
>> I agree that kernel threads are not supposed to be covered by
>> NUMA balancing, because currently NUMA balancing only considers
>> user pages via VMAs. One question below:
>>
>>>>   static void __migrate_swap_task(struct task_struct *p, int cpu)
>>>>   {
>>>>          __schedstat_inc(p->stats.numa_task_swapped);
>>>> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>> +       if (p->mm)
>>>> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>
>>> ... proper fix should likely guard this earlier, like the guard in
>>> task_numa_fault() but for the other swapped task.
>> I see. Task swapping in task_numa_compare() is triggered
>> when there are no idle CPUs in task A's
>> preferred node.
>> In this case, we choose a task B on A's preferred node
>> and swap B with A. This helps improve A's NUMA locality
>> without introducing load imbalance between nodes.
>>
Hi Chenyu

There are two problems here:
1. Many kthreads are pinned, so despite all the efforts in task_numa_compare()
and task_numa_find_cpu(), the swap may not end up happening. I only see a
check on the source task, cpumask_test_cpu(cpu, env->p->cpus_ptr), but not on the dst task.
2. Assuming B is migratable, the swap can potentially make B worse, right? I think
some kthreads are quite cache-sensitive, and we swap as if their locality didn't
matter.

Ideally we probably just want to stay off kthreads: if we cannot find any other
p->mm tasks, just don't swap (?). That sounds like a brand new patch though.



Libo 
>> But B's NUMA node preference is not mandatory in
>> the current implementation IIUC, because B's load is mainly
> 
> hmm, that doesn't seem right. Can we choose a B that
> is not a kthread from A's preferred node?
> 
>> considered. That is to say, is it legitimate to swap a
>> NUMA-sensitive task A with a non-NUMA-sensitive kernel
>> thread B? If not, I think we can add a kernel-thread
>> check to task swapping, like the guard in
>> task_tick_numa()/task_numa_fault().
>>
> 
> 
>> thanks,
>> Chenyu
>>
>>>
>>> Michal
>>
> 



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 21:32           ` Libo Chen
@ 2025-05-05 21:57             ` Libo Chen
  2025-05-06  5:06               ` Jain, Ayush
  2025-05-06  5:36               ` Chen, Yu C
  0 siblings, 2 replies; 15+ messages in thread
From: Libo Chen @ 2025-05-05 21:57 UTC (permalink / raw)
  To: Chen, Yu C, Michal Koutný
  Cc: Jain, Ayush, Andrew Morton, Ingo Molnar, Tejun Heo,
	Johannes Weiner, Jonathan Corbet, Mel Gorman, Michal Hocko,
	Muchun Song, Roman Gushchin, Shakeel Butt, Chen, Tim C,
	Aubrey Li, cgroups, linux-doc, linux-mm, linux-kernel,
	K Prateek Nayak, Madadi Vineeth Reddy, Neeraj.Upadhyay,
	Peter Zijlstra



On 5/5/25 14:32, Libo Chen wrote:
> 
> 
> On 5/5/25 11:49, Libo Chen wrote:
>>
>>
>> On 5/5/25 11:27, Chen, Yu C wrote:
>>> Hi Michal,
>>>
>>> On 5/6/2025 1:46 AM, Michal Koutný wrote:
>>>> On Mon, May 05, 2025 at 11:03:10PM +0800, "Chen, Yu C" <yu.c.chen@intel.com> wrote:
>>>>> According to this address,
>>>>>     4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>>>>>     49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
>>>>> It seems that this task to be swapped has NULL mm_struct.
>>>>
>>>> So it's likely a kernel thread. Does it make sense to NUMA balance
>>>> those? (I naïvely think it doesn't, please correct me.) ...
>>>>
>>>
>>> I agree that kernel threads are not supposed to be covered by
>>> NUMA balancing, because currently NUMA balancing only considers
>>> user pages via VMAs. One question below:
>>>
>>>>>   static void __migrate_swap_task(struct task_struct *p, int cpu)
>>>>>   {
>>>>>          __schedstat_inc(p->stats.numa_task_swapped);
>>>>> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>>> +       if (p->mm)
>>>>> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>>
>>>> ... proper fix should likely guard this earlier, like the guard in
>>>> task_numa_fault() but for the other swapped task.
>>> I see. Task swapping in task_numa_compare() is
>>> triggered when there are no idle CPUs in task A's
>>> preferred node.
>>> In that case, we choose a task B on A's preferred node
>>> and swap B with A. This helps improve A's NUMA locality
>>> without introducing load imbalance between nodes.
>>>
> Hi Chenyu
> 
> There are two problems here:
> 1. Many kthreads are pinned, so despite all the effort in task_numa_compare()
> and task_numa_find_cpu(), the swap may not end up happening. I only see a
> check on the source task, cpumask_test_cpu(cpu, env->p->cpus_ptr), but not
> on the dst task.

NVM, I was blind. There is a check on the dst task in task_numa_compare().

> 2. Assuming B is migratable, the swap can potentially make B worse, right?
> I think some kthreads are quite cache-sensitive, and we swap them as if
> their locality doesn't matter.
> 
> Ideally we probably just want to stay off kthreads; if we cannot find any
> other p->mm tasks, just don't swap (?). That sounds like a brand-new patch
> though.
> 

A change as simple as this should work:

@@ -2492,7 +2492,7 @@ static bool task_numa_compare(struct task_numa_env *env,

        rcu_read_lock();
        cur = rcu_dereference(dst_rq->curr);
-       if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
+       if (cur && ((cur->flags & PF_EXITING) || !cur->mm || is_idle_task(cur)))
                cur = NULL;
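
For context: cur is whatever currently runs on the destination CPU,
and kernel threads never have a user mm, so the extra !cur->mm test
makes task_numa_compare() treat a kthread at the destination like an
idle or exiting task and never pick it as a swap target.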
                                                                                                   
> 
> 
> Libo 
>>> But B's NUMA node preference is not mandatory in
>>> the current implementation IIUC, because B's load is mainly
>>
>> hmm, that doesn't seem right. Can we choose a B that
>> is not a kthread from A's preferred node?
>>
>>> considered. That is to say, is it legitimate to swap a
>>> NUMA-sensitive task A with a non-NUMA-sensitive kernel
>>> thread B? If not, I think we can add a kernel-thread
>>> check to the task swap path, like the guard in
>>> task_tick_numa()/task_numa_fault().
>>>
>>
>>
>>> thanks,
>>> Chenyu
>>>
>>>>
>>>> Michal
>>>
>>
> 



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 21:57             ` Libo Chen
@ 2025-05-06  5:06               ` Jain, Ayush
  2025-05-06  5:36               ` Chen, Yu C
  1 sibling, 0 replies; 15+ messages in thread
From: Jain, Ayush @ 2025-05-06  5:06 UTC (permalink / raw)
  To: Libo Chen, Chen, Yu C, Michal Koutný
  Cc: Andrew Morton, Ingo Molnar, Tejun Heo, Johannes Weiner,
	Jonathan Corbet, Mel Gorman, Michal Hocko, Muchun Song,
	Roman Gushchin, Shakeel Butt, Chen, Tim C, Aubrey Li, cgroups,
	linux-doc, linux-mm, linux-kernel, K Prateek Nayak,
	Madadi Vineeth Reddy, Neeraj.Upadhyay, Peter Zijlstra



On 5/6/2025 3:27 AM, Libo Chen wrote:
> 
> 
> On 5/5/25 14:32, Libo Chen wrote:
>>
>>
>> On 5/5/25 11:49, Libo Chen wrote:
>>>
>>>
>>> On 5/5/25 11:27, Chen, Yu C wrote:
>>>> Hi Michal,
>>>>
>>>> On 5/6/2025 1:46 AM, Michal Koutný wrote:
>>>>> On Mon, May 05, 2025 at 11:03:10PM +0800, "Chen, Yu C" <yu.c.chen@intel.com> wrote:
>>>>>> According to this address,
>>>>>>     4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>>>>>>     49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
>>>>>> It seems that the task to be swapped has a NULL mm_struct.
>>>>>
>>>>> So it's likely a kernel thread. Does it make sense to NUMA balance
>>>>> those? (I naïvely think it doesn't, please correct me.) ...
>>>>>
>>>>
>>>> I agree kernel threads are not supposed to be covered by
>>>> NUMA balancing, because currently NUMA balancing only considers
>>>> user pages via VMAs. One question below:
>>>>
>>>>>>   static void __migrate_swap_task(struct task_struct *p, int cpu)
>>>>>>   {
>>>>>>          __schedstat_inc(p->stats.numa_task_swapped);
>>>>>> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>>>> +       if (p->mm)
>>>>>> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>>>
>>>>> ... proper fix should likely guard this earlier, like the guard in
>>>>> task_numa_fault() but for the other swapped task.
>>>> I see. Task swapping in task_numa_compare() is
>>>> triggered when there are no idle CPUs in task A's
>>>> preferred node.
>>>> In that case, we choose a task B on A's preferred node
>>>> and swap B with A. This helps improve A's NUMA locality
>>>> without introducing load imbalance between nodes.
>>>>
>> Hi Chenyu
>>
>> There are two problems here:
>> 1. Many kthreads are pinned, so despite all the effort in task_numa_compare()
>> and task_numa_find_cpu(), the swap may not end up happening. I only see a
>> check on the source task, cpumask_test_cpu(cpu, env->p->cpus_ptr), but not
>> on the dst task.
> 
> NVM, I was blind. There is a check on the dst task in task_numa_compare().
> 
>> 2. Assuming B is migratable, the swap can potentially make B worse, right?
>> I think some kthreads are quite cache-sensitive, and we swap them as if
>> their locality doesn't matter.
>>
>> Ideally we probably just want to stay off kthreads; if we cannot find any
>> other p->mm tasks, just don't swap (?). That sounds like a brand-new patch
>> though.
>>
> 
> A change as simple as this should work:
> 
> @@ -2492,7 +2492,7 @@ static bool task_numa_compare(struct task_numa_env *env,
> 
>         rcu_read_lock();
>         cur = rcu_dereference(dst_rq->curr);
> -       if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
> +       if (cur && ((cur->flags & PF_EXITING) || !cur->mm || is_idle_task(cur)))
>                 cur = NULL;
>

This fixes the reported regression.

Tested-by: Ayush Jain <Ayush.jain3@amd.com>

Thanks,
Ayush


>>
>>
>> Libo 
>>>> But B's NUMA node preference is not mandatory in
>>>> the current implementation IIUC, because B's load is mainly
>>>
>>> hmm, that doesn't seem right. Can we choose a B that
>>> is not a kthread from A's preferred node?
>>>
>>>> considered. That is to say, is it legitimate to swap a
>>>> NUMA-sensitive task A with a non-NUMA-sensitive kernel
>>>> thread B? If not, I think we can add a kernel-thread
>>>> check to the task swap path, like the guard in
>>>> task_tick_numa()/task_numa_fault().
>>>>
>>>
>>>
>>>> thanks,
>>>> Chenyu
>>>>
>>>>>
>>>>> Michal
>>>>
>>>
>>
> 



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 21:57             ` Libo Chen
  2025-05-06  5:06               ` Jain, Ayush
@ 2025-05-06  5:36               ` Chen, Yu C
  2025-05-06  7:03                 ` Libo Chen
  1 sibling, 1 reply; 15+ messages in thread
From: Chen, Yu C @ 2025-05-06  5:36 UTC (permalink / raw)
  To: Libo Chen
  Cc: Jain, Ayush, Andrew Morton, Ingo Molnar, Tejun Heo,
	Johannes Weiner, Jonathan Corbet, Mel Gorman, Michal Hocko,
	Muchun Song, Roman Gushchin, Shakeel Butt, Chen, Tim C,
	Aubrey Li, cgroups, linux-doc, linux-mm, linux-kernel,
	K Prateek Nayak, Madadi Vineeth Reddy, Neeraj.Upadhyay,
	Peter Zijlstra, Michal Koutný

On 5/6/2025 5:57 AM, Libo Chen wrote:
> 
> 
> On 5/5/25 14:32, Libo Chen wrote:
>>
>>
>> On 5/5/25 11:49, Libo Chen wrote:
>>>
>>>
>>> On 5/5/25 11:27, Chen, Yu C wrote:
>>>> Hi Michal,
>>>>
>>>> On 5/6/2025 1:46 AM, Michal Koutný wrote:
>>>>> On Mon, May 05, 2025 at 11:03:10PM +0800, "Chen, Yu C" <yu.c.chen@intel.com> wrote:
>>>>>> According to this address,
>>>>>>      4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>>>>>>      49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
>>>>>> It seems that the task to be swapped has a NULL mm_struct.
>>>>>
>>>>> So it's likely a kernel thread. Does it make sense to NUMA balance
>>>>> those? (I naïvely think it doesn't, please correct me.) ...
>>>>>
>>>>
>>>> I agree kernel threads are not supposed to be covered by
>>>> NUMA balancing, because currently NUMA balancing only considers
>>>> user pages via VMAs. One question below:
>>>>
>>>>>>    static void __migrate_swap_task(struct task_struct *p, int cpu)
>>>>>>    {
>>>>>>           __schedstat_inc(p->stats.numa_task_swapped);
>>>>>> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>>>> +       if (p->mm)
>>>>>> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>>>
>>>>> ... proper fix should likely guard this earlier, like the guard in
>>>>> task_numa_fault() but for the other swapped task.
>>>> I see. Task swapping in task_numa_compare() is
>>>> triggered when there are no idle CPUs in task A's
>>>> preferred node.
>>>> In that case, we choose a task B on A's preferred node
>>>> and swap B with A. This helps improve A's NUMA locality
>>>> without introducing load imbalance between nodes.
>>>>
>> Hi Chenyu
>>
>> There are two problems here:
>> 1. Many kthreads are pinned, so despite all the effort in task_numa_compare()
>> and task_numa_find_cpu(), the swap may not end up happening. I only see a
>> check on the source task, cpumask_test_cpu(cpu, env->p->cpus_ptr), but not
>> on the dst task.
> 
> NVM, I was blind. There is a check on the dst task in task_numa_compare().
> 
>> 2. Assuming B is migratable, the swap can potentially make B worse, right?
>> I think some kthreads are quite cache-sensitive, and we swap them as if
>> their locality doesn't matter.

This makes sense. I wonder if it could be extended beyond kthreads.
We don't want to swap a task B that has no explicit NUMA preference,
do we?

>>
>> Ideally we probably just want to stay off kthreads; if we cannot find any
>> other p->mm tasks, just don't swap (?). That sounds like a brand-new patch
>> though.
>>
> 
> A change as simple as this should work:
> 
> @@ -2492,7 +2492,7 @@ static bool task_numa_compare(struct task_numa_env *env,
> 
>          rcu_read_lock();
>          cur = rcu_dereference(dst_rq->curr);
> -       if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
> +       if (cur && ((cur->flags & PF_EXITING) || !cur->mm || is_idle_task(cur)))

something like:

if (cur && ((cur->flags & PF_EXITING) ||
            cur->numa_preferred_nid == NUMA_NO_NODE ||
            !cur->numa_faults || is_idle_task(cur)))
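
(For reference, and IIUC: numa_preferred_nid is initialized to
NUMA_NO_NODE and p->numa_faults is only allocated once a task starts
taking NUMA hinting faults, so a condition like this filters out
kthreads as well as user tasks that have not yet established a node
preference.)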

But overall it looks good to me. Would you like to post this as a
formal patch, or do you want me to fold your change into a patch set?

thanks,
Chenyu

>                  cur = NULL;
>


  

>>
>>
>> Libo
>>>> But B's NUMA node preference is not mandatory in
>>>> the current implementation IIUC, because B's load is mainly
>>>
>>> hmm, that doesn't seem right. Can we choose a B that
>>> is not a kthread from A's preferred node?
>>>
>>>> considered. That is to say, is it legitimate to swap a
>>>> NUMA-sensitive task A with a non-NUMA-sensitive kernel
>>>> thread B? If not, I think we can add a kernel-thread
>>>> check to the task swap path, like the guard in
>>>> task_tick_numa()/task_numa_fault().
>>>>
>>>
>>>
>>>> thanks,
>>>> Chenyu
>>>>
>>>>>
>>>>> Michal
>>>>
>>>
>>
> 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-06  5:36               ` Chen, Yu C
@ 2025-05-06  7:03                 ` Libo Chen
  0 siblings, 0 replies; 15+ messages in thread
From: Libo Chen @ 2025-05-06  7:03 UTC (permalink / raw)
  To: Chen, Yu C
  Cc: Jain, Ayush, Andrew Morton, Ingo Molnar, Tejun Heo,
	Johannes Weiner, Jonathan Corbet, Mel Gorman, Michal Hocko,
	Muchun Song, Roman Gushchin, Shakeel Butt, Chen, Tim C,
	Aubrey Li, cgroups, linux-doc, linux-mm, linux-kernel,
	K Prateek Nayak, Madadi Vineeth Reddy, Neeraj.Upadhyay,
	Peter Zijlstra, Michal Koutný



On 5/5/25 22:36, Chen, Yu C wrote:
> On 5/6/2025 5:57 AM, Libo Chen wrote:
>>
>>
>> On 5/5/25 14:32, Libo Chen wrote:
>>>
>>>
>>> On 5/5/25 11:49, Libo Chen wrote:
>>>>
>>>>
>>>> On 5/5/25 11:27, Chen, Yu C wrote:
>>>>> Hi Michal,
>>>>>
>>>>> On 5/6/2025 1:46 AM, Michal Koutný wrote:
>>>>>> On Mon, May 05, 2025 at 11:03:10PM +0800, "Chen, Yu C" <yu.c.chen@intel.com> wrote:
>>>>>>> According to this address,
>>>>>>>      4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>>>>>>>      49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
>>>>>>> It seems that the task to be swapped has a NULL mm_struct.
>>>>>>
>>>>>> So it's likely a kernel thread. Does it make sense to NUMA balance
>>>>>> those? (I naïvely think it doesn't, please correct me.) ...
>>>>>>
>>>>>
>>>>> I agree kernel threads are not supposed to be covered by
>>>>> NUMA balancing, because currently NUMA balancing only considers
>>>>> user pages via VMAs. One question below:
>>>>>
>>>>>>>    static void __migrate_swap_task(struct task_struct *p, int cpu)
>>>>>>>    {
>>>>>>>           __schedstat_inc(p->stats.numa_task_swapped);
>>>>>>> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>>>>> +       if (p->mm)
>>>>>>> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>>>>>
>>>>>> ... proper fix should likely guard this earlier, like the guard in
>>>>>> task_numa_fault() but for the other swapped task.
>>>>> I see. Task swapping in task_numa_compare() is
>>>>> triggered when there are no idle CPUs in task A's
>>>>> preferred node.
>>>>> In that case, we choose a task B on A's preferred node
>>>>> and swap B with A. This helps improve A's NUMA locality
>>>>> without introducing load imbalance between nodes.
>>>>>
>>> Hi Chenyu
>>>
>>> There are two problems here:
>>> 1. Many kthreads are pinned, so despite all the effort in task_numa_compare()
>>> and task_numa_find_cpu(), the swap may not end up happening. I only see a
>>> check on the source task, cpumask_test_cpu(cpu, env->p->cpus_ptr), but not
>>> on the dst task.
>>
>> NVM, I was blind. There is a check on the dst task in task_numa_compare().
>>
>>> 2. Assuming B is migratable, the swap can potentially make B worse, right?
>>> I think some kthreads are quite cache-sensitive, and we swap them as if
>>> their locality doesn't matter.
> 
> This makes sense. I wonder if it could be extended beyond kthreads.
> We don't want to swap a task B that has no explicit NUMA preference,
> do we?
> 

I agree, at least that should be the default behavior.

>>>
>>> Ideally we probably just want to stay off kthreads; if we cannot find any
>>> other p->mm tasks, just don't swap (?). That sounds like a brand-new patch
>>> though.
>>>
>>
>> A change as simple as this should work:
>>
>> @@ -2492,7 +2492,7 @@ static bool task_numa_compare(struct task_numa_env *env,
>>
>>          rcu_read_lock();
>>          cur = rcu_dereference(dst_rq->curr);
>> -       if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
>> +       if (cur && ((cur->flags & PF_EXITING) || !cur->mm || is_idle_task(cur)))
> 
> something like:
> 
> if (cur && ((cur->flags & PF_EXITING) ||
>             cur->numa_preferred_nid == NUMA_NO_NODE ||
>             !cur->numa_faults || is_idle_task(cur)))
> 

This implicitly skips kthreads, but it probably needs a comment. Otherwise LGTM.
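
For illustration, a commented version of that check might look like
this (a sketch, not a posted patch):

        rcu_read_lock();
        cur = rcu_dereference(dst_rq->curr);
        /*
         * Skip tasks that are exiting, idle, or have no NUMA
         * preference or fault history yet. numa_preferred_nid and
         * numa_faults are never set up for kernel threads, so this
         * implicitly keeps swapping away from kthreads as well.
         */
        if (cur && ((cur->flags & PF_EXITING) ||
                    cur->numa_preferred_nid == NUMA_NO_NODE ||
                    !cur->numa_faults || is_idle_task(cur)))
                cur = NULL;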

> But overall it looks good to me. Would you like to post this as a
> formal patch, or do you want me to fold your change into a patch set?
> 

You can fold it into one set.

Thanks,
Libo

> thanks,
> Chenyu
> 
>>                  cur = NULL;
>>
> 
> 
>  
> 
>>>
>>>
>>> Libo
>>>>> But B's NUMA node preference is not mandatory in
>>>>> the current implementation IIUC, because B's load is mainly
>>>>
>>>> hmm, that doesn't seem right. Can we choose a B that
>>>> is not a kthread from A's preferred node?
>>>>
>>>>> considered. That is to say, is it legitimate to swap a
>>>>> NUMA-sensitive task A with a non-NUMA-sensitive kernel
>>>>> thread B? If not, I think we can add a kernel-thread
>>>>> check to the task swap path, like the guard in
>>>>> task_tick_numa()/task_numa_fault().
>>>>>
>>>>
>>>>
>>>>> thanks,
>>>>> Chenyu
>>>>>
>>>>>>
>>>>>> Michal
>>>>>
>>>>
>>>
>>



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH v3] sched/numa: add statistics of numa balance task migration
  2025-05-05 17:25     ` Venkat Rao Bagalkote
@ 2025-05-07 11:36       ` Chen, Yu C
  0 siblings, 0 replies; 15+ messages in thread
From: Chen, Yu C @ 2025-05-07 11:36 UTC (permalink / raw)
  To: Venkat Rao Bagalkote
  Cc: Ingo Molnar, Tejun Heo, Johannes Weiner, Jonathan Corbet,
	Mel Gorman, Michal Hocko, Michal Koutny, Muchun Song,
	Roman Gushchin, Shakeel Butt, Chen, Tim C, Aubrey Li, Libo Chen,
	cgroups, linux-doc, linux-mm, linux-kernel, K Prateek Nayak,
	Madadi Vineeth Reddy, Neeraj.Upadhyay, Peter Zijlstra,
	Madhavan Srinivasan, Jain, Ayush, Andrew Morton


Hi Venkat,

On 5/6/2025 1:25 AM, Venkat Rao Bagalkote wrote:
> 
> On 05/05/25 8:33 pm, Chen, Yu C wrote:
>> On 5/5/2025 2:43 PM, Jain, Ayush wrote:
>>>
>>> Hello,
>>>
>>> Hitting a kernel panic on latest-next while running rcutorture tests:
>>>
>>> 37ff6e9a2ce3 ("Add linux-next specific files for 20250502")
>>>
>>> Reverting this patch fixes it:
>>> 3b2339eeb032 ("sched-numa-add-statistics-of-numa-balance-task-migration-v3")
>>> https://web.git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/kernel/sched/core.c?id=3b2339eeb032627e9329daf70a4ba8cd62c9cc8d
>>>
>>> Looking at the RIP pointer:
>>>
>>> $ ./scripts/faddr2line vmlinux __migrate_swap_task+0x2e/0x180
>>> __migrate_swap_task+0x2e/0x180:
>>> count_memcg_events_mm at include/linux/memcontrol.h:987
>>> (inlined by) count_memcg_events_mm at include/linux/memcontrol.h:978
>>> (inlined by) __migrate_swap_task at kernel/sched/core.c:3356
>>>
>>> memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
>>> mm->owner -> NULL
>>>
>>> Attaching kernel logs below:
>>>
>>> [ 1070.635450] rcu-torture: rcu_torture_read_exit: End of episode
>>> [ 1074.047617] BUG: kernel NULL pointer dereference, address:
>>> 0000000000000498
>>
>> Thanks Ayush,
>>
>> According to this address,
>>    4c 8b af 50 09 00 00    mov    0x950(%rdi),%r13  <--- r13 = p->mm;
>>    49 8b bd 98 04 00 00    mov    0x498(%r13),%rdi  <--- p->mm->owner
>> It seems that the task to be swapped has a NULL mm_struct.
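>>
>> In other words, the faulting load is the mm->owner dereference in
>> the memcg accounting helper; roughly (a simplified sketch of the
>> path faddr2line points at, not the exact memcontrol.h source):
>>
>>         /* count_memcg_event_mm(), simplified */
>>         rcu_read_lock();
>>         /* mm is NULL for kthreads: mm->owner (offset 0x498) faults */
>>         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
>>         if (likely(memcg))
>>                 count_memcg_events(memcg, idx, 1);
>>         rcu_read_unlock();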
>>
>> Does the following help?
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 96db6947bc92..0cb8cc4d551d 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -3353,7 +3353,8 @@ void set_task_cpu(struct task_struct *p, 
>> unsigned int new_cpu)
>>  static void __migrate_swap_task(struct task_struct *p, int cpu)
>>  {
>>         __schedstat_inc(p->stats.numa_task_swapped);
>> -       count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>> +       if (p->mm)
>> +               count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
>>
>>         if (task_on_rq_queued(p)) {
>>                 struct rq *src_rq, *dst_rq;
>>
> 
> Hello Chenyu,
> 
> 
> This issue is also reported on IBM Power servers.
> 
> The proposed fix works fine. Hence,
> 
> 
> Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
> 

Thank you for the test. Could you help check whether v4 works
without any NULL pointer dereference?

https://lore.kernel.org/lkml/cover.1746611892.git.yu.c.chen@intel.com/

thanks,
Chenyu


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2025-05-07 11:36 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-30 10:36 [PATCH v3] sched/numa: add statistics of numa balance task migration Chen Yu
2025-05-01  7:00 ` Libo Chen
2025-05-02  9:30   ` Chen, Yu C
2025-05-05  6:43 ` Jain, Ayush
2025-05-05 15:03   ` Chen, Yu C
2025-05-05 17:25     ` Venkat Rao Bagalkote
2025-05-07 11:36       ` Chen, Yu C
2025-05-05 17:46     ` Michal Koutný
2025-05-05 18:27       ` Chen, Yu C
2025-05-05 18:49         ` Libo Chen
2025-05-05 21:32           ` Libo Chen
2025-05-05 21:57             ` Libo Chen
2025-05-06  5:06               ` Jain, Ayush
2025-05-06  5:36               ` Chen, Yu C
2025-05-06  7:03                 ` Libo Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox