On 2023-12-11 10:56, Felix Kuehling wrote: > > On 2023-12-08 05:11, Christian König wrote: >> Am 07.12.23 um 20:14 schrieb Felix Kuehling: >>> >>> On 2023-12-05 17:20, Felix Kuehling wrote: >>>> Properly mark kfd_process->ef as __rcu and consistently access it with >>>> rcu_dereference_protected. >>>> >>>> Reported-by: kernel test robot >>>> Closes: >>>> https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/ >>>> Signed-off-by: Felix Kuehling >>> >>> ping. >>> >>> Christian, would you review this patch, please? >> >> Looks a bit suspicious, especially the rcu_dereference_protected() use. >> >> What is the static checker complaining about in the first place? > From > https://lore.kernel.org/oe-kbuild-all/202312052245.yFpBSgNH-lkp@intel.com/: > >>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: >>> sparse: incompatible types in comparison expression (different >>> address spaces):  >> >>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: > struct dma_fence [noderef] __rcu * >> > drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_process.c:1671:9: sparse: > struct dma_fence * ... >> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: > sparse: incompatible types in comparison expression (different address > spaces): >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: > sparse: struct dma_fence [noderef] __rcu * >> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: > struct dma_fence * >> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: > sparse: incompatible types in comparison expression (different address > spaces): >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: > sparse: struct dma_fence [noderef] __rcu * >> > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c:2765:36: sparse: > struct dma_fence * > > As far as I can tell, the reason is, that I'm using > dma_fence_get_rcu_safe and rcu_replace_pointer to get and update > kfd_process->ef, without annotating the fence pointers with __rcu. > This patch fixes that by marking kfd_process->ef as an __rcu pointer. > The only place that was dereferencing it directly was > kfd_process_wq_release, where I added rcu_dereference_protected. The > condition I'm using here is, that the process ref count is 0 at that > point, which means nobody else is referencing the process or this > fence pointer at the time. Hi Christian, We discussed offline that you think rcu_dereference_protected is not needed in the teardown function. After reading over rcupdate.h, I think a simpler alternative would be to use rcu_access_pointer, after a grace period to ensure there can be no more readers. Based on this comment in rcupdate.h: * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as is * the case in the context of the RCU callback that is freeing up the data, * or after a synchronize_rcu() returns. This can be useful when tearing * down multi-linked structures after a grace period has elapsed. However, * rcu_dereference_protected() is normally preferred for this use case. The last sentence sounds like rcu_dereference_protected should also be OK, though. Either way, it sounds like I need to add a synchronize_rcu call in any case, before freeing the fence. Do you agree with this proposal? Regards,   Felix > > Regards, >   Felix > > >> >> Regards, >> Christian. >> >>> >>> Thanks, >>>   Felix >>> >>> >>> >>>> --- >>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h       | 2 +- >>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++-- >>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            | 2 +- >>>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c         | 6 ++++-- >>>>   4 files changed, 8 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>> index f2e920734c98..20cb266dcedd 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>>> @@ -314,7 +314,7 @@ void >>>> amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem); >>>>   int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, >>>> struct amdgpu_bo *bo); >>>>     int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, >>>> -                        struct dma_fence **ef); >>>> +                        struct dma_fence __rcu **ef); >>>>   int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device >>>> *adev, >>>>                             struct kfd_vm_fault_info *info); >>>>   int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device >>>> *adev, int fd, >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>> index 7d91f99acb59..8ba6f6c8363d 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c >>>> @@ -2806,7 +2806,7 @@ static void >>>> amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) >>>>       put_task_struct(usertask); >>>>   } >>>>   -static void replace_eviction_fence(struct dma_fence **ef, >>>> +static void replace_eviction_fence(struct dma_fence __rcu **ef, >>>>                      struct dma_fence *new_ef) >>>>   { >>>>       struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true >>>> @@ -2841,7 +2841,7 @@ static void replace_eviction_fence(struct >>>> dma_fence **ef, >>>>    * 7.  Add fence to all PD and PT BOs. >>>>    * 8.  Unreserve all BOs >>>>    */ >>>> -int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >>>> dma_fence **ef) >>>> +int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct >>>> dma_fence __rcu **ef) >>>>   { >>>>       struct amdkfd_process_info *process_info = info; >>>>       struct amdgpu_vm *peer_vm; >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> index 45366b4ca976..5a24097a9f28 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> @@ -917,7 +917,7 @@ struct kfd_process { >>>>        * fence will be triggered during eviction and new one will >>>> be created >>>>        * during restore >>>>        */ >>>> -    struct dma_fence *ef; >>>> +    struct dma_fence __rcu *ef; >>>>         /* Work items for evicting and restoring BOs */ >>>>       struct delayed_work eviction_work; >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> index 71df51fcc1b0..14b11d61f8dd 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> @@ -1110,6 +1110,8 @@ static void kfd_process_wq_release(struct >>>> work_struct *work) >>>>   { >>>>       struct kfd_process *p = container_of(work, struct kfd_process, >>>>                            release_work); >>>> +    struct dma_fence *ef = rcu_dereference_protected(p->ef, >>>> +        kref_read(&p->ref) == 0); >>>>         kfd_process_dequeue_from_all_devices(p); >>>>       pqm_uninit(&p->pqm); >>>> @@ -1118,7 +1120,7 @@ static void kfd_process_wq_release(struct >>>> work_struct *work) >>>>        * destroyed. This allows any BOs to be freed without >>>>        * triggering pointless evictions or waiting for fences. >>>>        */ >>>> -    dma_fence_signal(p->ef); >>>> +    dma_fence_signal(ef); >>>>         kfd_process_remove_sysfs(p); >>>>   @@ -1127,7 +1129,7 @@ static void kfd_process_wq_release(struct >>>> work_struct *work) >>>>       svm_range_list_fini(p); >>>>         kfd_process_destroy_pdds(p); >>>> -    dma_fence_put(p->ef); >>>> +    dma_fence_put(ef); >>>>         kfd_event_free_process(p); >> >