linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: "Kuehling, Felix" <Felix.Kuehling@amd.com>
To: Jason Gunthorpe <jgg@mellanox.com>,
	"dri-devel@lists.freedesktop.org"
	<dri-devel@lists.freedesktop.org>, Ben Goz <ben.goz@amd.com>,
	Oded Gabbay <oded.gabbay@amd.com>
Cc: Christoph Hellwig <hch@infradead.org>,
	"amd-gfx@lists.freedesktop.org" <amd-gfx@lists.freedesktop.org>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>
Subject: Re: [PATCH hmm] drm/amdkfd: fix a use after free race with mmu_notififer unregister
Date: Sun, 4 Aug 2019 21:28:36 +0000	[thread overview]
Message-ID: <c59ebe8b-9b18-24b8-b02c-8ccaa7df4dc9@amd.com> (raw)
In-Reply-To: <20190802200705.GA10110@ziepe.ca>

On 2019-08-02 16:07, Jason Gunthorpe wrote:
> When using mmu_notififer_unregister_no_release() the caller must ensure
> there is a SRCU synchronize before the mn memory is freed, otherwise use
> after free races are possible, for instance:
>
>       CPU0                                      CPU1
>                                        invalidate_range_start
>                                           hlist_for_each_entry_rcu(..)
>   mmu_notifier_unregister_no_release(&p->mn)
>   kfree(mn)
>                                        if (mn->ops->invalidate_range_end)
>
> The error unwind in amdkfd misses the SRCU synchronization.
>
> amdkfd keeps the kfd_process around until the mm is released, so split the
> flow to fully initialize the kfd_process and register it for find_process,
> and with the notifier. Past this point the kfd_process does not need to be
> cleaned up as it is fully ready.
>
> The final failable step does a vm_mmap() and does not seem to impact the
> kfd_process global state. Since it also cannot be undone (and already has
> problems with undo if it internally fails), it has to be last.
>
> This way we don't have to try to unwind the mmu_notifier_register() and
> avoid the problem with the SRCU.
>
> Along the way this also fixes various other error unwind bugs in the flow.
>
> Fixes: 45102048f77e ("amdkfd: Add process queue manager module")
> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 74 +++++++++++-------------
>   1 file changed, 35 insertions(+), 39 deletions(-)
>
> amdkfd folks, this little bug is blocking some rework I have for the
> mmu notifiers (ie mm/mmu_notifiers: remove unregister_no_release)
>
> Can I get your help to review and if needed polish this change? I'd
> like to send this patch through the hmm tree along with the rework,
> thanks

Thanks. That's a nice cleanup of the error handling during KFD process 
creation. One nit-pick inline, otherwise this looks good to me.


>
> You can see the larger series here:
>
> https://github.com/jgunthorpe/linux/commits/mmu_notifier
>
> Jason
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 8f1076c0c88a25..81e3ee3f1813bf 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -62,8 +62,8 @@ static struct workqueue_struct *kfd_restore_wq;
>   
>   static struct kfd_process *find_process(const struct task_struct *thread);
>   static void kfd_process_ref_release(struct kref *ref);
> -static struct kfd_process *create_process(const struct task_struct *thread,
> -					struct file *filep);
> +static struct kfd_process *create_process(const struct task_struct *thread);
> +static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
>   
>   static void evict_process_worker(struct work_struct *work);
>   static void restore_process_worker(struct work_struct *work);
> @@ -289,7 +289,15 @@ struct kfd_process *kfd_create_process(struct file *filep)
>   	if (process) {
>   		pr_debug("Process already found\n");
>   	} else {
> -		process = create_process(thread, filep);
> +		process = create_process(thread);
> +		if (IS_ERR(process))
> +			goto out;
> +
> +		ret = kfd_process_init_cwsr_apu(process, filep);
> +		if (ret) {
> +			process = ERR_PTR(ret);
> +			goto out;
> +		}
>   
>   		if (!procfs.kobj)
>   			goto out;
> @@ -609,64 +617,56 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
>   	return 0;
>   }
>   
> -static struct kfd_process *create_process(const struct task_struct *thread,
> -					struct file *filep)
> +/*
> + * On return the kfd_process is fully operational and will be freed when the
> + * mm is released
> + */
> +static struct kfd_process *create_process(const struct task_struct *thread)
>   {
>   	struct kfd_process *process;
>   	int err = -ENOMEM;
>   
>   	process = kzalloc(sizeof(*process), GFP_KERNEL);
> -
>   	if (!process)
>   		goto err_alloc_process;
>   
> -	process->pasid = kfd_pasid_alloc();
> -	if (process->pasid == 0)
> -		goto err_alloc_pasid;
> -
> -	if (kfd_alloc_process_doorbells(process) < 0)
> -		goto err_alloc_doorbells;
> -
>   	kref_init(&process->ref);
> -
>   	mutex_init(&process->mutex);
> -
>   	process->mm = thread->mm;
> -
> -	/* register notifier */
> -	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
> -	err = mmu_notifier_register(&process->mmu_notifier, process->mm);
> -	if (err)
> -		goto err_mmu_notifier;
> -
> -	hash_add_rcu(kfd_processes_table, &process->kfd_processes,
> -			(uintptr_t)process->mm);
> -
>   	process->lead_thread = thread->group_leader;
> -	get_task_struct(process->lead_thread);
> -
>   	INIT_LIST_HEAD(&process->per_device_data);
> -
> +	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
> +	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
> +	process->last_restore_timestamp = get_jiffies_64();
>   	kfd_event_init_process(process);
> +	process->is_32bit_user_mode = in_compat_syscall();
> +
> +	process->pasid = kfd_pasid_alloc();
> +	if (process->pasid == 0)
> +		goto err_alloc_pasid;
> +
> +	if (kfd_alloc_process_doorbells(process) < 0)
> +		goto err_alloc_doorbells;
>   
>   	err = pqm_init(&process->pqm, process);
>   	if (err != 0)
>   		goto err_process_pqm_init;
>   
>   	/* init process apertures*/
> -	process->is_32bit_user_mode = in_compat_syscall();
>   	err = kfd_init_apertures(process);
>   	if (err != 0)
>   		goto err_init_apertures;
>   
> -	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
> -	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
> -	process->last_restore_timestamp = get_jiffies_64();
> -
> -	err = kfd_process_init_cwsr_apu(process, filep);
> +	/* Must be last, have to use release destruction after this */
> +	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
> +	err = mmu_notifier_register(&process->mmu_notifier, process->mm);
>   	if (err)
>   		goto err_init_cwsr;

This label should be renamed to something like err_mmu_notifier. With 
that fixed this patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>

>   
> +	get_task_struct(process->lead_thread);
> +	hash_add_rcu(kfd_processes_table, &process->kfd_processes,
> +			(uintptr_t)process->mm);
> +
>   	return process;
>   
>   err_init_cwsr:
> @@ -675,15 +675,11 @@ static struct kfd_process *create_process(const struct task_struct *thread,
>   err_init_apertures:
>   	pqm_uninit(&process->pqm);
>   err_process_pqm_init:
> -	hash_del_rcu(&process->kfd_processes);
> -	synchronize_rcu();
> -	mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
> -err_mmu_notifier:
> -	mutex_destroy(&process->mutex);
>   	kfd_free_process_doorbells(process);
>   err_alloc_doorbells:
>   	kfd_pasid_free(process->pasid);
>   err_alloc_pasid:
> +	mutex_destroy(&process->mutex);
>   	kfree(process);
>   err_alloc_process:
>   	return ERR_PTR(err);

  reply	other threads:[~2019-08-04 21:28 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-02 20:07 Jason Gunthorpe
2019-08-04 21:28 ` Kuehling, Felix [this message]
2019-08-06  7:31   ` Christoph Hellwig
2019-08-06  8:33     ` Oded Gabbay

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c59ebe8b-9b18-24b8-b02c-8ccaa7df4dc9@amd.com \
    --to=felix.kuehling@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=ben.goz@amd.com \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=hch@infradead.org \
    --cc=jgg@mellanox.com \
    --cc=linux-mm@kvack.org \
    --cc=oded.gabbay@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox