linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
To: Davidlohr Bueso <dave@stgolabs.net>, Oleg Nesterov <oleg@redhat.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Linux Kernel Mailing List <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH v2] prctl: avoid using mmap_sem for exe_file serialization
Date: Wed, 25 Mar 2015 12:21:46 +0300	[thread overview]
Message-ID: <55127E2A.4040204@yandex-team.ru> (raw)
In-Reply-To: <1427247055.2412.23.camel@stgolabs.net>

On 25.03.2015 04:30, Davidlohr Bueso wrote:
> Oleg cleverly suggested using xchg() to set the new
> mm->exe_file instead of calling set_mm_exe_file()
> which requires some form of serialization -- mmap_sem
> in this case. For archs that do not have atomic rmw
> instructions we still fallback to a spinlock alternative,
> so this should always be safe.  As such, we only need the
> mmap_sem for looking up the backing vm_file, which can be
> done sharing the lock. Naturally, this means we need to
> manually deal with both the new and old file reference
> counting, and we need not worry about the MMF_EXE_FILE_CHANGED
> bits, which can probably be deleted in the future anyway.
>
> Suggested-by: Oleg Nesterov <oleg@redhat.com>
> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
> ---
> Changes from v1:
> 	- use rcu_dereference_raw()
> 	- comment lockless use in flush_old_exec()
>
>   fs/exec.c     |  6 ++++++
>   kernel/fork.c | 19 +++++++++++++------
>   kernel/sys.c  | 43 ++++++++++++++++++++++++++-----------------
>   3 files changed, 45 insertions(+), 23 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 314e8d8..02bfd98 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1082,7 +1082,13 @@ int flush_old_exec(struct linux_binprm * bprm)
>   	if (retval)
>   		goto out;
>
> +	/*
> +	 * Must be called _before_ exec_mmap() as bprm->mm is
> +	 * not visible until then. This also enables the update
> +	 * to be lockless.
> +	 */
>   	set_mm_exe_file(bprm->mm, bprm->file);
> +
>   	/*
>   	 * Release all of the old mmap stuff
>   	 */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3847f34..347f69c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -688,15 +688,22 @@ EXPORT_SYMBOL_GPL(mmput);
>    *
>    * This changes mm's executale file (shown as symlink /proc/[pid]/exe).
>    *
> - * Main users are mmput(), sys_execve() and sys_prctl(PR_SET_MM_MAP/EXE_FILE).
> - * Callers prevent concurrent invocations: in mmput() nobody alive left,
> - * in execve task is single-threaded, prctl holds mmap_sem exclusively.
> + * Main users are mmput() and sys_execve(). Callers prevent concurrent
> + * invocations: in mmput() nobody alive left, in execve task is single
> + * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the
> + * mm->exe_file, but does so without using set_mm_exe_file() in order
> + * to do avoid the need for any locks.
>    */
>   void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
>   {
> -	struct file *old_exe_file = rcu_dereference_protected(mm->exe_file,
> -			!atomic_read(&mm->mm_users) || current->in_execve ||
> -			lockdep_is_held(&mm->mmap_sem));
> +	struct file *old_exe_file;
> +
> +	/*
> +	 * It is safe to dereference the exe_file without RCU as
> +	 * this function is only called if nobody else can access
> +	 * this mm -- see comment above for justification.
> +	 */
> +	old_exe_file = rcu_dereference_raw(mm->exe_file);
>
>   	if (new_exe_file)
>   		get_file(new_exe_file);
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 3be3449..1da6b17 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1649,14 +1649,13 @@ SYSCALL_DEFINE1(umask, int, mask)
>   	return mask;
>   }
>
> -static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
> +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
>   {
>   	struct fd exe;
> +	struct file *old_exe, *exe_file;
>   	struct inode *inode;
>   	int err;
>
> -	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
> -
>   	exe = fdget(fd);
>   	if (!exe.file)
>   		return -EBADF;
> @@ -1680,15 +1679,22 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
>   	/*
>   	 * Forbid mm->exe_file change if old file still mapped.
>   	 */
> +	exe_file = get_mm_exe_file(mm);
>   	err = -EBUSY;
>   	if (mm->exe_file) {

exe_file not mm->exe_file

>   		struct vm_area_struct *vma;
>
> -		for (vma = mm->mmap; vma; vma = vma->vm_next)
> -			if (vma->vm_file &&
> -			    path_equal(&vma->vm_file->f_path,
> +		down_read(&mm->mmap_sem);
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (!vma->vm_file)
> +				continue;
> +			if (path_equal(&vma->vm_file->f_path,
>   				       &mm->exe_file->f_path))

same here

> -				goto exit;
> +				goto exit_err;
> +		}
> +
> +		up_read(&mm->mmap_sem);
> +		fput(exe_file);
>   	}
>
>   	/*
> @@ -1702,10 +1708,18 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
>   		goto exit;
>
>   	err = 0;
> -	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
> +	/* set the new file, lockless */
> +	get_file(exe.file);
> +	old_exe = xchg(&mm->exe_file, exe.file);
> +	if (old_exe)
> +		fput(old_exe);
>   exit:
>   	fdput(exe);
>   	return err;
> +exit_err:
> +	up_read(&mm->mmap_sem);
> +	fput(exe_file);
> +	goto exit;
>   }
>
>   #ifdef CONFIG_CHECKPOINT_RESTORE
> @@ -1840,10 +1854,9 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
>   		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
>   	}
>
> -	down_write(&mm->mmap_sem);
>   	if (prctl_map.exe_fd != (u32)-1)
> -		error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
> -	downgrade_write(&mm->mmap_sem);
> +		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
> +	down_read(&mm->mmap_sem);
>   	if (error)
>   		goto out;
>
> @@ -1909,12 +1922,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
>   	if (!capable(CAP_SYS_RESOURCE))
>   		return -EPERM;
>
> -	if (opt == PR_SET_MM_EXE_FILE) {
> -		down_write(&mm->mmap_sem);
> -		error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
> -		up_write(&mm->mmap_sem);
> -		return error;
> -	}
> +	if (opt == PR_SET_MM_EXE_FILE)
> +		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
>
>   	if (addr >= TASK_SIZE || addr < mmap_min_addr)
>   		return -EINVAL;
>


-- 
Konstantin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2015-03-25  9:21 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-03-20 14:47 [PATCH] mm: fix lockdep build in rcu-protected get_mm_exe_file() Konstantin Khlebnikov
2015-03-23 18:11 ` Davidlohr Bueso
2015-03-23 19:10   ` Oleg Nesterov
2015-03-24 14:38     ` Davidlohr Bueso
2015-03-24 17:13     ` Konstantin Khlebnikov
2015-03-24 18:10       ` Oleg Nesterov
2015-03-24 18:15         ` Konstantin Khlebnikov
2015-03-24 19:02           ` Oleg Nesterov
2015-03-25  1:30             ` [PATCH v2] prctl: avoid using mmap_sem for exe_file serialization Davidlohr Bueso
2015-03-25  9:21               ` Konstantin Khlebnikov [this message]
2015-03-25 10:42                 ` [PATCH v3] " Davidlohr Bueso
2015-03-25 11:08                   ` Konstantin Khlebnikov
2015-03-25 12:50                     ` Davidlohr Bueso
2015-03-25 12:53                   ` Oleg Nesterov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=55127E2A.4040204@yandex-team.ru \
    --to=khlebnikov@yandex-team.ru \
    --cc=akpm@linux-foundation.org \
    --cc=dave@stgolabs.net \
    --cc=koct9i@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=oleg@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox