linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] process_vm_readv/writev: add flags for pidfd and nowait
@ 2025-11-18 13:23 Alban Crequy
  2025-11-20 13:16 ` Christian Brauner
  2025-12-02 10:45 ` David Hildenbrand (Red Hat)
  0 siblings, 2 replies; 3+ messages in thread
From: Alban Crequy @ 2025-11-18 13:23 UTC (permalink / raw)
  To: Andrew Morton, David Hildenbrand, Christian Brauner
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, linux-kernel,
	linux-mm, Alban Crequy, Alban Crequy, Peter Xu, Willy Tarreau,
	mfriese

From: Alban Crequy <albancrequy@microsoft.com>

- PROCESS_VM_PIDFD: refer to the remote process via PID file descriptor
  instead of PID. Such a file descriptor can be obtained with
  pidfd_open(2).

- PROCESS_VM_NOWAIT: do not block on IO if the memory access causes a
  page fault.

If a given flag is unsupported, the syscall returns the error EINVAL
without checking the buffers. This gives a way to userspace to detect
whether the current kernel supports a specific flag:

  process_vm_readv(pid, NULL, 1, NULL, 1, PROCESS_VM_PIDFD)
  -> EINVAL if the kernel does not support the flag PROCESS_VM_PIDFD
     (before this patch)
  -> EFAULT if the kernel supports the flag (after this patch)

Signed-off-by: Alban Crequy <albancrequy@microsoft.com>
---
 MAINTAINERS                     |  1 +
 include/uapi/linux/process_vm.h |  9 +++++++++
 mm/process_vm_access.c          | 20 +++++++++++++++-----
 3 files changed, 25 insertions(+), 5 deletions(-)
 create mode 100644 include/uapi/linux/process_vm.h

diff --git a/MAINTAINERS b/MAINTAINERS
index e64b94e6b5a9..91b4647cf761 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16272,6 +16272,7 @@ F:	include/linux/pgtable.h
 F:	include/linux/ptdump.h
 F:	include/linux/vmpressure.h
 F:	include/linux/vmstat.h
+F:	include/uapi/linux/process_vm.h
 F:	kernel/fork.c
 F:	mm/Kconfig
 F:	mm/debug.c
diff --git a/include/uapi/linux/process_vm.h b/include/uapi/linux/process_vm.h
new file mode 100644
index 000000000000..4168e09f3f4e
--- /dev/null
+++ b/include/uapi/linux/process_vm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_PROCESS_VM_H
+#define _UAPI_LINUX_PROCESS_VM_H
+
+/* Flags for process_vm_readv/process_vm_writev */
+#define PROCESS_VM_PIDFD        (1UL << 0)
+#define PROCESS_VM_NOWAIT       (1UL << 1)
+
+#endif /* _UAPI_LINUX_PROCESS_VM_H */
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 656d3e88755b..b5eac870ef24 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -14,6 +14,7 @@
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
+#include <linux/process_vm.h>
 
 /**
  * process_vm_rw_pages - read/write pages from task specified
@@ -68,6 +69,7 @@ static int process_vm_rw_pages(struct page **pages,
  * @mm: mm for task
  * @task: task to read/write from
  * @vm_write: 0 means copy from, 1 means copy to
+ * @pvm_flags: PROCESS_VM_* flags
  * Returns 0 on success or on failure error code
  */
 static int process_vm_rw_single_vec(unsigned long addr,
@@ -76,7 +78,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
 				    struct page **process_pages,
 				    struct mm_struct *mm,
 				    struct task_struct *task,
-				    int vm_write)
+				    int vm_write,
+				    unsigned int pvm_flags)
 {
 	unsigned long pa = addr & PAGE_MASK;
 	unsigned long start_offset = addr - pa;
@@ -91,6 +94,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
 
 	if (vm_write)
 		flags |= FOLL_WRITE;
+	if (pvm_flags & PROCESS_VM_NOWAIT)
+		flags |= FOLL_NOWAIT;
 
 	while (!rc && nr_pages && iov_iter_count(iter)) {
 		int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES);
@@ -141,7 +146,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
  * @iter: where to copy to/from locally
  * @rvec: iovec array specifying where to copy to/from in the other process
  * @riovcnt: size of rvec array
- * @flags: currently unused
+ * @flags: process_vm_readv/writev flags
  * @vm_write: 0 if reading from other process, 1 if writing to other process
  *
  * Returns the number of bytes read/written or error code. May
@@ -163,6 +168,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 	unsigned long nr_pages_iov;
 	ssize_t iov_len;
 	size_t total_len = iov_iter_count(iter);
+	unsigned int f_flags;
 
 	/*
 	 * Work out how many pages of struct pages we're going to need
@@ -194,7 +200,11 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 	}
 
 	/* Get process information */
-	task = find_get_task_by_vpid(pid);
+	if (flags & PROCESS_VM_PIDFD)
+		task = pidfd_get_task(pid, &f_flags);
+	else
+		task = find_get_task_by_vpid(pid);
+
 	if (!task) {
 		rc = -ESRCH;
 		goto free_proc_pages;
@@ -215,7 +225,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 	for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
 		rc = process_vm_rw_single_vec(
 			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
-			iter, process_pages, mm, task, vm_write);
+			iter, process_pages, mm, task, vm_write, flags);
 
 	/* copied = space before - space after */
 	total_len -= iov_iter_count(iter);
@@ -266,7 +276,7 @@ static ssize_t process_vm_rw(pid_t pid,
 	ssize_t rc;
 	int dir = vm_write ? ITER_SOURCE : ITER_DEST;
 
-	if (flags != 0)
+	if (flags & ~(PROCESS_VM_NOWAIT | PROCESS_VM_PIDFD))
 		return -EINVAL;
 
 	/* Check iovecs */
-- 
2.45.0



^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] process_vm_readv/writev: add flags for pidfd and nowait
  2025-11-18 13:23 [PATCH] process_vm_readv/writev: add flags for pidfd and nowait Alban Crequy
@ 2025-11-20 13:16 ` Christian Brauner
  2025-12-02 10:45 ` David Hildenbrand (Red Hat)
  1 sibling, 0 replies; 3+ messages in thread
From: Christian Brauner @ 2025-11-20 13:16 UTC (permalink / raw)
  To: Alban Crequy, Oleg Nesterov
  Cc: Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
	Liam R . Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, linux-kernel, linux-mm,
	Alban Crequy, Peter Xu, Willy Tarreau, mfriese

On Tue, Nov 18, 2025 at 02:23:48PM +0100, Alban Crequy wrote:
> From: Alban Crequy <albancrequy@microsoft.com>
> 
> - PROCESS_VM_PIDFD: refer to the remote process via PID file descriptor
>   instead of PID. Such a file descriptor can be obtained with
>   pidfd_open(2).
> 
> - PROCESS_VM_NOWAIT: do not block on IO if the memory access causes a
>   page fault.
> 
> If a given flag is unsupported, the syscall returns the error EINVAL
> without checking the buffers. This gives a way to userspace to detect
> whether the current kernel supports a specific flag:
> 
>   process_vm_readv(pid, NULL, 1, NULL, 1, PROCESS_VM_PIDFD)
>   -> EINVAL if the kernel does not support the flag PROCESS_VM_PIDFD
>      (before this patch)
>   -> EFAULT if the kernel supports the flag (after this patch)
> 
> Signed-off-by: Alban Crequy <albancrequy@microsoft.com>
> ---

Looks good to me overall,
Reviewed-by: Christian Brauner <brauner@kernel.org>

I do want to note an inconsistency though we should address - unrelated
to this patchset.

pidfd_get_task() supports PIDFD_SELF_THREAD and PIDFD_SELF_THREAD_GROUP.
This means it allows to operate on an individual thread and not the
thread-group leader.

This is highly relevant for permission checking as stuff like
ptrace_may_access() (via mm_access()) takes the creds of the task into
account and individual threads may have different creds than the
thread-group leader.

So suddently process_madvise(), process_mrelease(), and with this patch
now also process_vm_readv() can operate on individual threads...

except that's not completely true as they only allow to operate on the
individual thread of the caller. Arbitrary non-constant pidfds will
enforce that it's a thread-group leader that was specified and otherwise
return NULL...

We should make this consistent:

(1) Enforce thread-group leader based operations only
(2) Support threads for arbitrary pidfds

Someone needs to tell me what is wanted or needed and then I can fix
pidfd_get_task() accordingly.

The current state is not actively harmful afaict but it's inconsistent.

>  MAINTAINERS                     |  1 +
>  include/uapi/linux/process_vm.h |  9 +++++++++
>  mm/process_vm_access.c          | 20 +++++++++++++++-----
>  3 files changed, 25 insertions(+), 5 deletions(-)
>  create mode 100644 include/uapi/linux/process_vm.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e64b94e6b5a9..91b4647cf761 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -16272,6 +16272,7 @@ F:	include/linux/pgtable.h
>  F:	include/linux/ptdump.h
>  F:	include/linux/vmpressure.h
>  F:	include/linux/vmstat.h
> +F:	include/uapi/linux/process_vm.h
>  F:	kernel/fork.c
>  F:	mm/Kconfig
>  F:	mm/debug.c
> diff --git a/include/uapi/linux/process_vm.h b/include/uapi/linux/process_vm.h
> new file mode 100644
> index 000000000000..4168e09f3f4e
> --- /dev/null
> +++ b/include/uapi/linux/process_vm.h
> @@ -0,0 +1,9 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_PROCESS_VM_H
> +#define _UAPI_LINUX_PROCESS_VM_H
> +
> +/* Flags for process_vm_readv/process_vm_writev */
> +#define PROCESS_VM_PIDFD        (1UL << 0)
> +#define PROCESS_VM_NOWAIT       (1UL << 1)
> +
> +#endif /* _UAPI_LINUX_PROCESS_VM_H */
> diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
> index 656d3e88755b..b5eac870ef24 100644
> --- a/mm/process_vm_access.c
> +++ b/mm/process_vm_access.c
> @@ -14,6 +14,7 @@
>  #include <linux/ptrace.h>
>  #include <linux/slab.h>
>  #include <linux/syscalls.h>
> +#include <linux/process_vm.h>
>  
>  /**
>   * process_vm_rw_pages - read/write pages from task specified
> @@ -68,6 +69,7 @@ static int process_vm_rw_pages(struct page **pages,
>   * @mm: mm for task
>   * @task: task to read/write from
>   * @vm_write: 0 means copy from, 1 means copy to
> + * @pvm_flags: PROCESS_VM_* flags
>   * Returns 0 on success or on failure error code
>   */
>  static int process_vm_rw_single_vec(unsigned long addr,
> @@ -76,7 +78,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
>  				    struct page **process_pages,
>  				    struct mm_struct *mm,
>  				    struct task_struct *task,
> -				    int vm_write)
> +				    int vm_write,
> +				    unsigned int pvm_flags)
>  {
>  	unsigned long pa = addr & PAGE_MASK;
>  	unsigned long start_offset = addr - pa;
> @@ -91,6 +94,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
>  
>  	if (vm_write)
>  		flags |= FOLL_WRITE;
> +	if (pvm_flags & PROCESS_VM_NOWAIT)
> +		flags |= FOLL_NOWAIT;
>  
>  	while (!rc && nr_pages && iov_iter_count(iter)) {
>  		int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES);
> @@ -141,7 +146,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
>   * @iter: where to copy to/from locally
>   * @rvec: iovec array specifying where to copy to/from in the other process
>   * @riovcnt: size of rvec array
> - * @flags: currently unused
> + * @flags: process_vm_readv/writev flags
>   * @vm_write: 0 if reading from other process, 1 if writing to other process
>   *
>   * Returns the number of bytes read/written or error code. May
> @@ -163,6 +168,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
>  	unsigned long nr_pages_iov;
>  	ssize_t iov_len;
>  	size_t total_len = iov_iter_count(iter);
> +	unsigned int f_flags;
>  
>  	/*
>  	 * Work out how many pages of struct pages we're going to need
> @@ -194,7 +200,11 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
>  	}
>  
>  	/* Get process information */
> -	task = find_get_task_by_vpid(pid);
> +	if (flags & PROCESS_VM_PIDFD)
> +		task = pidfd_get_task(pid, &f_flags);
> +	else
> +		task = find_get_task_by_vpid(pid);
> +
>  	if (!task) {
>  		rc = -ESRCH;
>  		goto free_proc_pages;
> @@ -215,7 +225,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
>  	for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
>  		rc = process_vm_rw_single_vec(
>  			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
> -			iter, process_pages, mm, task, vm_write);
> +			iter, process_pages, mm, task, vm_write, flags);
>  
>  	/* copied = space before - space after */
>  	total_len -= iov_iter_count(iter);
> @@ -266,7 +276,7 @@ static ssize_t process_vm_rw(pid_t pid,
>  	ssize_t rc;
>  	int dir = vm_write ? ITER_SOURCE : ITER_DEST;
>  
> -	if (flags != 0)
> +	if (flags & ~(PROCESS_VM_NOWAIT | PROCESS_VM_PIDFD))
>  		return -EINVAL;
>  
>  	/* Check iovecs */
> -- 
> 2.45.0
> 


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] process_vm_readv/writev: add flags for pidfd and nowait
  2025-11-18 13:23 [PATCH] process_vm_readv/writev: add flags for pidfd and nowait Alban Crequy
  2025-11-20 13:16 ` Christian Brauner
@ 2025-12-02 10:45 ` David Hildenbrand (Red Hat)
  1 sibling, 0 replies; 3+ messages in thread
From: David Hildenbrand (Red Hat) @ 2025-12-02 10:45 UTC (permalink / raw)
  To: Alban Crequy, Andrew Morton, Christian Brauner
  Cc: Lorenzo Stoakes, Liam R . Howlett, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, linux-kernel,
	linux-mm, Alban Crequy, Peter Xu, Willy Tarreau, mfriese

On 11/18/25 14:23, Alban Crequy wrote:
> From: Alban Crequy <albancrequy@microsoft.com>
> 

Hi,


subject: would probably better be something like:

  "mm/process_vm_access: pidfd and nowait support for process_vm_readv/writev"

> - PROCESS_VM_PIDFD: refer to the remote process via PID file descriptor
>    instead of PID. Such a file descriptor can be obtained with
>    pidfd_open(2).
> 
> - PROCESS_VM_NOWAIT: do not block on IO if the memory access causes a
>    page fault.
> 
> If a given flag is unsupported, the syscall returns the error EINVAL
> without checking the buffers. This gives a way to userspace to detect
> whether the current kernel supports a specific flag:
> 
>    process_vm_readv(pid, NULL, 1, NULL, 1, PROCESS_VM_PIDFD)
>    -> EINVAL if the kernel does not support the flag PROCESS_VM_PIDFD
>       (before this patch)
>    -> EFAULT if the kernel supports the flag (after this patch)
> 

You should not just describe what you are doing, but also why you are doing it.

What is the problem you are trying to solve in user space where you would
require these extensions?

> Signed-off-by: Alban Crequy <albancrequy@microsoft.com>
> ---
>   MAINTAINERS                     |  1 +
>   include/uapi/linux/process_vm.h |  9 +++++++++
>   mm/process_vm_access.c          | 20 +++++++++++++++-----
>   3 files changed, 25 insertions(+), 5 deletions(-)
>   create mode 100644 include/uapi/linux/process_vm.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e64b94e6b5a9..91b4647cf761 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -16272,6 +16272,7 @@ F:	include/linux/pgtable.h
>   F:	include/linux/ptdump.h
>   F:	include/linux/vmpressure.h
>   F:	include/linux/vmstat.h
> +F:	include/uapi/linux/process_vm.h
>   F:	kernel/fork.c
>   F:	mm/Kconfig
>   F:	mm/debug.c
> diff --git a/include/uapi/linux/process_vm.h b/include/uapi/linux/process_vm.h
> new file mode 100644
> index 000000000000..4168e09f3f4e
> --- /dev/null
> +++ b/include/uapi/linux/process_vm.h
> @@ -0,0 +1,9 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_PROCESS_VM_H
> +#define _UAPI_LINUX_PROCESS_VM_H
> +
> +/* Flags for process_vm_readv/process_vm_writev */
> +#define PROCESS_VM_PIDFD        (1UL << 0)
> +#define PROCESS_VM_NOWAIT       (1UL << 1)

Can we use BIT() here? I see it getting used in other uapi headers.

> +
> +#endif /* _UAPI_LINUX_PROCESS_VM_H */
> diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
> index 656d3e88755b..b5eac870ef24 100644
> --- a/mm/process_vm_access.c
> +++ b/mm/process_vm_access.c
> @@ -14,6 +14,7 @@
>   #include <linux/ptrace.h>
>   #include <linux/slab.h>
>   #include <linux/syscalls.h>
> +#include <linux/process_vm.h>
>   
>   /**
>    * process_vm_rw_pages - read/write pages from task specified
> @@ -68,6 +69,7 @@ static int process_vm_rw_pages(struct page **pages,
>    * @mm: mm for task
>    * @task: task to read/write from
>    * @vm_write: 0 means copy from, 1 means copy to
> + * @pvm_flags: PROCESS_VM_* flags
>    * Returns 0 on success or on failure error code
>    */
>   static int process_vm_rw_single_vec(unsigned long addr,
> @@ -76,7 +78,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
>   				    struct page **process_pages,
>   				    struct mm_struct *mm,
>   				    struct task_struct *task,
> -				    int vm_write)
> +				    int vm_write,
> +				    unsigned int pvm_flags)

"unsigned int" but you define the flags as unsigned long above. Applies to all
occurrences.

We ran into interesting issued with such inconsistency with clone-flags recently :)

>   {
>   	unsigned long pa = addr & PAGE_MASK;
>   	unsigned long start_offset = addr - pa;
> @@ -91,6 +94,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
>   
>   	if (vm_write)
>   		flags |= FOLL_WRITE;
> +	if (pvm_flags & PROCESS_VM_NOWAIT)
> +		flags |= FOLL_NOWAIT;
>   
>   	while (!rc && nr_pages && iov_iter_count(iter)) {
>   		int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES);
> @@ -141,7 +146,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
>    * @iter: where to copy to/from locally
>    * @rvec: iovec array specifying where to copy to/from in the other process
>    * @riovcnt: size of rvec array
> - * @flags: currently unused
> + * @flags: process_vm_readv/writev flags
>    * @vm_write: 0 if reading from other process, 1 if writing to other process
>    *
>    * Returns the number of bytes read/written or error code. May
> @@ -163,6 +168,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
>   	unsigned long nr_pages_iov;
>   	ssize_t iov_len;
>   	size_t total_len = iov_iter_count(iter);
> +	unsigned int f_flags;
>   
>   	/*
>   	 * Work out how many pages of struct pages we're going to need
> @@ -194,7 +200,11 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
>   	}
>   
>   	/* Get process information */
> -	task = find_get_task_by_vpid(pid);
> +	if (flags & PROCESS_VM_PIDFD)
> +		task = pidfd_get_task(pid, &f_flags);
> +	else
> +		task = find_get_task_by_vpid(pid);
> +
>   	if (!task) {
>   		rc = -ESRCH;
>   		goto free_proc_pages;
> @@ -215,7 +225,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
>   	for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
>   		rc = process_vm_rw_single_vec(
>   			(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
> -			iter, process_pages, mm, task, vm_write);
> +			iter, process_pages, mm, task, vm_write, flags);
>   
>   	/* copied = space before - space after */
>   	total_len -= iov_iter_count(iter);
> @@ -266,7 +276,7 @@ static ssize_t process_vm_rw(pid_t pid,
>   	ssize_t rc;
>   	int dir = vm_write ? ITER_SOURCE : ITER_DEST;
>   
> -	if (flags != 0)
> +	if (flags & ~(PROCESS_VM_NOWAIT | PROCESS_VM_PIDFD))

Likely we want that wrapped in a new define like

#define PROCESS_VM_SUPPORTED_FLAGS (PROCESS_VM_NOWAIT | PROCESS_VM_PIDFD)

(likely not in the uapi header but in a kernel-internal one)

-- 
Cheers

David


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-12-02 10:45 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-18 13:23 [PATCH] process_vm_readv/writev: add flags for pidfd and nowait Alban Crequy
2025-11-20 13:16 ` Christian Brauner
2025-12-02 10:45 ` David Hildenbrand (Red Hat)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox