linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Oren Laadan <orenl@librato.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@osdl.org>,
	containers@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-api@vger.kernel.org, Serge Hallyn <serue@us.ibm.com>,
	Ingo Molnar <mingo@elte.hu>, Pavel Emelyanov <xemul@openvz.org>,
	Matt Helsley <matthltc@us.ibm.com>
Subject: [PATCH v18 29/80] c/r: Save and restore the [compat_]robust_list member of the task struct
Date: Wed, 23 Sep 2009 19:51:09 -0400	[thread overview]
Message-ID: <1253749920-18673-30-git-send-email-orenl@librato.com> (raw)
In-Reply-To: <1253749920-18673-1-git-send-email-orenl@librato.com>

From: Matt Helsley <matthltc@us.ibm.com>

These lists record which futexes the task holds. To keep the overhead of
robust futexes low the list is kept in userspace. When the task exits the
kernel carefully walks these lists to recover held futexes that
other tasks may be attempting to acquire with FUTEX_WAIT.

Because they point to userspace memory that is saved/restored by
checkpoint/restart saving the list pointers themselves is safe.

While saving the pointers is safe during checkpoint, restart is tricky
because the robust futex ABI contains provisions for changes based on
checking the size of the list head. So we need to save the length of
the list head too in order to make sure that the kernel used during
restart is capable of handling that ABI. Since there is only one ABI
supported at the moment taking the list head's size is simple. Should
the ABI change we will need to use the same size as specified during
sys_set_robust_list() and hence some new means of determining the length
of this userspace structure in sys_checkpoint would be required.

Rather than rewrite the logic that checks and handles the ABI we reuse
sys_set_robust_list() by factoring out the body of the function and
calling it during restart.

Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
[orenl@cs.columbia.edu: move save/restore code to checkpoint/process.c]
---
 checkpoint/process.c           |   49 ++++++++++++++++++++++++++++++++++++++++
 include/linux/checkpoint_hdr.h |    5 ++++
 include/linux/compat.h         |    3 +-
 include/linux/futex.h          |    1 +
 kernel/futex.c                 |   19 +++++++++-----
 kernel/futex_compat.c          |   13 ++++++++--
 6 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/checkpoint/process.c b/checkpoint/process.c
index 62ae72d..2580b31 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -14,10 +14,57 @@
 #include <linux/sched.h>
 #include <linux/posix-timers.h>
 #include <linux/futex.h>
+#include <linux/compat.h>
 #include <linux/poll.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 
+
+#ifdef CONFIG_FUTEX
+static void save_task_robust_futex_list(struct ckpt_hdr_task *h,
+					struct task_struct *t)
+{
+	/*
+	 * These are __user pointers and thus can be saved without
+	 * the objhash.
+	 */
+	h->robust_futex_list = (unsigned long)t->robust_list;
+	h->robust_futex_head_len = sizeof(*t->robust_list);
+#ifdef CONFIG_COMPAT
+	h->compat_robust_futex_list = ptr_to_compat(t->compat_robust_list);
+	h->compat_robust_futex_head_len = sizeof(*t->compat_robust_list);
+#endif
+}
+
+static void restore_task_robust_futex_list(struct ckpt_hdr_task *h)
+{
+	/* Since we restore the memory map the address remains the same and
+	 * this is safe. This is the same as [compat_]sys_set_robust_list() */
+	if (h->robust_futex_list) {
+		struct robust_list_head __user *rfl;
+		rfl = (void __user *)(unsigned long) h->robust_futex_list;
+		do_set_robust_list(rfl, h->robust_futex_head_len);
+	}
+#ifdef CONFIG_COMPAT
+	if (h->compat_robust_futex_list) {
+		struct compat_robust_list_head __user *crfl;
+		crfl = compat_ptr(h->compat_robust_futex_list);
+		do_compat_set_robust_list(crfl, h->compat_robust_futex_head_len);
+	}
+#endif
+}
+#else /* !CONFIG_FUTEX */
+static inline void save_task_robust_futex_list(struct ckpt_hdr_task *h,
+					       struct task_struct *t)
+{
+}
+
+static inline void restore_task_robust_futex_list(struct ckpt_hdr_task *h)
+{
+}
+#endif /* CONFIG_FUTEX */
+
+
 /***********************************************************************
  * Checkpoint
  */
@@ -46,6 +93,7 @@ static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
 
 		h->set_child_tid = (unsigned long) t->set_child_tid;
 		h->clear_child_tid = (unsigned long) t->clear_child_tid;
+		save_task_robust_futex_list(h, t);
 	}
 
 	ret = ckpt_write_obj(ctx, &h->h);
@@ -250,6 +298,7 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
 			(int __user *) (unsigned long) h->set_child_tid;
 		t->clear_child_tid =
 			(int __user *) (unsigned long) h->clear_child_tid;
+		restore_task_robust_futex_list(h);
 	}
 
 	memset(t->comm, 0, TASK_COMM_LEN);
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 8ae3bbe..b56fe71 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -136,6 +136,11 @@ struct ckpt_hdr_task {
 
 	__u64 set_child_tid;
 	__u64 clear_child_tid;
+
+	__u32 compat_robust_futex_head_len;
+	__u32 compat_robust_futex_list; /* a compat __user ptr */
+	__u32 robust_futex_head_len;
+	__u64 robust_futex_list; /* a __user ptr */
 } __attribute__((aligned(8)));
 
 /* restart blocks */
diff --git a/include/linux/compat.h b/include/linux/compat.h
index af931ee..f444cf0 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -165,7 +165,8 @@ struct compat_robust_list_head {
 };
 
 extern void compat_exit_robust_list(struct task_struct *curr);
-
+extern long do_compat_set_robust_list(struct compat_robust_list_head __user *head,
+				      compat_size_t len);
 asmlinkage long
 compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
 			   compat_size_t len);
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 4326f81..2e126a9 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -185,6 +185,7 @@ union futex_key {
 #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
 
 #ifdef CONFIG_FUTEX
+extern long do_set_robust_list(struct robust_list_head __user *head, size_t len);
 extern void exit_robust_list(struct task_struct *curr);
 extern void exit_pi_state_list(struct task_struct *curr);
 extern int futex_cmpxchg_enabled;
diff --git a/kernel/futex.c b/kernel/futex.c
index def86c8..1ef7c8e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2278,13 +2278,7 @@ out:
  * the list. There can only be one such pending lock.
  */
 
-/**
- * sys_set_robust_list - set the robust-futex list head of a task
- * @head: pointer to the list-head
- * @len: length of the list-head, as userspace expects
- */
-SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
-		size_t, len)
+long do_set_robust_list(struct robust_list_head __user *head, size_t len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -2300,6 +2294,17 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
 }
 
 /**
+ * sys_set_robust_list - set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
+		size_t, len)
+{
+	return do_set_robust_list(head, len);
+}
+
+/**
  * sys_get_robust_list - get the robust-futex list head of a task
  * @pid: pid of the process [zero for current task]
  * @head_ptr: pointer to a list-head pointer, the kernel fills it in
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 2357165..5e1a169 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -114,9 +114,9 @@ void compat_exit_robust_list(struct task_struct *curr)
 	}
 }
 
-asmlinkage long
-compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
-			   compat_size_t len)
+long
+do_compat_set_robust_list(struct compat_robust_list_head __user *head,
+			  compat_size_t len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -130,6 +130,13 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
 }
 
 asmlinkage long
+compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
+			   compat_size_t len)
+{
+	return do_compat_set_robust_list(head, len);
+}
+
+asmlinkage long
 compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 			   compat_size_t __user *len_ptr)
 {
-- 
1.6.0.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2009-09-24  0:29 UTC|newest]

Thread overview: 101+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-09-23 23:50 [PATCH 00/80] Kernel based checkpoint/restart [v18] Oren Laadan
2009-09-23 23:50 ` [PATCH v18 01/80] c/r: extend arch_setup_additional_pages() Oren Laadan
2009-09-23 23:50 ` [PATCH v18 02/80] x86: ptrace debugreg checks rewrite Oren Laadan
2009-09-23 23:50 ` [PATCH v18 03/80] c/r: break out new_user_ns() Oren Laadan
2009-09-23 23:50 ` [PATCH v18 04/80] c/r: split core function out of some set*{u,g}id functions Oren Laadan
2009-09-23 23:50 ` [PATCH v18 05/80] cgroup freezer: Fix buggy resume test for tasks frozen with cgroup freezer Oren Laadan
2009-09-23 23:50 ` [PATCH v18 06/80] cgroup freezer: Update stale locking comments Oren Laadan
2009-09-23 23:50 ` [PATCH v18 07/80] cgroup freezer: Add CHECKPOINTING state to safeguard container checkpoint Oren Laadan
2009-09-23 23:50 ` [PATCH v18 08/80] cgroup freezer: interface to freeze a cgroup from within the kernel Oren Laadan
2009-09-23 23:50 ` [PATCH v18 09/80] Namespaces submenu Oren Laadan
2009-09-23 23:50 ` [PATCH v18 10/80] c/r: make file_pos_read/write() public Oren Laadan
2009-09-23 23:50 ` [PATCH v18 11/80] pids 1/7: Factor out code to allocate pidmap page Oren Laadan
2009-09-23 23:50 ` [PATCH v18 12/80] pids 2/7: Have alloc_pidmap() return actual error code Oren Laadan
2009-09-23 23:50 ` [PATCH v18 13/80] pids 3/7: Add target_pid parameter to alloc_pidmap() Oren Laadan
2009-09-23 23:50 ` [PATCH v18 14/80] pids 4/7: Add target_pids parameter to alloc_pid() Oren Laadan
2009-09-23 23:50 ` [PATCH v18 15/80] pids 5/7: Add target_pids parameter to copy_process() Oren Laadan
2009-09-23 23:50 ` [PATCH v18 16/80] pids 6/7: Define do_fork_with_pids() Oren Laadan
2009-09-23 23:50 ` [PATCH v18 17/80] pids 7/7: Define clone_with_pids syscall Oren Laadan
2009-09-23 23:50 ` [PATCH v18 18/80] c/r: create syscalls: sys_checkpoint, sys_restart Oren Laadan
2009-09-23 23:50 ` [PATCH v18 19/80] c/r: documentation Oren Laadan
2009-09-23 23:51 ` [PATCH v18 20/80] c/r: basic infrastructure for checkpoint/restart Oren Laadan
2009-09-24 16:03   ` Daniel Walker
2009-09-24 17:33     ` Oren Laadan
2009-09-23 23:51 ` [PATCH v18 21/80] c/r: x86_32 support " Oren Laadan
2009-09-23 23:51 ` [PATCH v18 22/80] c/r: external checkpoint of a task other than ourself Oren Laadan
2009-09-23 23:51 ` [PATCH v18 23/80] c/r: export functionality used in next patch for restart-blocks Oren Laadan
2009-09-23 23:51 ` [PATCH v18 24/80] c/r: restart-blocks Oren Laadan
2009-09-23 23:51 ` [PATCH v18 25/80] c/r: checkpoint multiple processes Oren Laadan
2009-09-23 23:51 ` [PATCH v18 26/80] c/r: restart " Oren Laadan
2009-09-23 23:51 ` [PATCH v18 27/80] c/r: introduce PF_RESTARTING, and skip notification on exit Oren Laadan
2009-09-23 23:51 ` [PATCH v18 28/80] c/r: support for zombie processes Oren Laadan
2009-09-23 23:51 ` Oren Laadan [this message]
2009-09-23 23:51 ` [PATCH v18 30/80] c/r: infrastructure for shared objects Oren Laadan
2009-09-23 23:51 ` [PATCH v18 31/80] c/r: detect resource leaks for whole-container checkpoint Oren Laadan
2009-09-23 23:51 ` [PATCH v18 32/80] deferqueue: generic queue to defer work Oren Laadan
2009-09-23 23:51 ` [PATCH v18 33/80] c/r: introduce new 'file_operations': ->checkpoint, ->collect() Oren Laadan
2009-09-23 23:51 ` [PATCH v18 34/80] c/r: dump open file descriptors Oren Laadan
2009-09-23 23:51 ` [PATCH v18 35/80] c/r: restore " Oren Laadan
2009-09-23 23:51 ` [PATCH v18 36/80] c/r: introduce method '->checkpoint()' in struct vm_operations_struct Oren Laadan
2009-09-23 23:51 ` [PATCH v18 37/80] c/r: dump memory address space (private memory) Oren Laadan
2009-09-23 23:51 ` [PATCH v18 38/80] c/r: restore " Oren Laadan
2009-09-23 23:51 ` [PATCH v18 39/80] c/r: add generic '->checkpoint' f_op to ext fses Oren Laadan
2009-09-23 23:51 ` [PATCH v18 40/80] c/r: add generic '->checkpoint()' f_op to simple devices Oren Laadan
2009-09-23 23:51 ` [PATCH v18 41/80] Add the checkpoint operation for opened files of generic filesystems Oren Laadan
2009-09-23 23:51 ` [PATCH v18 42/80] c/r: export shmem_getpage() to support shared memory Oren Laadan
2009-09-23 23:51 ` [PATCH v18 43/80] c/r: dump anonymous- and file-mapped- " Oren Laadan
2009-09-23 23:51 ` [PATCH v18 44/80] c/r: restore " Oren Laadan
2009-09-23 23:51 ` [PATCH v18 45/80] splice: export pipe/file-to-pipe/file functionality Oren Laadan
2009-09-23 23:51 ` [PATCH v18 46/80] c/r: support for open pipes Oren Laadan
2009-09-23 23:51 ` [PATCH v18 47/80] c/r: checkpoint and restore FIFOs Oren Laadan
2009-09-23 23:51 ` [PATCH v18 48/80] c/r: make ckpt_may_checkpoint_task() check each namespace individually Oren Laadan
2009-09-23 23:51 ` [PATCH v18 49/80] c/r: support for UTS namespace Oren Laadan
2009-09-29 18:13   ` Nikita V. Youshchenko
2009-09-29 18:51     ` Serge E. Hallyn
2009-09-29 23:01     ` Oren Laadan
2009-09-23 23:51 ` [PATCH v18 50/80] c/r (ipc): allow allocation of a desired ipc identifier Oren Laadan
2009-09-23 23:51 ` [PATCH v18 51/80] c/r: save and restore sysvipc namespace basics Oren Laadan
2009-09-23 23:51 ` [PATCH v18 52/80] c/r: support share-memory sysv-ipc Oren Laadan
2009-09-23 23:51 ` [PATCH v18 53/80] c/r: support message-queues sysv-ipc Oren Laadan
2009-09-23 23:51 ` [PATCH v18 54/80] c/r: support semaphore sysv-ipc Oren Laadan
2009-09-23 23:51 ` [PATCH v18 55/80] c/r: (s390): expose a constant for the number of words (CRs) Oren Laadan
2009-09-23 23:51 ` [PATCH v18 56/80] c/r: add CKPT_COPY() macro Oren Laadan
2009-09-23 23:51 ` [PATCH v18 57/80] c/r: define s390-specific checkpoint-restart code Oren Laadan
2009-09-23 23:51 ` [PATCH v18 58/80] c/r: clone_with_pids: define the s390 syscall Oren Laadan
2009-09-23 23:51 ` [PATCH v18 59/80] c/r: capabilities: define checkpoint and restore fns Oren Laadan
2009-09-23 23:51 ` [PATCH v18 60/80] c/r: checkpoint and restore task credentials Oren Laadan
2009-09-23 23:51 ` [PATCH v18 61/80] c/r: restore file->f_cred Oren Laadan
2009-09-23 23:51 ` [PATCH v18 62/80] c/r: checkpoint and restore (shared) task's sighand_struct Oren Laadan
2009-09-23 23:51 ` [PATCH v18 63/80] c/r: [signal 1/4] blocked and template for shared signals Oren Laadan
2009-09-23 23:51 ` [PATCH v18 64/80] c/r: [signal 2/4] checkpoint/restart of rlimit Oren Laadan
2009-09-23 23:51 ` [PATCH v18 65/80] c/r: [signal 3/4] pending signals (private, shared) Oren Laadan
2009-09-23 23:51 ` [PATCH v18 66/80] c/r: [signal 4/4] support for real/virt/prof itimers Oren Laadan
2009-09-23 23:51 ` [PATCH v18 67/80] Expose may_setuid() in user.h and add may_setgid() (v2) Oren Laadan
2009-09-23 23:51 ` [PATCH v18 68/80] Add common socket helpers to unify the security hooks Oren Laadan
2009-09-23 23:51 ` [PATCH v18 69/80] c/r: introduce checkpoint/restore methods to struct proto_ops Oren Laadan
2009-09-23 23:51 ` [PATCH v18 70/80] c/r: Add AF_UNIX support (v12) Oren Laadan
2009-09-23 23:51 ` [PATCH v18 71/80] c/r: [pty 1/2] allow allocation of desired pty slave Oren Laadan
2009-09-23 23:51 ` [PATCH v18 72/80] c/r: [pty 2/2] support for pseudo terminals Oren Laadan
2009-09-23 23:51 ` [PATCH v18 73/80] c/r: correctly restore pgid Oren Laadan
2009-09-23 23:51 ` [PATCH v18 74/80] c/r: support for controlling terminal and job control Oren Laadan
2009-09-23 23:51 ` [PATCH v18 75/80] powerpc: reserve checkpoint arch identifiers Oren Laadan
2009-09-23 23:51 ` [PATCH v18 76/80] powerpc: provide APIs for validating and updating DABR Oren Laadan
2009-09-23 23:51 ` [PATCH v18 77/80] powerpc: checkpoint/restart implementation Oren Laadan
2009-09-23 23:51 ` [PATCH v18 78/80] powerpc: wire up checkpoint and restart syscalls Oren Laadan
2009-09-23 23:51 ` [PATCH v18 79/80] powerpc: enable checkpoint support in Kconfig Oren Laadan
2009-09-23 23:52 ` [PATCH v18 80/80] powerpc: clone_with_pids implementation Oren Laadan
2009-09-24 13:05 ` [PATCH 00/80] Kernel based checkpoint/restart [v18] Rishikesh
2009-09-24 16:27   ` Dan Smith
2009-09-24 18:20     ` Rishikesh
2009-09-24 22:41 ` Andrew Morton
2009-09-25 13:59   ` Dan Smith
2009-09-26 14:56     ` Ingo Molnar
2009-09-28 16:37   ` Serge E. Hallyn
2009-09-29 13:29     ` Daniel Lezcano
2009-09-30 14:39       ` Dan Smith
2009-09-30 16:16         ` Daniel Lezcano
2009-09-30 16:29           ` Dan Smith
2009-09-30 18:01             ` Daniel Lezcano
2009-09-30 18:28               ` Dan Smith
2009-09-30 22:08                 ` Daniel Lezcano
2009-09-29 18:31   ` Oren Laadan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1253749920-18673-30-git-send-email-orenl@librato.com \
    --to=orenl@librato.com \
    --cc=akpm@linux-foundation.org \
    --cc=containers@lists.linux-foundation.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=matthltc@us.ibm.com \
    --cc=mingo@elte.hu \
    --cc=serue@us.ibm.com \
    --cc=torvalds@osdl.org \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox