linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Oren Laadan <orenl@librato.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@osdl.org>,
	containers@lists.linux-foundation.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-api@vger.kernel.org, Serge Hallyn <serue@us.ibm.com>,
	Dave Hansen <dave@linux.vnet.ibm.com>,
	Ingo Molnar <mingo@elte.hu>, "H. Peter Anvin" <hpa@zytor.com>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Pavel Emelyanov <xemul@openvz.org>,
	Alexey Dobriyan <adobriyan@gmail.com>,
	Oren Laadan <orenl@librato.com>,
	Oren Laadan <orenl@cs.columbia.edu>
Subject: [RFC v17][PATCH 28/60] c/r: support for zombie processes
Date: Wed, 22 Jul 2009 05:59:50 -0400	[thread overview]
Message-ID: <1248256822-23416-29-git-send-email-orenl@librato.com> (raw)
In-Reply-To: <1248256822-23416-1-git-send-email-orenl@librato.com>

During checkpoint, a zombie processes need only save p->comm,
p->state, p->exit_state, and p->exit_code.

During restart, zombie processes are created like all other
processes. They validate the saved exit_code restore p->comm
and p->exit_code. Then they call do_exit() instead of waking
up the next task in line.

But before, they place the @ctx in p->checkpoint_ctx, so that
only at exit time they will wake up the next task in line,
and drop the reference to the @ctx.

This provides the guarantee that when the coordinator's wait
completes, all normal tasks completed their restart, and all
zombie tasks are already zombified (as opposed to perhap only
becoming a zombie).

Changelog[v17]:
  - Validate t->exit_signal for both threads and leader
  - Skip zombies in most of may_checkpoint_task()
  - Save/restore t->pdeath_signal
  - Validate ->exit_signal and ->pdeath_signal

Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
---
 checkpoint/checkpoint.c        |   12 +++++--
 checkpoint/process.c           |   67 +++++++++++++++++++++++++++++++++++-----
 checkpoint/restart.c           |   40 +++++++++++++++++++++---
 include/linux/checkpoint.h     |    1 +
 include/linux/checkpoint_hdr.h |    1 +
 5 files changed, 104 insertions(+), 17 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 57f59de..fb14585 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -280,8 +280,8 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 
 	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
 
-	if (t->state == TASK_DEAD) {
-		pr_warning("c/r: task %d is TASK_DEAD\n", task_pid_vnr(t));
+	if (t->exit_state == EXIT_DEAD) {
+		pr_warning("c/r: task %d is EXIT_DEAD\n", task_pid_vnr(t));
 		return -EAGAIN;
 	}
 
@@ -291,6 +291,10 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 		return -EPERM;
 	}
 
+	/* zombies are cool (and also don't have nsproxy, below...) */
+	if (t->exit_state)
+		return 0;
+
 	/* verify that all tasks belongs to same freezer cgroup */
 	if (t != current && !in_same_cgroup_freezer(t, ctx->root_freezer)) {
 		__ckpt_write_err(ctx, "task %d (%s) not frozen (wrong cgroup)",
@@ -309,8 +313,8 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 	 * FIX: for now, disallow siblings of container init created
 	 * via CLONE_PARENT (unclear if they will remain possible)
 	 */
-	if (ctx->root_init && t != root && t->tgid != root->tgid &&
-	    t->real_parent == root->real_parent) {
+	if (ctx->root_init && t != root &&
+	    t->real_parent == root->real_parent && t->tgid != root->tgid) {
 		__ckpt_write_err(ctx, "task %d (%s) is sibling of root",
 				 task_pid_vnr(t), t->comm);
 		return -EINVAL;
diff --git a/checkpoint/process.c b/checkpoint/process.c
index a0bf344..a67c389 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -35,12 +35,18 @@ static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
 	h->state = t->state;
 	h->exit_state = t->exit_state;
 	h->exit_code = t->exit_code;
-	h->exit_signal = t->exit_signal;
 
-	h->set_child_tid = t->set_child_tid;
-	h->clear_child_tid = t->clear_child_tid;
+	if (t->exit_state) {
+		/* zombie - skip remaining state */
+		BUG_ON(t->exit_state != EXIT_ZOMBIE);
+	} else {
+		/* FIXME: save remaining relevant task_struct fields */
+		h->exit_signal = t->exit_signal;
+		h->pdeath_signal = t->pdeath_signal;
 
-	/* FIXME: save remaining relevant task_struct fields */
+		h->set_child_tid = t->set_child_tid;
+		h->clear_child_tid = t->clear_child_tid;
+	}
 
 	ret = ckpt_write_obj(ctx, &h->h);
 	ckpt_hdr_put(ctx, h);
@@ -169,6 +175,11 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
 	ckpt_debug("task %d\n", ret);
 	if (ret < 0)
 		goto out;
+
+	/* zombie - we're done here */
+	if (t->exit_state)
+		return 0;
+
 	ret = checkpoint_thread(ctx, t);
 	ckpt_debug("thread %d\n", ret);
 	if (ret < 0)
@@ -187,6 +198,19 @@ int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
  * Restart
  */
 
+static inline int valid_exit_code(int exit_code)
+{
+	if (exit_code >= 0x10000)
+		return 0;
+	if (exit_code & 0xff) {
+		if (exit_code & ~0xff)
+			return 0;
+		if (!valid_signal(exit_code & 0xff))
+			return 0;
+	}
+	return 1;
+}
+
 /* read the task_struct into the current task */
 static int restore_task_struct(struct ckpt_ctx *ctx)
 {
@@ -198,15 +222,37 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
 	if (IS_ERR(h))
 		return PTR_ERR(h);
 
+	ret = -EINVAL;
+	if (h->state == TASK_DEAD) {
+		if (h->exit_state != EXIT_ZOMBIE)
+			goto out;
+		if (!valid_exit_code(h->exit_code))
+			goto out;
+		t->exit_code = h->exit_code;
+	} else {
+		if (h->exit_code)
+			goto out;
+		if ((thread_group_leader(t) && !valid_signal(h->exit_signal)) ||
+		    (!thread_group_leader(t) && h->exit_signal != -1))
+			goto out;
+		if (!valid_signal(h->pdeath_signal))
+			goto out;
+
+		/* FIXME: restore remaining relevant task_struct fields */
+		t->exit_signal = h->exit_signal;
+		t->pdeath_signal = h->pdeath_signal;
+
+		t->set_child_tid = h->set_child_tid;
+		t->clear_child_tid = h->clear_child_tid;
+	}
+
 	memset(t->comm, 0, TASK_COMM_LEN);
 	ret = _ckpt_read_string(ctx, t->comm, TASK_COMM_LEN);
 	if (ret < 0)
 		goto out;
 
-	t->set_child_tid = h->set_child_tid;
-	t->clear_child_tid = h->clear_child_tid;
-
-	/* FIXME: restore remaining relevant task_struct fields */
+	/* return 1 for zombie, 0 otherwise */
+	ret = (h->state == TASK_DEAD ? 1 : 0);
  out:
 	ckpt_hdr_put(ctx, h);
 	return ret;
@@ -326,6 +372,11 @@ int restore_task(struct ckpt_ctx *ctx)
 	ckpt_debug("task %d\n", ret);
 	if (ret < 0)
 		goto out;
+
+	/* zombie - we're done here */
+	if (ret)
+		goto out;
+
 	ret = restore_thread(ctx);
 	ckpt_debug("thread %d\n", ret);
 	if (ret < 0)
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 65422e2..1b1f639 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -375,20 +375,17 @@ static inline void ckpt_notify_error(struct ckpt_ctx *ctx)
 	complete(&ctx->complete);
 }
 
-static int ckpt_activate_next(struct ckpt_ctx *ctx)
+int ckpt_activate_next(struct ckpt_ctx *ctx)
 {
 	struct task_struct *task;
-	int active;
 	pid_t pid;
 
-	active = ++ctx->active_pid;
-	if (active >= ctx->nr_pids) {
+	if (++ctx->active_pid >= ctx->nr_pids) {
 		complete(&ctx->complete);
 		return 0;
 	}
 
 	pid = get_active_pid(ctx);
-	ckpt_debug("active pid %d (%d < %d)\n", pid, active, ctx->nr_pids);
 
 	rcu_read_lock();
 	task = find_task_by_pid_ns(pid, ctx->root_nsproxy->pid_ns);
@@ -413,6 +410,8 @@ static int wait_task_active(struct ckpt_ctx *ctx)
 	ret = wait_event_interruptible(ctx->waitq,
 				       is_task_active(ctx, pid) ||
 				       ckpt_test_ctx_error(ctx));
+	ckpt_debug("active %d < %d (ret %d)\n",
+		   ctx->active_pid, ctx->nr_pids, ret);
 	if (!ret && ckpt_test_ctx_error(ctx)) {
 		force_sig(SIGKILL, current);
 		ret = -EBUSY;
@@ -468,6 +467,8 @@ static int do_restore_task(void)
 		return -EAGAIN;
 	}
 
+	current->flags |= PF_RESTARTING;
+
 	/* wait for our turn, do the restore, and tell next task in line */
 	ret = wait_task_active(ctx);
 	if (ret < 0)
@@ -477,6 +478,13 @@ static int do_restore_task(void)
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * zombie: we're done here; Save @ctx on task_struct, to be
+	 * used to ckpt_activate_next(), and released, from do_exit().
+	 */
+	if (ret)
+		do_exit(current->exit_code);
+
 	ret = ckpt_activate_next(ctx);
 	if (ret < 0)
 		goto out;
@@ -493,6 +501,7 @@ static int do_restore_task(void)
 		wake_up_all(&ctx->waitq);
 	}
 
+	current->flags &= ~PF_RESTARTING;
 	ckpt_ctx_put(ctx);
 	return ret;
 }
@@ -593,6 +602,7 @@ static int wait_all_tasks_finish(struct ckpt_ctx *ctx)
 
 	ret = wait_for_completion_interruptible(&ctx->complete);
 
+	ckpt_debug("final sync kflags %#lx\n", ctx->kflags);
 	if (ckpt_test_ctx_error(ctx))
 		ret = -EBUSY;
 	return ret;
@@ -820,3 +830,23 @@ long do_restart(struct ckpt_ctx *ctx, pid_t pid)
 
 	return ret;
 }
+
+/**
+ * exit_checkpoint - callback from do_exit to cleanup checkpoint state
+ * @tsk: terminating task
+ */
+void exit_checkpoint(struct task_struct *tsk)
+{
+	struct ckpt_ctx *ctx;
+
+	ctx = tsk->checkpoint_ctx;
+	tsk->checkpoint_ctx = NULL;
+
+	/* restarting zombies will acrivate next task in restart */
+	if (tsk->flags & PF_RESTARTING) {
+		if (ckpt_activate_next(ctx) < 0)
+			pr_warning("c/r: [%d] failed zombie exit\n", tsk->pid);
+	}
+
+	ckpt_ctx_put(ctx);
+}
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 44b692d..b6af5b9 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -85,6 +85,7 @@ extern long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
 extern long do_restart(struct ckpt_ctx *ctx, pid_t pid);
 
 /* task */
+extern int ckpt_activate_next(struct ckpt_ctx *ctx);
 extern int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t);
 extern int restore_task(struct ckpt_ctx *ctx);
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index c9a80dc..3f2db22 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -130,6 +130,7 @@ struct ckpt_hdr_task {
 	__u32 exit_state;
 	__u32 exit_code;
 	__u32 exit_signal;
+	__u32 pdeath_signal;
 
 	__u64 set_child_tid;
 	__u64 clear_child_tid;
-- 
1.6.0.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2009-07-22 10:10 UTC|newest]

Thread overview: 78+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-07-22  9:59 [RFC v17][PATCH 00/60] Kernel based checkpoint/restart Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 01/60] c/r: extend arch_setup_additional_pages() Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 02/60] x86: ptrace debugreg checks rewrite Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 03/60] c/r: break out new_user_ns() Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 04/60] c/r: split core function out of some set*{u,g}id functions Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 05/60] cgroup freezer: Fix buggy resume test for tasks frozen with cgroup freezer Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 06/60] cgroup freezer: Update stale locking comments Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 07/60] cgroup freezer: Add CHECKPOINTING state to safeguard container checkpoint Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 08/60] cgroup freezer: interface to freeze a cgroup from within the kernel Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 09/60] Namespaces submenu Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 10/60] c/r: make file_pos_read/write() public Oren Laadan
2009-07-23  2:33   ` KAMEZAWA Hiroyuki
2009-07-22  9:59 ` [RFC v17][PATCH 11/60] pids 1/7: Factor out code to allocate pidmap page Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 12/60] pids 2/7: Have alloc_pidmap() return actual error code Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 13/60] pids 3/7: Add target_pid parameter to alloc_pidmap() Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 14/60] pids 4/7: Add target_pids parameter to alloc_pid() Oren Laadan
2009-08-03 18:22   ` Serge E. Hallyn
2009-07-22  9:59 ` [RFC v17][PATCH 15/60] pids 5/7: Add target_pids parameter to copy_process() Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 16/60] pids 6/7: Define do_fork_with_pids() Oren Laadan
2009-08-03 18:26   ` Serge E. Hallyn
2009-08-04  8:37     ` Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 17/60] pids 7/7: Define clone_with_pids syscall Oren Laadan
2009-07-29  0:44   ` Sukadev Bhattiprolu
2009-07-22  9:59 ` [RFC v17][PATCH 18/60] c/r: create syscalls: sys_checkpoint, sys_restart Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 19/60] c/r: documentation Oren Laadan
2009-07-23 14:24   ` Serge E. Hallyn
2009-07-23 15:24     ` Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 20/60] c/r: basic infrastructure for checkpoint/restart Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 21/60] c/r: x86_32 support " Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 22/60] c/r: external checkpoint of a task other than ourself Oren Laadan
2009-07-22 17:52   ` Serge E. Hallyn
2009-07-23  4:32     ` Oren Laadan
2009-07-23 13:12       ` Serge E. Hallyn
2009-07-23 14:14         ` Oren Laadan
2009-07-23 14:54       ` Serge E. Hallyn
2009-07-23 14:47   ` Serge E. Hallyn
2009-07-23 15:33     ` Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 23/60] c/r: export functionality used in next patch for restart-blocks Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 24/60] c/r: restart-blocks Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 25/60] c/r: checkpoint multiple processes Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 26/60] c/r: restart " Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 27/60] c/r: introduce PF_RESTARTING, and skip notification on exit Oren Laadan
2009-07-22  9:59 ` Oren Laadan [this message]
2009-07-22  9:59 ` [RFC v17][PATCH 29/60] c/r: Save and restore the [compat_]robust_list member of the task struct Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 30/60] c/r: infrastructure for shared objects Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 31/60] c/r: detect resource leaks for whole-container checkpoint Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 32/60] c/r: introduce '->checkpoint()' method in 'struct file_operations' Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 33/60] c/r: dump open file descriptors Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 34/60] c/r: restore " Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 35/60] c/r: add generic '->checkpoint' f_op to ext fses Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 36/60] c/r: add generic '->checkpoint()' f_op to simple devices Oren Laadan
2009-07-22  9:59 ` [RFC v17][PATCH 37/60] c/r: introduce method '->checkpoint()' in struct vm_operations_struct Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 38/60] c/r: dump memory address space (private memory) Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 39/60] c/r: restore " Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 40/60] c/r: export shmem_getpage() to support shared memory Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 41/60] c/r: dump anonymous- and file-mapped- " Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 42/60] c/r: restore " Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 43/60] splice: export pipe/file-to-pipe/file functionality Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 44/60] c/r: support for open pipes Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 45/60] c/r: make ckpt_may_checkpoint_task() check each namespace individually Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 46/60] c/r: support for UTS namespace Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 47/60] deferqueue: generic queue to defer work Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 48/60] c/r (ipc): allow allocation of a desired ipc identifier Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 49/60] c/r: save and restore sysvipc namespace basics Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 50/60] c/r: support share-memory sysv-ipc Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 51/60] c/r: support message-queues sysv-ipc Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 52/60] c/r: support semaphore sysv-ipc Oren Laadan
2009-07-22 17:25   ` Cyrill Gorcunov
2009-07-23  3:46     ` Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 53/60] c/r: (s390): expose a constant for the number of words (CRs) Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 54/60] c/r: add CKPT_COPY() macro Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 55/60] c/r: define s390-specific checkpoint-restart code Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 56/60] c/r: clone_with_pids: define the s390 syscall Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 57/60] c/r: capabilities: define checkpoint and restore fns Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 58/60] c/r: checkpoint and restore task credentials Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 59/60] c/r: restore file->f_cred Oren Laadan
2009-07-22 10:00 ` [RFC v17][PATCH 60/60] c/r: checkpoint and restore (shared) task's sighand_struct Oren Laadan
2009-07-24 19:09 ` [RFC v17][PATCH 00/60] Kernel based checkpoint/restart Serge E. Hallyn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1248256822-23416-29-git-send-email-orenl@librato.com \
    --to=orenl@librato.com \
    --cc=adobriyan@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=containers@lists.linux-foundation.org \
    --cc=dave@linux.vnet.ibm.com \
    --cc=hpa@zytor.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mingo@elte.hu \
    --cc=orenl@cs.columbia.edu \
    --cc=serue@us.ibm.com \
    --cc=torvalds@osdl.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox