Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- criu/cgroup.c 140
-rw-r--r-- criu/cr-restore.c 19
-rw-r--r-- criu/include/cgroup.h 2
-rw-r--r-- criu/include/namespaces.h 17
-rw-r--r-- criu/include/restorer.h 2
-rw-r--r-- criu/include/servicefd.h 1
-rw-r--r-- criu/namespaces.c 65
-rw-r--r-- criu/pie/restorer.c 107
8 files changed, 319 insertions, 34 deletions
diff --git a/criu/cgroup.c b/criu/cgroup.c
index b238b6402..918827d99 100644
--- a/criu/cgroup.c
+++ b/criu/cgroup.c
@@ -8,6 +8,7 @@
#include <ftw.h>
#include <libgen.h>
#include <sched.h>
+#include <sys/wait.h>
#include "common/list.h"
#include "xmalloc.h"
@@ -55,6 +56,7 @@ static u32 cg_set_ids = 1;
static LIST_HEAD(cgroups);
static unsigned int n_cgroups;
+static pid_t cgroupd_pid;
static CgSetEntry *find_rst_set_by_id(u32 id)
{
@@ -1935,6 +1937,136 @@ static int prepare_cgroup_sfd(CgroupEntry *ce)
return 0;
}
+/*
+ * cgroupd - helper daemon that fixes up the cgroups of restored threads.
+ *
+ * If a thread is in a different cgroup set than the main thread of its
+ * process, it means it is in a threaded controller. Such a thread sends
+ * its cg_set number over @sk; the daemon obtains the sender's thread id
+ * with unsc_msg_pid_fd() and, for every threaded controller of that set,
+ * calls userns_move() on <controller dir>/<path>/cgroup.threads. It then
+ * replies with a message whose credentials carry the same thread id.
+ *
+ * The loop only ends on failure (receive, lookup, path, move or send),
+ * in which case -1 is returned.
+ */
+static int cgroupd(int sk)
+{
+	pr_info("cgroupd: Daemon started\n");
+
+	while (1) {
+		struct unsc_msg um;
+		uns_call_t call;
+		pid_t tid;
+		int fd, cg_set, i;
+		CgSetEntry *cg_set_entry;
+		int ret;
+
+		/* The second payload slot of the request carries the cg_set id. */
+		unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL);
+		ret = recvmsg(sk, &um.h, 0);
+		if (ret <= 0) {
+			pr_perror("cgroupd: recv req error");
+			return -1;
+		}
+
+		unsc_msg_pid_fd(&um, &tid, &fd);
+		pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set);
+
+		cg_set_entry = find_rst_set_by_id(cg_set);
+		if (!cg_set_entry) {
+			pr_err("cgroupd: No set found %d\n", cg_set);
+			return -1;
+		}
+
+		for (i = 0; i < cg_set_entry->n_ctls; i++) {
+			int j, aux_off, len;
+			CgMemberEntry *ce = cg_set_entry->ctls[i];
+			char aux[PATH_MAX];
+			CgControllerEntry *ctrl = NULL;
+
+			/* Find the controller entry that this set member belongs to. */
+			for (j = 0; j < n_controllers; j++) {
+				CgControllerEntry *cur = controllers[j];
+				if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) {
+					ctrl = cur;
+					break;
+				}
+			}
+
+			if (!ctrl) {
+				pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path);
+				return -1;
+			}
+
+			/*
+			 * This is not a threaded controller, all threads in this
+			 * process must be in this controller. Main thread has been
+			 * restored, so this thread is in this controller already.
+			 */
+			if (!ctrl->is_threaded)
+				continue;
+
+			aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
+			len = snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path);
+			/*
+			 * A truncated path would name some other file; refuse it
+			 * rather than hand it to userns_move().
+			 */
+			if (len < 0 || (size_t)len >= sizeof(aux) - aux_off) {
+				pr_err("cgroupd: Path to cgroup.threads of %s/%s is too long\n", ce->name, ce->path);
+				return -1;
+			}
+
+			/*
+			 * Cgroupd runs outside of the namespaces so we don't
+			 * need to use userns_call here
+			 */
+			if (userns_move(aux, 0, tid)) {
+				pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path);
+				return -1;
+			}
+		}
+
+		/*
+		 * We only want to send the cred which contains thread id back.
+		 * The restored thread recvmsg(MSG_PEEK) until it gets its own
+		 * thread id.
+		 */
+		unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid);
+		if (sendmsg(sk, &um.h, 0) <= 0) {
+			pr_perror("cgroupd: send req error");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Stop the cgroupd helper daemon, if one is running.
+ *
+ * Sends SIGTERM to the daemon and reaps it while SIGCHLD is blocked, so
+ * that its exit does not trigger sigchld_handler(). Always returns 0.
+ */
+int stop_cgroupd(void)
+{
+	sigset_t blockmask, oldmask;
+
+	/*
+	 * Only a positive pid names the daemon. start_unix_cred_daemon()
+	 * stores the fork() result in the pid, i.e. -1 on failure, and
+	 * kill() with -1 or 0 would signal every process we are allowed
+	 * to signal or our whole process group.
+	 */
+	if (cgroupd_pid <= 0)
+		return 0;
+
+	/*
+	 * Block the SIGCHLD signal to avoid triggering
+	 * sigchld_handler()
+	 */
+	sigemptyset(&blockmask);
+	sigaddset(&blockmask, SIGCHLD);
+	sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
+
+	kill(cgroupd_pid, SIGTERM);
+	waitpid(cgroupd_pid, NULL, 0);
+	/* The child is reaped: forget the pid so it is never signalled again. */
+	cgroupd_pid = 0;
+
+	sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+	return 0;
+}
+
+/*
+ * Start cgroupd and install the socket connected to it as the CGROUPD_SK
+ * service fd.
+ *
+ * Returns 0 on success and -1 on failure. On failure cgroupd_pid is reset
+ * to 0, so that nothing later signals a pid that is not the daemon.
+ */
+static int prepare_cgroup_thread_sfd(void)
+{
+	int sk;
+
+	sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd);
+	if (sk < 0) {
+		pr_err("failed to start cgroupd\n");
+		/* A failed fork() leaves -1 in the pid; never keep that around. */
+		cgroupd_pid = 0;
+		return -1;
+	}
+
+	if (install_service_fd(CGROUPD_SK, sk) < 0) {
+		kill(cgroupd_pid, SIGKILL);
+		waitpid(cgroupd_pid, NULL, 0);
+		/* Already reaped: the pid may be recycled by another process. */
+		cgroupd_pid = 0;
+		return -1;
+	}
+
+	return 0;
+}
+
static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot)
{
size_t dirlen = strlen(*dir_name);
@@ -2089,15 +2221,19 @@ int prepare_cgroup(void)
n_controllers = ce->n_controllers;
controllers = ce->controllers;
- if (n_sets)
+ if (n_sets) {
/*
* We rely on the fact that all sets contain the same
* set of controllers. This is checked during dump
* with cg_set_compare(CGCMP_ISSUB) call.
*/
ret = prepare_cgroup_sfd(ce);
- else
+ if (ret < 0)
+ return ret;
+ ret = prepare_cgroup_thread_sfd();
+ } else {
ret = 0;
+ }
return ret;
}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index d7d3d8edb..78f2a9701 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1349,7 +1349,12 @@ static inline int fork_with_pid(struct pstree_item *item)
return -1;
item->pid->state = ca.core->tc->task_state;
- rsti(item)->cg_set = ca.core->tc->cg_set;
+
+ /* Zombie task's cg_set is stored in task_core */
+ if (item->pid->state == TASK_DEAD)
+ rsti(item)->cg_set = ca.core->tc->cg_set;
+ else
+ rsti(item)->cg_set = ca.core->thread_core->cg_set;
if (ca.core->tc->has_stop_signo)
item->pid->stop_signo = ca.core->tc->stop_signo;
@@ -2376,6 +2381,10 @@ skip_ns_bouncing:
if (ret < 0)
goto out_kill;
+ ret = stop_cgroupd();
+ if (ret < 0)
+ goto out_kill;
+
ret = move_veth_to_bridge();
if (ret < 0)
goto out_kill;
@@ -3812,6 +3821,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
core_get_tls(tcore, &thread_args[i].tls);
+ if (rsti(current)->cg_set != tcore->thread_core->cg_set) {
+ thread_args[i].cg_set = tcore->thread_core->cg_set;
+ thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK));
+ } else {
+ thread_args[i].cg_set = -1;
+ }
+
ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core);
if (ret)
goto err;
@@ -3906,6 +3922,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
close_service_fd(USERNSD_SK);
close_service_fd(FDSTORE_SK_OFF);
close_service_fd(RPC_SK_OFF);
+ close_service_fd(CGROUPD_SK);
__gcov_flush();
diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h
index 5a254559d..93f61539c 100644
--- a/criu/include/cgroup.h
+++ b/criu/include/cgroup.h
@@ -96,4 +96,6 @@ extern void put_ctls(struct list_head *);
int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups);
+int stop_cgroupd(void);
+
#endif /* __CR_CGROUP_H__ */
diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
index e2ea6e17f..183a3b852 100644
--- a/criu/include/namespaces.h
+++ b/criu/include/namespaces.h
@@ -1,6 +1,8 @@
#ifndef __CR_NS_H__
#define __CR_NS_H__
+#include <sys/socket.h>
+
#include "common/compiler.h"
#include "files.h"
#include "common/list.h"
@@ -224,4 +226,19 @@ extern int add_ns_shared_cb(int (*actor)(void *data), void *data);
extern struct ns_id *get_socket_ns(int lfd);
extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd);
+/*
+ * Message exchanged over a unix socket with the credential-passing helper
+ * daemons (usernsd and cgroupd both build it with unsc_msg_init()).
+ */
+struct unsc_msg {
+ struct msghdr h;
+ /*
+ * 0th is the call address
+ * 1st is the flags
+ * 2nd is the optional (NULL in response) arguments
+ */
+ struct iovec iov[3];
+ /*
+ * Ancillary-data buffer: room for one struct ucred plus one int
+ * (presumably a passed file descriptor -- confirm in unsc_msg_init()).
+ */
+ char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))];
+};
+
+/*
+ * Fill in @m. When @pid is non-NULL, *pid is put into the credentials'
+ * pid field; otherwise getpid() is used.
+ */
+extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid);
+/*
+ * Fetch the sender's pid and the fd from a received message.
+ * NOTE(review): the body is not visible in this diff -- confirm how a
+ * missing fd is reported.
+ */
+extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd);
+/*
+ * Fork a child that runs @daemon_func on its end of a socket pair and
+ * exits with the function's return value. The fork() result is stored in
+ * *pid (so it is -1 when fork() fails). Returns the parent's end of the
+ * pair, or -1 on failure.
+ */
+extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk));
+
#endif /* __CR_NS_H__ */
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index d642765e3..bc0beb5cb 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -121,6 +121,8 @@ struct thread_restore_args {
bool seccomp_force_tsync;
char comm[TASK_COMM_LEN];
+ int cg_set;
+ int cgroupd_sk;
} __aligned(64);
typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args);
diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h
index c6979de7f..4265d94ed 100644
--- a/criu/include/servicefd.h
+++ b/criu/include/servicefd.h
@@ -24,6 +24,7 @@ enum sfd_type {
*/
ROOT_FD_OFF, /* Root of the namespace we dump/restore */
CGROUP_YARD,
+ CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */
USERNSD_SK, /* Socket for usernsd */
NS_FD_OFF, /* Node's net namespace fd */
TRANSPORT_FD_OFF, /* to transfer file descriptors */
diff --git a/criu/namespaces.c b/criu/namespaces.c
index 286073ff6..0dc19d5b6 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -4,7 +4,6 @@
#include <stdlib.h>
#include <sys/prctl.h>
#include <grp.h>
-#include <sys/socket.h>
#include <sys/un.h>
#include <stdarg.h>
#include <signal.h>
@@ -1218,20 +1217,9 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map)
return 0;
}
-struct unsc_msg {
- struct msghdr h;
- /*
- * 0th is the call address
- * 1st is the flags
- * 2nd is the optional (NULL in response) arguments
- */
- struct iovec iov[3];
- char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))];
-};
-
static int usernsd_pid;
-static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd)
+inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid)
{
struct cmsghdr *ch;
struct ucred *ucred;
@@ -1269,7 +1257,10 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void
ch->cmsg_type = SCM_CREDENTIALS;
ucred = (struct ucred *)CMSG_DATA(ch);
- ucred->pid = getpid();
+ if (pid)
+ ucred->pid = *pid;
+ else
+ ucred->pid = getpid();
ucred->uid = getuid();
ucred->gid = getgid();
@@ -1284,7 +1275,7 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void
}
}
-static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd)
+void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd)
{
struct cmsghdr *ch;
struct ucred *ucred;
@@ -1322,7 +1313,7 @@ static int usernsd(int sk)
int flags, fd, ret;
pid_t pid;
- unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0);
+ unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL);
if (recvmsg(sk, &um.h, 0) <= 0) {
pr_perror("uns: recv req error");
return -1;
@@ -1367,7 +1358,7 @@ static int usernsd(int sk)
else
fd = -1;
- unsc_msg_init(&um, &call, &ret, NULL, 0, fd);
+ unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL);
if (sendmsg(sk, &um.h, 0) <= 0) {
pr_perror("uns: send resp error");
return -1;
@@ -1418,7 +1409,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg,
/* Send the request */
- unsc_msg_init(&um, &call, &flags, arg, arg_size, fd);
+ unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL);
ret = sendmsg(sk, &um.h, 0);
if (ret <= 0) {
pr_perror("uns: send req error");
@@ -1433,7 +1424,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg,
/* Get the response back */
- unsc_msg_init(&um, &call, &res, NULL, 0, 0);
+ unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL);
ret = recvmsg(sk, &um.h, 0);
if (ret <= 0) {
pr_perror("uns: recv resp error");
@@ -1454,14 +1445,11 @@ out:
return ret;
}
-static int start_usernsd(void)
+int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk))
{
int sk[2];
int one = 1;
- if (!(root_ns_mask & CLONE_NEWUSER))
- return 0;
-
/*
* Seqpacket to
*
@@ -1490,24 +1478,39 @@ static int start_usernsd(void)
return -1;
}
- usernsd_pid = fork();
- if (usernsd_pid < 0) {
- pr_perror("Can't fork usernsd");
+ *pid = fork();
+ if (*pid < 0) {
+ pr_perror("Can't unix daemon");
close(sk[0]);
close(sk[1]);
return -1;
}
- if (usernsd_pid == 0) {
+ if (*pid == 0) {
int ret;
-
close(sk[0]);
- ret = usernsd(sk[1]);
+ ret = daemon_func(sk[1]);
exit(ret);
}
-
close(sk[1]);
- if (install_service_fd(USERNSD_SK, sk[0]) < 0) {
+
+ return sk[0];
+}
+
+static int start_usernsd(void)
+{
+ int sk;
+
+ if (!(root_ns_mask & CLONE_NEWUSER))
+ return 0;
+
+ sk = start_unix_cred_daemon(&usernsd_pid, usernsd);
+ if (sk < 0) {
+ pr_err("failed to start usernsd\n");
+ return -1;
+ }
+
+ if (install_service_fd(USERNSD_SK, sk) < 0) {
kill(usernsd_pid, SIGKILL);
waitpid(usernsd_pid, NULL, 0);
return -1;
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 0e98cb3da..99cff1f7d 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -17,6 +17,7 @@
#include <sys/resource.h>
#include <signal.h>
#include <sys/inotify.h>
+#include <sys/socket.h>
#include "linux/userfaultfd.h"
@@ -586,6 +587,103 @@ static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sig
ARCH_RT_SIGRETURN(new_sp, sigframe);
}
+/*
+ * Send a request to cgroupd for the calling thread and cgroup set @cg_set.
+ *
+ * One message is sent on @sk: a NULL placeholder pointer followed by
+ * @cg_set, with SCM_CREDENTIALS ancillary data whose pid field holds the
+ * caller's thread id. Returns 0 on success, -1 if sys_sendmsg() fails.
+ */
+static int send_cg_set(int sk, int cg_set)
+{
+ struct cmsghdr *ch;
+ struct msghdr h;
+ /*
+ * 0th is the dummy call address for compatibility with userns helper
+ * 1st is the cg_set
+ */
+ struct iovec iov[2];
+ char cmsg[CMSG_SPACE(sizeof(struct ucred))] = {};
+ int ret, *dummy = NULL;
+ struct ucred *ucred;
+
+ iov[0].iov_base = &dummy;
+ iov[0].iov_len = sizeof(dummy);
+ iov[1].iov_base = &cg_set;
+ iov[1].iov_len = sizeof(cg_set);
+
+ /* h is not zero-initialized, so every field used is set by hand below. */
+ h.msg_iov = iov;
+ h.msg_iovlen = sizeof(iov) / sizeof(struct iovec);
+ h.msg_name = NULL;
+ h.msg_namelen = 0;
+ h.msg_flags = 0;
+
+ h.msg_control = cmsg;
+ h.msg_controllen = sizeof(cmsg);
+ ch = CMSG_FIRSTHDR(&h);
+ ch->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+ ch->cmsg_level = SOL_SOCKET;
+ ch->cmsg_type = SCM_CREDENTIALS;
+
+ ucred = (struct ucred *)CMSG_DATA(ch);
+ /*
+ * We still have privilege in this namespace, so we can send the
+ * thread id instead of the pid of the main thread. uid and gid are
+ * sent as 0 since cgroupd ignores these two.
+ */
+ ucred->pid = sys_gettid();
+ ucred->uid = 0;
+ ucred->gid = 0;
+
+ ret = sys_sendmsg(sk, &h, 0);
+ if (ret < 0) {
+ pr_err("Unable to send packet to cgroupd %d\n", ret);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Wait for cgroupd's acknowledgement of a successful threaded cgroup
+ * fixup for the calling thread.
+ *
+ * The socket is shared among threads, so the head of the receive queue
+ * is inspected with recvmsg(MSG_PEEK) until its credentials carry this
+ * thread's id; only then is the message actually consumed.
+ *
+ * Returns 0 once the acknowledgement is consumed, -1 if a receive fails
+ * or the control data does not have the expected size.
+ */
+static int recv_cg_set_restore_ack(int sk)
+{
+ struct cmsghdr *ch;
+ struct msghdr h = {};
+ char cmsg[CMSG_SPACE(sizeof(struct ucred))];
+ struct ucred *cred;
+ int ret;
+
+ /* No iovec is set up: only the ancillary credentials are of interest. */
+ h.msg_control = cmsg;
+ h.msg_controllen = sizeof(cmsg);
+
+ while (1) {
+ ret = sys_recvmsg(sk, &h, MSG_PEEK);
+ if (ret < 0) {
+ pr_err("Unable to peek from cgroupd %d\n", ret);
+ return -1;
+ }
+
+ /*
+ * NOTE(review): msg_controllen is not re-armed between calls; it
+ * stays sizeof(cmsg) only because any other value is rejected here.
+ */
+ if (h.msg_controllen != sizeof(cmsg)) {
+ pr_err("The message from cgroupd is truncated\n");
+ return -1;
+ }
+
+ ch = CMSG_FIRSTHDR(&h);
+ cred = (struct ucred *)CMSG_DATA(ch);
+ /*
+ * The queued message is for another thread: peek again right away
+ * (there is no sleep or yield in this loop).
+ */
+ if (cred->pid != sys_gettid())
+ continue;
+
+ /*
+ * Actually remove the message from the socket's receive queue
+ */
+ ret = sys_recvmsg(sk, &h, 0);
+ if (ret < 0) {
+ pr_err("Unable to receive from cgroupd %d\n", ret);
+ return -1;
+ }
+
+ break;
+ }
+ return 0;
+}
+
/*
* Threads restoration via sigreturn. Note it's locked
* routine and calls for unlock at the end.
@@ -613,6 +711,15 @@ long __export_restore_thread(struct thread_restore_args *args)
rt_sigframe = (void *)&args->mz->rt_sigframe;
+ if (args->cg_set != -1) {
+ pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set);
+ if (send_cg_set(args->cgroupd_sk, args->cg_set))
+ goto core_restore_end;
+ if (recv_cg_set_restore_ack(args->cgroupd_sk))
+ goto core_restore_end;
+ sys_close(args->cgroupd_sk);
+ }
+
if (restore_thread_common(args))
goto core_restore_end;