diff options
-rw-r--r-- | criu/cgroup.c | 140 | ||||
-rw-r--r-- | criu/cr-restore.c | 19 | ||||
-rw-r--r-- | criu/include/cgroup.h | 2 | ||||
-rw-r--r-- | criu/include/namespaces.h | 17 | ||||
-rw-r--r-- | criu/include/restorer.h | 2 | ||||
-rw-r--r-- | criu/include/servicefd.h | 1 | ||||
-rw-r--r-- | criu/namespaces.c | 65 | ||||
-rw-r--r-- | criu/pie/restorer.c | 107 |
8 files changed, 319 insertions, 34 deletions
diff --git a/criu/cgroup.c b/criu/cgroup.c index b238b6402..918827d99 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -8,6 +8,7 @@ #include <ftw.h> #include <libgen.h> #include <sched.h> +#include <sys/wait.h> #include "common/list.h" #include "xmalloc.h" @@ -55,6 +56,7 @@ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; +static pid_t cgroupd_pid; static CgSetEntry *find_rst_set_by_id(u32 id) { @@ -1935,6 +1937,136 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } +/* + * If a thread is in a different cgroup set than the main thread in the process, + * it means it is in a threaded controller. This daemon receives the cg_set + * number from the restored thread and moves this thread to the correct + * cgroup controllers. + */ +static int cgroupd(int sk) +{ + pr_info("cgroud: Daemon started\n"); + + while (1) { + struct unsc_msg um; + uns_call_t call; + pid_t tid; + int fd, cg_set, i; + CgSetEntry *cg_set_entry; + int ret; + + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL); + ret = recvmsg(sk, &um.h, 0); + if (ret <= 0) { + pr_perror("cgroupd: recv req error"); + return -1; + } + + unsc_msg_pid_fd(&um, &tid, &fd); + pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set); + + cg_set_entry = find_rst_set_by_id(cg_set); + if (!cg_set_entry) { + pr_err("cgroupd: No set found %d\n", cg_set); + return -1; + } + + for (i = 0; i < cg_set_entry->n_ctls; i++) { + int j, aux_off; + CgMemberEntry *ce = cg_set_entry->ctls[i]; + char aux[PATH_MAX]; + CgControllerEntry *ctrl = NULL; + + for (j = 0; j < n_controllers; j++) { + CgControllerEntry *cur = controllers[j]; + if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { + ctrl = cur; + break; + } + } + + if (!ctrl) { + pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path); + return -1; + } + + /* + * This is not a threaded controller, all threads in this + * process must be in this controller.
Main thread has been + * restored, so this thread is in this controller already. + */ + if (!ctrl->is_threaded) + continue; + + aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path); + + /* + * Cgroupd runs outside of the namespaces so we don't + * need to use userns_call here + */ + if (userns_move(aux, 0, tid)) { + pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path); + return -1; + } + } + + /* + * We only want to send the cred which contains thread id back. + * The restored thread recvmsg(MSG_PEEK) until it gets its own + * thread id. + */ + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid); + if (sendmsg(sk, &um.h, 0) <= 0) { + pr_perror("cgroupd: send req error"); + return -1; + } + } + + return 0; +} + +int stop_cgroupd(void) +{ + if (cgroupd_pid) { + sigset_t blockmask, oldmask; + + /* + * Block the SIGCHLD signal to avoid triggering + * sigchld_handler() + */ + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + kill(cgroupd_pid, SIGTERM); + waitpid(cgroupd_pid, NULL, 0); + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + } + + return 0; +} + +static int prepare_cgroup_thread_sfd(void) +{ + int sk; + + sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd); + if (sk < 0) { + pr_err("failed to start cgroupd\n"); + return -1; + } + + if (install_service_fd(CGROUPD_SK, sk) < 0) { + kill(cgroupd_pid, SIGKILL); + waitpid(cgroupd_pid, NULL, 0); + return -1; + } + + return 0; +} + static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot) { size_t dirlen = strlen(*dir_name); @@ -2089,15 +2221,19 @@ int prepare_cgroup(void) n_controllers = ce->n_controllers; controllers = ce->controllers; - if (n_sets) + if (n_sets) { /* * We rely on the fact that all sets contain the same * set of controllers. 
This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. */ ret = prepare_cgroup_sfd(ce); - else + if (ret < 0) + return ret; + ret = prepare_cgroup_thread_sfd(); + } else { ret = 0; + } return ret; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d7d3d8edb..78f2a9701 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1349,7 +1349,12 @@ static inline int fork_with_pid(struct pstree_item *item) return -1; item->pid->state = ca.core->tc->task_state; - rsti(item)->cg_set = ca.core->tc->cg_set; + + /* Zombie task's cg_set is stored in task_core */ + if (item->pid->state == TASK_DEAD) + rsti(item)->cg_set = ca.core->tc->cg_set; + else + rsti(item)->cg_set = ca.core->thread_core->cg_set; if (ca.core->tc->has_stop_signo) item->pid->stop_signo = ca.core->tc->stop_signo; @@ -2376,6 +2381,10 @@ skip_ns_bouncing: if (ret < 0) goto out_kill; + ret = stop_cgroupd(); + if (ret < 0) + goto out_kill; + ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; @@ -3812,6 +3821,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); + if (rsti(current)->cg_set != tcore->thread_core->cg_set) { + thread_args[i].cg_set = tcore->thread_core->cg_set; + thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); + } else { + thread_args[i].cg_set = -1; + } + ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); if (ret) goto err; @@ -3906,6 +3922,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); + close_service_fd(CGROUPD_SK); __gcov_flush(); diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 5a254559d..93f61539c 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -96,4 +96,6 @@ extern void put_ctls(struct list_head *); int collect_controllers(struct 
list_head *cgroups, unsigned int *n_cgroups); +int stop_cgroupd(void); + #endif /* __CR_CGROUP_H__ */ diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index e2ea6e17f..183a3b852 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -1,6 +1,8 @@ #ifndef __CR_NS_H__ #define __CR_NS_H__ +#include <sys/socket.h> + #include "common/compiler.h" #include "files.h" #include "common/list.h" @@ -224,4 +226,19 @@ extern int add_ns_shared_cb(int (*actor)(void *data), void *data); extern struct ns_id *get_socket_ns(int lfd); extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); +struct unsc_msg { + struct msghdr h; + /* + * 0th is the call address + * 1st is the flags + * 2nd is the optional (NULL in response) arguments + */ + struct iovec iov[3]; + char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; +}; + +extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid); +extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd); +extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)); + #endif /* __CR_NS_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index d642765e3..bc0beb5cb 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -121,6 +121,8 @@ struct thread_restore_args { bool seccomp_force_tsync; char comm[TASK_COMM_LEN]; + int cg_set; + int cgroupd_sk; } __aligned(64); typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args); diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index c6979de7f..4265d94ed 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ -24,6 +24,7 @@ enum sfd_type { */ ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, + CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */ USERNSD_SK, /* Socket for usernsd */ NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to 
transfer file descriptors */ diff --git a/criu/namespaces.c b/criu/namespaces.c index 286073ff6..0dc19d5b6 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -4,7 +4,6 @@ #include <stdlib.h> #include <sys/prctl.h> #include <grp.h> -#include <sys/socket.h> #include <sys/un.h> #include <stdarg.h> #include <signal.h> @@ -1218,20 +1217,9 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) return 0; } -struct unsc_msg { - struct msghdr h; - /* - * 0th is the call address - * 1st is the flags - * 2nd is the optional (NULL in response) arguments - */ - struct iovec iov[3]; - char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; -}; - static int usernsd_pid; -static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd) +inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid) { struct cmsghdr *ch; struct ucred *ucred; @@ -1269,7 +1257,10 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void ch->cmsg_type = SCM_CREDENTIALS; ucred = (struct ucred *)CMSG_DATA(ch); - ucred->pid = getpid(); + if (pid) + ucred->pid = *pid; + else + ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); @@ -1284,7 +1275,7 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void } } -static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) +void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; @@ -1322,7 +1313,7 @@ static int usernsd(int sk) int flags, fd, ret; pid_t pid; - unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0); + unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL); if (recvmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: recv req error"); return -1; @@ -1367,7 +1358,7 @@ static int usernsd(int sk) else fd = -1; - unsc_msg_init(&um, &call, &ret, NULL, 0, fd); + unsc_msg_init(&um, &call, 
&ret, NULL, 0, fd, NULL); if (sendmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: send resp error"); return -1; @@ -1418,7 +1409,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Send the request */ - unsc_msg_init(&um, &call, &flags, arg, arg_size, fd); + unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL); ret = sendmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: send req error"); @@ -1433,7 +1424,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Get the response back */ - unsc_msg_init(&um, &call, &res, NULL, 0, 0); + unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL); ret = recvmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: recv resp error"); @@ -1454,14 +1445,11 @@ out: return ret; } -static int start_usernsd(void) +int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) { int sk[2]; int one = 1; - if (!(root_ns_mask & CLONE_NEWUSER)) - return 0; - /* * Seqpacket to * @@ -1490,24 +1478,39 @@ static int start_usernsd(void) return -1; } - usernsd_pid = fork(); - if (usernsd_pid < 0) { - pr_perror("Can't fork usernsd"); + *pid = fork(); + if (*pid < 0) { + pr_perror("Can't unix daemon"); close(sk[0]); close(sk[1]); return -1; } - if (usernsd_pid == 0) { + if (*pid == 0) { int ret; - close(sk[0]); - ret = usernsd(sk[1]); + ret = daemon_func(sk[1]); exit(ret); } - close(sk[1]); - if (install_service_fd(USERNSD_SK, sk[0]) < 0) { + + return sk[0]; +} + +static int start_usernsd(void) +{ + int sk; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + sk = start_unix_cred_daemon(&usernsd_pid, usernsd); + if (sk < 0) { + pr_err("failed to start usernsd\n"); + return -1; + } + + if (install_service_fd(USERNSD_SK, sk) < 0) { kill(usernsd_pid, SIGKILL); waitpid(usernsd_pid, NULL, 0); return -1; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0e98cb3da..99cff1f7d 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -17,6 +17,7 @@ #include <sys/resource.h> 
#include <signal.h> #include <sys/inotify.h> +#include <sys/socket.h> #include "linux/userfaultfd.h" @@ -586,6 +587,103 @@ static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sig ARCH_RT_SIGRETURN(new_sp, sigframe); } +static int send_cg_set(int sk, int cg_set) +{ + struct cmsghdr *ch; + struct msghdr h; + /* + * 0th is the dummy call address for compatibility with userns helper + * 1st is the cg_set + */ + struct iovec iov[2]; + char cmsg[CMSG_SPACE(sizeof(struct ucred))] = {}; + int ret, *dummy = NULL; + struct ucred *ucred; + + iov[0].iov_base = &dummy; + iov[0].iov_len = sizeof(dummy); + iov[1].iov_base = &cg_set; + iov[1].iov_len = sizeof(cg_set); + + h.msg_iov = iov; + h.msg_iovlen = sizeof(iov) / sizeof(struct iovec); + h.msg_name = NULL; + h.msg_namelen = 0; + h.msg_flags = 0; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_CREDENTIALS; + + ucred = (struct ucred *)CMSG_DATA(ch); + /* + * We still have privilege in this namespace, so we can send the + * thread id instead of the pid of the main thread; uid and gid are + * set to 0 since these two are ignored by cgroupd + */ + ucred->pid = sys_gettid(); + ucred->uid = 0; + ucred->gid = 0; + + ret = sys_sendmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to send packet to cgroupd %d\n", ret); + return -1; + } + + return 0; +} + +/* + * As this socket is shared among threads, recvmsg(MSG_PEEK) + * from the socket until getting its own thread id as an + * acknowledgement of successful threaded cgroup fixup + */ +static int recv_cg_set_restore_ack(int sk) +{ + struct cmsghdr *ch; + struct msghdr h = {}; + char cmsg[CMSG_SPACE(sizeof(struct ucred))]; + struct ucred *cred; + int ret; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + + while (1) { + ret = sys_recvmsg(sk, &h, MSG_PEEK); + if (ret < 0) { + pr_err("Unable to peek from cgroupd %d\n", ret); + return -1; + }
+ + if (h.msg_controllen != sizeof(cmsg)) { + pr_err("The message from cgroupd is truncated\n"); + return -1; + } + + ch = CMSG_FIRSTHDR(&h); + cred = (struct ucred *)CMSG_DATA(ch); + if (cred->pid != sys_gettid()) + continue; + + /* + * Actually remove the message from the socket's receive queue + */ + ret = sys_recvmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to receive from cgroupd %d\n", ret); + return -1; + } + + break; + } + return 0; +} + /* * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. @@ -613,6 +711,15 @@ long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; + if (args->cg_set != -1) { + pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); + if (send_cg_set(args->cgroupd_sk, args->cg_set)) + goto core_restore_end; + if (recv_cg_set_restore_ack(args->cgroupd_sk)) + goto core_restore_end; + sys_close(args->cgroupd_sk); + } + if (restore_thread_common(args)) goto core_restore_end; |