Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBui Quang Minh <minhquangbui99@gmail.com>2022-09-04 11:41:17 +0300
committerAndrei Vagin <avagin@gmail.com>2022-11-02 07:35:04 +0300
commitda84213352ee4863e1739fd2b4da1e177426f98a (patch)
tree2bfa77d589fd8425be61463031b11c651a8c59db
parentc3a519272881bc36243d142427b8d9f2ddc46f82 (diff)
cgroup-v2: Restore threads in a process into correct threaded controllers
As threads in a process may be in different threaded controllers, we need to move those threads to the correct controllers. Because the threads of a process are restored at a later stage in restorer.c, we need to create a cgroupd service to help move those threads into the correct controllers when they are restored. We cannot use usernsd, as the code in the restorer does not know the address of an outside function to pass to userns_call. However, this cgroupd service still reuses a lot of code from usernsd. The main logic is that restored threads receive the cg_set number they belong to before the restorer stage if their cg_set is different from the main thread's. When these threads are restored, they send the cg_set number and their thread ids through a unix socket to cgroupd. cgroupd receives the cg_set number and thread ids and moves those threads into the correct controllers. Thread ids are sent through SCM_CREDENTIALS of the unix socket, so they are translated into the correct thread ids on the receiving end. Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
-rw-r--r--criu/cgroup.c140
-rw-r--r--criu/cr-restore.c19
-rw-r--r--criu/include/cgroup.h2
-rw-r--r--criu/include/namespaces.h17
-rw-r--r--criu/include/restorer.h2
-rw-r--r--criu/include/servicefd.h1
-rw-r--r--criu/namespaces.c65
-rw-r--r--criu/pie/restorer.c107
8 files changed, 319 insertions, 34 deletions
diff --git a/criu/cgroup.c b/criu/cgroup.c
index b238b6402..918827d99 100644
--- a/criu/cgroup.c
+++ b/criu/cgroup.c
@@ -8,6 +8,7 @@
#include <ftw.h>
#include <libgen.h>
#include <sched.h>
+#include <sys/wait.h>
#include "common/list.h"
#include "xmalloc.h"
@@ -55,6 +56,7 @@ static u32 cg_set_ids = 1;
static LIST_HEAD(cgroups);
static unsigned int n_cgroups;
+static pid_t cgroupd_pid;
static CgSetEntry *find_rst_set_by_id(u32 id)
{
@@ -1935,6 +1937,136 @@ static int prepare_cgroup_sfd(CgroupEntry *ce)
return 0;
}
+/*
+ * If a thread is in a different cgroup set than the main thread of its
+ * process, it means it is in a threaded controller. This daemon receives
+ * the cg_set number from the restored thread and moves this thread into
+ * the correct cgroup controllers.
+ *
+ * Request/reply loop over @sk:
+ *  - receive one message carrying the cg_set id and take the sender's
+ *    thread id from the message via unsc_msg_pid_fd();
+ *  - for every threaded controller of that set, call userns_move() on
+ *    the controller's "cgroup.threads" path with that thread id;
+ *  - send a message back, passing the same thread id to unsc_msg_init().
+ *
+ * The loop has no normal exit: every failure is logged and ends the
+ * daemon with -1.
+ */
+static int cgroupd(int sk)
+{
+ pr_info("cgroud: Daemon started\n"); /* NOTE(review): "cgroud" in this log string looks like a typo for "cgroupd" */
+
+ while (1) {
+ struct unsc_msg um;
+ uns_call_t call;
+ pid_t tid;
+ int fd, cg_set, i;
+ CgSetEntry *cg_set_entry;
+ int ret;
+
+ unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL); /* prepare um so that the request lands in call and cg_set */
+ ret = recvmsg(sk, &um.h, 0);
+ if (ret <= 0) { /* a zero return is treated as an error as well */
+ pr_perror("cgroupd: recv req error");
+ return -1;
+ }
+
+ unsc_msg_pid_fd(&um, &tid, &fd); /* fd is extracted but not used anywhere below */
+ pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set);
+
+ cg_set_entry = find_rst_set_by_id(cg_set);
+ if (!cg_set_entry) {
+ pr_err("cgroupd: No set found %d\n", cg_set);
+ return -1;
+ }
+
+ for (i = 0; i < cg_set_entry->n_ctls; i++) {
+ int j, aux_off;
+ CgMemberEntry *ce = cg_set_entry->ctls[i];
+ char aux[PATH_MAX];
+ CgControllerEntry *ctrl = NULL;
+
+ for (j = 0; j < n_controllers; j++) { /* find the controller entry whose names contain ce->name */
+ CgControllerEntry *cur = controllers[j];
+ if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) {
+ ctrl = cur;
+ break;
+ }
+ }
+
+ if (!ctrl) {
+ pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path);
+ return -1;
+ }
+
+ /*
+ * This is not a threaded controller, all threads in this
+ * process must be in this controller. Main thread has been
+ * restored, so this thread is in this controller already.
+ */
+ if (!ctrl->is_threaded)
+ continue;
+
+ aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
+ snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path); /* the snprintf() result is not checked for truncation */
+
+ /*
+ * Cgroupd runs outside of the namespaces so we don't
+ * need to use userns_call here
+ */
+ if (userns_move(aux, 0, tid)) {
+ pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path);
+ return -1;
+ }
+ }
+
+ /*
+ * We only want to send the cred which contains thread id back.
+ * The restored thread recvmsg(MSG_PEEK) until it gets its own
+ * thread id.
+ */
+ unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid);
+ if (sendmsg(sk, &um.h, 0) <= 0) {
+ pr_perror("cgroupd: send req error");
+ return -1;
+ }
+ }
+
+ return 0; /* not reached: the loop above is only left through return -1 */
+}
+/*
+ * Stop the cgroupd helper if one was started (cgroupd_pid is non-zero):
+ * send it SIGTERM and reap it with waitpid(). Always returns 0; the
+ * results of kill() and waitpid() are not checked, and cgroupd_pid is
+ * not reset here after the child has been reaped.
+ */
+int stop_cgroupd(void)
+{
+ if (cgroupd_pid) {
+ sigset_t blockmask, oldmask;
+
+ /*
+ * Block the SIGCHLD signal to avoid triggering
+ * sigchld_handler() while the daemon is terminated
+ * and reaped below; the previous mask is restored
+ * afterwards.
+ */
+ sigemptyset(&blockmask);
+ sigaddset(&blockmask, SIGCHLD);
+ sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
+
+ kill(cgroupd_pid, SIGTERM);
+ waitpid(cgroupd_pid, NULL, 0);
+
+ sigprocmask(SIG_SETMASK, &oldmask, NULL);
+ }
+
+ return 0;
+}
+/*
+ * Start cgroupd through start_unix_cred_daemon() and install the socket
+ * it returns as the CGROUPD_SK service fd. If installing the service fd
+ * fails, the daemon is killed with SIGKILL and reaped.
+ * Returns 0 on success, -1 on failure.
+ */
+static int prepare_cgroup_thread_sfd(void)
+{
+ int sk;
+
+ sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd); /* stores the child's pid in cgroupd_pid */
+ if (sk < 0) {
+ pr_err("failed to start cgroupd\n");
+ return -1;
+ }
+
+ if (install_service_fd(CGROUPD_SK, sk) < 0) {
+ kill(cgroupd_pid, SIGKILL);
+ waitpid(cgroupd_pid, NULL, 0);
+ return -1;
+ }
+
+ return 0;
+}
+
static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot)
{
size_t dirlen = strlen(*dir_name);
@@ -2089,15 +2221,19 @@ int prepare_cgroup(void)
n_controllers = ce->n_controllers;
controllers = ce->controllers;
- if (n_sets)
+ if (n_sets) {
/*
* We rely on the fact that all sets contain the same
* set of controllers. This is checked during dump
* with cg_set_compare(CGCMP_ISSUB) call.
*/
ret = prepare_cgroup_sfd(ce);
- else
+ if (ret < 0)
+ return ret;
+ ret = prepare_cgroup_thread_sfd();
+ } else {
ret = 0;
+ }
return ret;
}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index d7d3d8edb..78f2a9701 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1349,7 +1349,12 @@ static inline int fork_with_pid(struct pstree_item *item)
return -1;
item->pid->state = ca.core->tc->task_state;
- rsti(item)->cg_set = ca.core->tc->cg_set;
+
+ /* Zombie task's cg_set is stored in task_core */
+ if (item->pid->state == TASK_DEAD)
+ rsti(item)->cg_set = ca.core->tc->cg_set;
+ else
+ rsti(item)->cg_set = ca.core->thread_core->cg_set;
if (ca.core->tc->has_stop_signo)
item->pid->stop_signo = ca.core->tc->stop_signo;
@@ -2376,6 +2381,10 @@ skip_ns_bouncing:
if (ret < 0)
goto out_kill;
+ ret = stop_cgroupd();
+ if (ret < 0)
+ goto out_kill;
+
ret = move_veth_to_bridge();
if (ret < 0)
goto out_kill;
@@ -3812,6 +3821,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
core_get_tls(tcore, &thread_args[i].tls);
+ if (rsti(current)->cg_set != tcore->thread_core->cg_set) {
+ thread_args[i].cg_set = tcore->thread_core->cg_set;
+ thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK));
+ } else {
+ thread_args[i].cg_set = -1;
+ }
+
ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core);
if (ret)
goto err;
@@ -3906,6 +3922,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
close_service_fd(USERNSD_SK);
close_service_fd(FDSTORE_SK_OFF);
close_service_fd(RPC_SK_OFF);
+ close_service_fd(CGROUPD_SK);
__gcov_flush();
diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h
index 5a254559d..93f61539c 100644
--- a/criu/include/cgroup.h
+++ b/criu/include/cgroup.h
@@ -96,4 +96,6 @@ extern void put_ctls(struct list_head *);
int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups);
+int stop_cgroupd(void);
+
#endif /* __CR_CGROUP_H__ */
diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
index e2ea6e17f..183a3b852 100644
--- a/criu/include/namespaces.h
+++ b/criu/include/namespaces.h
@@ -1,6 +1,8 @@
#ifndef __CR_NS_H__
#define __CR_NS_H__
+#include <sys/socket.h>
+
#include "common/compiler.h"
#include "files.h"
#include "common/list.h"
@@ -224,4 +226,19 @@ extern int add_ns_shared_cb(int (*actor)(void *data), void *data);
extern struct ns_id *get_socket_ns(int lfd);
extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd);
+struct unsc_msg { /* one request or reply exchanged with usernsd/cgroupd */
+ struct msghdr h; /* header handed to sendmsg()/recvmsg() */
+ /*
+ * Payload segments:
+ * 0th is the call address
+ * 1st is the flags (cgroupd passes the cg_set id in this slot)
+ * 2nd is the optional (NULL in response) arguments
+ */
+ struct iovec iov[3];
+ char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; /* control buffer with room for one struct ucred and one int; presumably the credentials and a passed fd -- confirm in unsc_msg_init() */
+};
+
+extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid);
+extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd);
+extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk));
+
#endif /* __CR_NS_H__ */
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index d642765e3..bc0beb5cb 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -121,6 +121,8 @@ struct thread_restore_args {
bool seccomp_force_tsync;
char comm[TASK_COMM_LEN];
+ int cg_set;
+ int cgroupd_sk;
} __aligned(64);
typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args);
diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h
index c6979de7f..4265d94ed 100644
--- a/criu/include/servicefd.h
+++ b/criu/include/servicefd.h
@@ -24,6 +24,7 @@ enum sfd_type {
*/
ROOT_FD_OFF, /* Root of the namespace we dump/restore */
CGROUP_YARD,
+ CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */
USERNSD_SK, /* Socket for usernsd */
NS_FD_OFF, /* Node's net namespace fd */
TRANSPORT_FD_OFF, /* to transfer file descriptors */
diff --git a/criu/namespaces.c b/criu/namespaces.c
index 286073ff6..0dc19d5b6 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -4,7 +4,6 @@
#include <stdlib.h>
#include <sys/prctl.h>
#include <grp.h>
-#include <sys/socket.h>
#include <sys/un.h>
#include <stdarg.h>
#include <signal.h>
@@ -1218,20 +1217,9 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map)
return 0;
}
-struct unsc_msg {
- struct msghdr h;
- /*
- * 0th is the call address
- * 1st is the flags
- * 2nd is the optional (NULL in response) arguments
- */
- struct iovec iov[3];
- char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))];
-};
-
static int usernsd_pid;
-static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd)
+inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid)
{
struct cmsghdr *ch;
struct ucred *ucred;
@@ -1269,7 +1257,10 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void
ch->cmsg_type = SCM_CREDENTIALS;
ucred = (struct ucred *)CMSG_DATA(ch);
- ucred->pid = getpid();
+ if (pid)
+ ucred->pid = *pid;
+ else
+ ucred->pid = getpid();
ucred->uid = getuid();
ucred->gid = getgid();
@@ -1284,7 +1275,7 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void
}
}
-static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd)
+void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd)
{
struct cmsghdr *ch;
struct ucred *ucred;
@@ -1322,7 +1313,7 @@ static int usernsd(int sk)
int flags, fd, ret;
pid_t pid;
- unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0);
+ unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL);
if (recvmsg(sk, &um.h, 0) <= 0) {
pr_perror("uns: recv req error");
return -1;
@@ -1367,7 +1358,7 @@ static int usernsd(int sk)
else
fd = -1;
- unsc_msg_init(&um, &call, &ret, NULL, 0, fd);
+ unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL);
if (sendmsg(sk, &um.h, 0) <= 0) {
pr_perror("uns: send resp error");
return -1;
@@ -1418,7 +1409,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg,
/* Send the request */
- unsc_msg_init(&um, &call, &flags, arg, arg_size, fd);
+ unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL);
ret = sendmsg(sk, &um.h, 0);
if (ret <= 0) {
pr_perror("uns: send req error");
@@ -1433,7 +1424,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg,
/* Get the response back */
- unsc_msg_init(&um, &call, &res, NULL, 0, 0);
+ unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL);
ret = recvmsg(sk, &um.h, 0);
if (ret <= 0) {
pr_perror("uns: recv resp error");
@@ -1454,14 +1445,11 @@ out:
return ret;
}
-static int start_usernsd(void)
+int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk))
{
int sk[2];
int one = 1;
- if (!(root_ns_mask & CLONE_NEWUSER))
- return 0;
-
/*
* Seqpacket to
*
@@ -1490,24 +1478,39 @@ static int start_usernsd(void)
return -1;
}
- usernsd_pid = fork();
- if (usernsd_pid < 0) {
- pr_perror("Can't fork usernsd");
+ *pid = fork();
+ if (*pid < 0) {
+ pr_perror("Can't unix daemon");
close(sk[0]);
close(sk[1]);
return -1;
}
- if (usernsd_pid == 0) {
+ if (*pid == 0) {
int ret;
-
close(sk[0]);
- ret = usernsd(sk[1]);
+ ret = daemon_func(sk[1]);
exit(ret);
}
-
close(sk[1]);
- if (install_service_fd(USERNSD_SK, sk[0]) < 0) {
+
+ return sk[0];
+}
+
+static int start_usernsd(void)
+{
+ int sk;
+
+ if (!(root_ns_mask & CLONE_NEWUSER))
+ return 0;
+
+ sk = start_unix_cred_daemon(&usernsd_pid, usernsd);
+ if (sk < 0) {
+ pr_err("failed to start usernsd\n");
+ return -1;
+ }
+
+ if (install_service_fd(USERNSD_SK, sk) < 0) {
kill(usernsd_pid, SIGKILL);
waitpid(usernsd_pid, NULL, 0);
return -1;
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 0e98cb3da..99cff1f7d 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -17,6 +17,7 @@
#include <sys/resource.h>
#include <signal.h>
#include <sys/inotify.h>
+#include <sys/socket.h>
#include "linux/userfaultfd.h"
@@ -586,6 +587,103 @@ static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sig
ARCH_RT_SIGRETURN(new_sp, sigframe);
}
+static int send_cg_set(int sk, int cg_set) /* send cg_set plus the caller's thread id (as credentials) over sk; returns 0 on success, -1 if sys_sendmsg() fails */
+{
+ struct cmsghdr *ch;
+ struct msghdr h;
+ /*
+ * 0th is the dummy call address for compatibility with userns helper
+ * 1st is the cg_set
+ */
+ struct iovec iov[2];
+ char cmsg[CMSG_SPACE(sizeof(struct ucred))] = {};
+ int ret, *dummy = NULL;
+ struct ucred *ucred;
+
+ iov[0].iov_base = &dummy; /* the payload is the NULL pointer value itself */
+ iov[0].iov_len = sizeof(dummy);
+ iov[1].iov_base = &cg_set;
+ iov[1].iov_len = sizeof(cg_set);
+
+ h.msg_iov = iov;
+ h.msg_iovlen = sizeof(iov) / sizeof(struct iovec);
+ h.msg_name = NULL;
+ h.msg_namelen = 0;
+ h.msg_flags = 0;
+
+ h.msg_control = cmsg; /* a single SCM_CREDENTIALS control message is attached */
+ h.msg_controllen = sizeof(cmsg);
+ ch = CMSG_FIRSTHDR(&h);
+ ch->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+ ch->cmsg_level = SOL_SOCKET;
+ ch->cmsg_type = SCM_CREDENTIALS;
+
+ ucred = (struct ucred *)CMSG_DATA(ch);
+ /*
+ * We still have privilege in this namespace, so we can send
+ * the thread id instead of the pid of the main thread. uid and
+ * gid are sent as 0 since these two are ignored in cgroupd.
+ */
+ ucred->pid = sys_gettid();
+ ucred->uid = 0;
+ ucred->gid = 0;
+
+ ret = sys_sendmsg(sk, &h, 0);
+ if (ret < 0) {
+ pr_err("Unable to send packet to cgroupd %d\n", ret);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * As this socket is shared among threads, recvmsg(MSG_PEEK)
+ * from the socket until getting its own thread id as an
+ * acknowledgement of a successful threaded cgroup fixup.
+ *
+ * Each iteration peeks at a message and compares the pid in its
+ * credentials with sys_gettid(). On a match the message is consumed
+ * with a second, non-peeking receive and 0 is returned. On a mismatch
+ * the loop peeks again right away, without waiting in between.
+ * Returns -1 if a receive fails or if the control data length differs
+ * from the size of one ucred control message.
+ */
+static int recv_cg_set_restore_ack(int sk)
+{
+ struct cmsghdr *ch;
+ struct msghdr h = {}; /* zeroed: no iovec is set, only the control buffer below */
+ char cmsg[CMSG_SPACE(sizeof(struct ucred))];
+ struct ucred *cred;
+ int ret;
+
+ h.msg_control = cmsg;
+ h.msg_controllen = sizeof(cmsg);
+
+ while (1) {
+ ret = sys_recvmsg(sk, &h, MSG_PEEK);
+ if (ret < 0) {
+ pr_err("Unable to peek from cgroupd %d\n", ret);
+ return -1;
+ }
+
+ if (h.msg_controllen != sizeof(cmsg)) {
+ pr_err("The message from cgroupd is truncated\n");
+ return -1;
+ }
+
+ ch = CMSG_FIRSTHDR(&h);
+ cred = (struct ucred *)CMSG_DATA(ch);
+ if (cred->pid != sys_gettid()) /* the peeked ack is addressed to another thread */
+ continue;
+
+ /*
+ * Actually remove the message from the recv queue of the socket
+ */
+ ret = sys_recvmsg(sk, &h, 0);
+ if (ret < 0) {
+ pr_err("Unable to receive from cgroupd %d\n", ret);
+ return -1;
+ }
+
+ break;
+ }
+ return 0;
+}
+
/*
* Threads restoration via sigreturn. Note it's locked
* routine and calls for unlock at the end.
@@ -613,6 +711,15 @@ long __export_restore_thread(struct thread_restore_args *args)
rt_sigframe = (void *)&args->mz->rt_sigframe;
+ if (args->cg_set != -1) {
+ pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set);
+ if (send_cg_set(args->cgroupd_sk, args->cg_set))
+ goto core_restore_end;
+ if (recv_cg_set_restore_ack(args->cgroupd_sk))
+ goto core_restore_end;
+ sys_close(args->cgroupd_sk);
+ }
+
if (restore_thread_common(args))
goto core_restore_end;