Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPavel Emelyanov <xemul@parallels.com>2015-02-13 15:05:24 +0300
committerPavel Emelyanov <xemul@parallels.com>2015-02-13 15:11:38 +0300
commitb8556e8084981f9148749e7777a3b257cc94b0bd (patch)
treeae82b28974530cdfcd32ad302bd1094502c1a1da
parentbb0a6f0cb60312f51a4d38f8854f39c942670d29 (diff)
usernsd: The way to restore privileged stuff in userns
We have collected a good set of calls that cannot be done inside user namespaces, but that we need to do [1]. Some of them have already been addressed, like the prctl mm bits restore, but some have not. I'm pretty sceptical about the ability to relax the security checks on quite a lot of them (e.g. open-by-handle is indeed a very dangerous operation if allowed to an unprivileged user), so we need some way to call those things even in user namespaces. The good news is that all the calls I've found operate on file descriptors one way or another. So if we had a process that lived outside of the user namespace, we could ask it to do the high-privilege operation we need and exchange the affected file descriptor via a unix socket. So usernsd is the one doing exactly this. It starts before we create the user namespace and accepts requests via a unix socket. Clients (the processes we restore) send it the function they want to call, the descriptor they want to operate on and the arguments blob. Optionally, they can request some file descriptor back after the call. In the non-user-namespace case the daemon is not started and the calls are done right in the requestor's process environment. In the next patch there's an example of how to use this daemon to do the privileged SO_SNDBUFFORCE/_RCVBUFFORCE sockopt on a socket. [1] http://criu.org/UserNamespace Signed-off-by: Pavel Emelyanov <xemul@parallels.com> Acked-by: Andrew Vagin <avagin@openvz.org>
-rw-r--r--cr-restore.c10
-rw-r--r--include/namespaces.h29
-rw-r--r--include/rst_info.h1
-rw-r--r--include/servicefd.h1
-rw-r--r--namespaces.c322
5 files changed, 363 insertions, 0 deletions
diff --git a/cr-restore.c b/cr-restore.c
index 95e6fd127..2afdb452b 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -1712,6 +1712,9 @@ static int restore_root_task(struct pstree_item *init)
return -1;
}
+ if (start_usernsd())
+ return -1;
+
futex_set(&task_entries->nr_in_progress,
stage_participants(CR_STATE_RESTORE_NS));
@@ -1775,6 +1778,10 @@ static int restore_root_task(struct pstree_item *init)
if (ret < 0)
goto out_kill;
+ ret = stop_usernsd();
+ if (ret < 0)
+ goto out_kill;
+
ret = move_veth_to_bridge();
if (ret < 0)
goto out_kill;
@@ -1849,6 +1856,7 @@ out_kill:
}
out:
+ stop_usernsd();
__restore_switch_stage(CR_STATE_FAIL);
pr_err("Restoring FAILED.\n");
return -1;
@@ -1868,6 +1876,7 @@ static int prepare_task_entries(void)
task_entries->nr_helpers = 0;
futex_set(&task_entries->start, CR_STATE_RESTORE_NS);
mutex_init(&task_entries->zombie_lock);
+ mutex_init(&task_entries->userns_sync_lock);
return 0;
}
@@ -2955,6 +2964,7 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
close_image_dir();
close_proc();
close_service_fd(ROOT_FD_OFF);
+ close_service_fd(USERNSD_SK);
__gcov_flush();
diff --git a/include/namespaces.h b/include/namespaces.h
index d68a6100d..52d2f34f5 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -73,9 +73,38 @@ extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
extern int collect_user_namespaces(bool for_dump);
extern int prepare_userns(struct pstree_item *item);
+extern int start_usernsd(void);
+extern int stop_usernsd(void);
extern int userns_uid(int uid);
extern int userns_gid(int gid);
extern int dump_user_ns(pid_t pid, int ns_id);
extern void free_userns_maps(void);
+typedef int (*uns_call_t)(void *arg, int fd);
+/*
+ * Async call -- the call is guaranteed to have completed by the
+ * time CR_STATE_COMPLETE happens. The function may return even
+ * before the call starts.
+ * W/o this flag the call is synchronous -- the function returns
+ * strictly after the call finishes.
+ */
+#define UNS_ASYNC 0x1
+/*
+ * The call returns an FD which should be sent back. Conflicts
+ * with UNS_ASYNC.
+ */
+#define UNS_FDOUT 0x2
+
+/*
+ * When we're restoring inside a user namespace, some things are
+ * not allowed to be done there due to insufficient capabilities.
+ * If the operation in question can be offloaded to another process,
+ * this call allows doing that.
+ *
+ * In case we're not in a userns, just call the callback immediately
+ * in the context of the calling task.
+ */
+int userns_call(uns_call_t call, int flags,
+ void *arg, size_t arg_size, int fd);
+
#endif /* __CR_NS_H__ */
diff --git a/include/rst_info.h b/include/rst_info.h
index d509c0c21..f8d3ca8e0 100644
--- a/include/rst_info.h
+++ b/include/rst_info.h
@@ -11,6 +11,7 @@ struct task_entries {
futex_t start;
mutex_t zombie_lock;
atomic_t cr_err;
+ mutex_t userns_sync_lock;
};
struct fdt {
diff --git a/include/servicefd.h b/include/servicefd.h
index bdadc0fcf..3c6e08a74 100644
--- a/include/servicefd.h
+++ b/include/servicefd.h
@@ -17,6 +17,7 @@ enum sfd_type {
*/
ROOT_FD_OFF, /* Root of the namespace we dump/restore */
CGROUP_YARD,
+ USERNSD_SK, /* Socket for usernsd */
SERVICE_FD_MAX
};
diff --git a/namespaces.c b/namespaces.c
index c1cf06976..eab4d1798 100644
--- a/namespaces.c
+++ b/namespaces.c
@@ -4,6 +4,10 @@
#include <stdlib.h>
#include <sys/prctl.h>
#include <grp.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdarg.h>
+#include <signal.h>
#include "cr-show.h"
#include "util.h"
@@ -835,6 +839,324 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map)
return 0;
}
+struct unsc_msg { /* one request or response exchanged with usernsd */
+ struct msghdr h; /* header handed to sendmsg()/recvmsg() */
+ /*
+ * Payload vector of the message:
+ * 0th is the call address
+ * 1st is the flags in a request and the call's return
+ *     value in a response
+ * 2nd is the optional arguments blob (absent in a response)
+ */
+ struct iovec iov[3];
+ char c[CMSG_SPACE(sizeof(int))]; /* control buffer for a single SCM_RIGHTS fd */
+};
+
+#define MAX_MSG_SIZE 256 /* size of the daemon's receive buffer; userns_call() rejects larger argument blobs */
+
+static int usernsd_pid; /* pid of the forked usernsd; while zero, userns_call() runs the callback in the caller */
+
+/*
+ * Prepare @m for one sendmsg() or recvmsg() call.
+ *
+ * @c and @x always become the first two payload slots; @arg/@asize
+ * add a third one when @arg is not NULL. A negative @fd leaves the
+ * message without ancillary data, any other value wires up the
+ * control buffer with a single SCM_RIGHTS entry holding @fd.
+ */
+static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c,
+		int *x, void *arg, size_t asize, int fd)
+{
+	struct msghdr *hdr = &m->h;
+	struct cmsghdr *ch;
+
+	/* No address is used with these messages */
+	hdr->msg_name = NULL;
+	hdr->msg_namelen = 0;
+	hdr->msg_flags = 0;
+
+	/* Mandatory part of the payload */
+	m->iov[0].iov_base = c;
+	m->iov[0].iov_len = sizeof(*c);
+	m->iov[1].iov_base = x;
+	m->iov[1].iov_len = sizeof(*x);
+
+	hdr->msg_iov = m->iov;
+	hdr->msg_iovlen = 2;
+
+	/* Optional arguments blob */
+	if (arg) {
+		m->iov[2].iov_base = arg;
+		m->iov[2].iov_len = asize;
+		hdr->msg_iovlen = 3;
+	}
+
+	if (fd < 0) {
+		hdr->msg_control = NULL;
+		hdr->msg_controllen = 0;
+		return;
+	}
+
+	/* Room for (or contents of) exactly one passed descriptor */
+	hdr->msg_control = m->c;
+	hdr->msg_controllen = sizeof(m->c);
+
+	ch = CMSG_FIRSTHDR(hdr);
+	ch->cmsg_len = CMSG_LEN(sizeof(int));
+	ch->cmsg_level = SOL_SOCKET;
+	ch->cmsg_type = SCM_RIGHTS;
+	*((int *)CMSG_DATA(ch)) = fd;
+}
+
+/*
+ * Extract the descriptor carried in the control part of @um.
+ *
+ * Returns -1 when there is no control message or its length is not
+ * that of a single int; otherwise the message must be SCM_RIGHTS at
+ * SOL_SOCKET level (checked with BUG_ON) and its descriptor is
+ * returned.
+ */
+static int unsc_msg_fd(struct unsc_msg *um)
+{
+	struct cmsghdr *ch = CMSG_FIRSTHDR(&um->h);
+
+	if (!ch || ch->cmsg_len != CMSG_LEN(sizeof(int)))
+		return -1;
+
+	BUG_ON(ch->cmsg_level != SOL_SOCKET);
+	BUG_ON(ch->cmsg_type != SCM_RIGHTS);
+
+	return *((int *)CMSG_DATA(ch));
+}
+
+/*
+ * Body of the daemon: serve requests arriving on @sk until a
+ * socket error or a failed asynchronous call, in which case -1 is
+ * returned. Each request carries the address of the routine to run,
+ * the UNS_* flags, an arguments blob and, optionally, a descriptor.
+ */
+static int usernsd(int sk)
+{
+	pr_info("UNS: Daemon started\n");
+
+	for (;;) {
+		static char msg[MAX_MSG_SIZE];
+		struct unsc_msg um;
+		uns_call_t call;
+		int flags, fd, ret, fd_out;
+
+		/* Passing 0 as fd just arms the control buffer */
+		unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0);
+		if (recvmsg(sk, &um.h, 0) <= 0) {
+			pr_perror("UNS: recv req error");
+			return -1;
+		}
+
+		fd = unsc_msg_fd(&um);
+		pr_debug("UNS: daemon calls %p (%d, %x)\n", call, fd, flags);
+
+		/*
+		 * The request holds the bare address of the routine.
+		 * The requester and the daemon are forked from the
+		 * same process, thus the daemon has the very same
+		 * code at the very same address and can simply jump
+		 * there.
+		 */
+		ret = call(msg, fd);
+
+		if (fd >= 0)
+			close(fd);
+
+		if (flags & UNS_ASYNC) {
+			if (ret >= 0)
+				continue;
+
+			/*
+			 * Nobody waits for the result of an async call,
+			 * so the only way to report the failure is to
+			 * exit. It will be noticed by stop_usernsd()
+			 * checking the exit code, either at the end of
+			 * restore or after the next userns_call() fails
+			 * to send its request.
+			 */
+			pr_err("UNS: Async call failed. Exiting\n");
+			return -1;
+		}
+
+		/* With UNS_FDOUT the call's result is an fd to hand back */
+		fd_out = (flags & UNS_FDOUT) ? ret : -1;
+
+		unsc_msg_init(&um, &call, &ret, NULL, 0, fd_out);
+		if (sendmsg(sk, &um.h, 0) <= 0) {
+			pr_perror("UNS: send resp error");
+			return -1;
+		}
+
+		if (fd_out >= 0)
+			close(fd_out);
+	}
+}
+
+/*
+ * Run @call(@arg, @fd) with the privileges of usernsd.
+ *
+ * When no daemon is running the callback is invoked right here and
+ * its result is returned. Otherwise the request is sent to the
+ * daemon; with UNS_ASYNC the function returns 0 once the request is
+ * sent, without it the response is awaited and either the call's
+ * return value or, with UNS_FDOUT, the descriptor sent back is
+ * returned. -1 is returned when @arg_size exceeds MAX_MSG_SIZE or
+ * the socket exchange fails.
+ */
+int userns_call(uns_call_t call, int flags,
+		void *arg, size_t arg_size, int fd)
+{
+	struct unsc_msg um;
+	bool async = flags & UNS_ASYNC;
+	int sk, res, ret;
+
+	if (unlikely(arg_size > MAX_MSG_SIZE)) {
+		pr_err("UNS: message size exceeded\n");
+		return -1;
+	}
+
+	if (!usernsd_pid)
+		return call(arg, fd);
+
+	sk = get_service_fd(USERNSD_SK);
+	pr_debug("UNS: calling %p (%d, %x)\n", call, fd, flags);
+
+	if (async) {
+		/*
+		 * A callback that is expected to give us an FD
+		 * must NOT be called asynchronously.
+		 */
+		BUG_ON(flags & UNS_FDOUT);
+	} else {
+		/*
+		 * Only synchronous requests take the lock. Async
+		 * ones merely queue the request and never read
+		 * from the socket, so whenever the daemon responds
+		 * there is exactly one client sitting in the
+		 * recvmsg() below and the response reaches the
+		 * proper caller.
+		 */
+		mutex_lock(&task_entries->userns_sync_lock);
+	}
+
+	ret = -1;
+
+	/* Request goes out */
+	unsc_msg_init(&um, &call, &flags, arg, arg_size, fd);
+	if (sendmsg(sk, &um.h, 0) <= 0) {
+		pr_perror("UNS: send req error");
+		goto out;
+	}
+
+	if (async) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Response comes in */
+	unsc_msg_init(&um, &call, &res, NULL, 0, 0);
+	if (recvmsg(sk, &um.h, 0) <= 0) {
+		pr_perror("UNS: recv resp error");
+		goto out;
+	}
+
+	/* Either the descriptor sent back or the plain return code */
+	ret = (flags & UNS_FDOUT) ? unsc_msg_fd(&um) : res;
+out:
+	if (!async)
+		mutex_unlock(&task_entries->userns_sync_lock);
+
+	return ret;
+}
+
+/*
+ * Fork the usernsd daemon and install the client end of its socket
+ * as the USERNSD_SK service descriptor.
+ *
+ * Does nothing and returns 0 when CLONE_NEWUSER is not set in
+ * root_ns_mask. Returns 0 on success and -1 on failure; on failure
+ * usernsd_pid is left untouched (zero), so that userns_call() and
+ * stop_usernsd() do not mistake a failed fork() result or an
+ * already reaped pid for a running daemon.
+ */
+int start_usernsd(void)
+{
+	int sk[2];
+	pid_t pid;
+
+	if (!(root_ns_mask & CLONE_NEWUSER))
+		return 0;
+
+	/*
+	 * Seqpacket to
+	 *
+	 * a) Help the daemon distinguish individual requests from
+	 *    each other easily. A stream socket would require
+	 *    manual message boundaries.
+	 *
+	 * b) Make callers notice the daemon's death by seeing the
+	 *    disconnected socket. With a dgram socket callers
+	 *    would just get stuck receiving the response.
+	 */
+
+	if (socketpair(PF_UNIX, SOCK_SEQPACKET, 0, sk)) {
+		pr_perror("Can't make usernsd socket");
+		return -1;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("Can't fork usernsd");
+		close(sk[0]);
+		close(sk[1]);
+		return -1;
+	}
+
+	if (pid == 0) {
+		int ret;
+
+		/* Daemon side: keep only our end of the pair */
+		close(sk[0]);
+		ret = usernsd(sk[1]);
+		exit(ret);
+	}
+
+	close(sk[1]);
+	if (install_service_fd(USERNSD_SK, sk[0]) < 0) {
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		close(sk[0]);
+		return -1;
+	}
+
+	/* Publish the daemon only once it is fully reachable */
+	usernsd_pid = pid;
+
+	close(sk[0]);
+	return 0;
+}
+
+/*
+ * uns_call_t callback that terminates the process running it with
+ * the exit code stored in the int @arg points to. @fd is not used.
+ */
+static int exit_usernsd(void *arg, int fd)
+{
+	int *codep = arg;
+
+	pr_info("UNS: `- daemon exits w/ %d\n", *codep);
+	exit(*codep);
+}
+
+/*
+ * Ask the daemon to exit and collect its exit status.
+ *
+ * Returns 0 when no daemon was started or it exited with code 0,
+ * the daemon's non-zero exit code otherwise, and -1 when the
+ * collected status does not describe a normal exit.
+ */
+int stop_usernsd(void)
+{
+	int ret = 0;
+
+	if (usernsd_pid) {
+		/* Pre-set so that an unfilled status is not taken for a clean exit */
+		int status = -1;
+		sigset_t blockmask, oldmask;
+
+		/*
+		 * Don't let the sigchld_handler() mess with us
+		 * calling waitpid() on the exited daemon. The
+		 * same is done in cr_system().
+		 */
+
+		sigemptyset(&blockmask);
+		sigaddset(&blockmask, SIGCHLD);
+		sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
+
+		/*
+		 * Send a message to make sure the daemon _has_
+		 * processed its whole queue of asynchronous requests.
+		 *
+		 * All the restoring processes might have already
+		 * closed their USERNSD_SK descriptors, but the daemon
+		 * still has its end in connected state -- it is us
+		 * who hold the last reference on the peer.
+		 *
+		 * If the daemon has exited "in advance" due to an
+		 * async call or a socket error, the userns_call()
+		 * fails, which is why its result is not checked
+		 * here -- the bad exit status collected below
+		 * reports the problem.
+		 */
+
+		userns_call(exit_usernsd, UNS_ASYNC, &ret, sizeof(ret), -1);
+		waitpid(usernsd_pid, &status, 0);
+
+		if (WIFEXITED(status))
+			ret = WEXITSTATUS(status);
+		else
+			ret = -1;
+
+		usernsd_pid = 0;
+
+		/*
+		 * Put the saved mask back. SIG_SETMASK is required
+		 * here: SIG_BLOCK would only add the old mask to the
+		 * current one and leave SIGCHLD blocked forever.
+		 */
+		sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+		if (ret != 0)
+			pr_err("UNS: daemon exited abnormally\n");
+		else
+			pr_info("UNS: daemon stopped\n");
+	}
+
+	return ret;
+}
+
int prepare_userns(struct pstree_item *item)
{
struct cr_img *img;