Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Younes Manton <ymanton@ca.ibm.com> 2022-08-12 21:56:53 +0300
committer Andrei Vagin <avagin@gmail.com> 2022-10-25 17:26:42 +0300
commit 3b5f5c7d485964500215ba74b874195b60df85fd (patch)
tree c42bd68928a6044dd43f8d77b0dc960103d897e4
parent de70d2c9c10daac00d8e7c0f20da33eb31c48993 (diff)
non-root: enable non-root checkpoint/restore
This commit enables checkpointing and restoring of applications as non-root. The first goal was to enable checkpoint and restore of the env00 and pthread00 test cases. This uses the information from opts.unprivileged and opts.cap_eff to skip certain code paths which do not work as non-root.

Co-authored-by: Adrian Reber <areber@redhat.com>
Signed-off-by: Younes Manton <ymanton@ca.ibm.com>
-rw-r--r-- criu/cgroup.c 6
-rw-r--r-- criu/config.c 1
-rw-r--r-- criu/cr-check.c 71
-rw-r--r-- criu/cr-restore.c 3
-rw-r--r-- criu/cr-service.c 7
-rw-r--r-- criu/crtools.c 5
-rw-r--r-- criu/fdstore.c 16
-rw-r--r-- criu/files.c 46
-rw-r--r-- criu/image.c 3
-rw-r--r-- criu/include/cr_options.h 11
-rw-r--r-- criu/include/util.h 2
-rw-r--r-- criu/namespaces.c 11
-rw-r--r-- criu/pie/restorer.c 26
-rw-r--r-- criu/timens.c 4
-rw-r--r-- criu/util.c 22
-rw-r--r-- images/rpc.proto 1
-rw-r--r-- lib/c/criu.c 11
-rw-r--r-- lib/c/criu.h 1
18 files changed, 194 insertions, 53 deletions
diff --git a/criu/cgroup.c b/criu/cgroup.c
index 325df6a1d..d886ce9f2 100644
--- a/criu/cgroup.c
+++ b/criu/cgroup.c
@@ -734,6 +734,9 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_
unsigned int n_ctls = 0;
struct cg_set *cs;
+ if (opts.unprivileged)
+ return 0;
+
if (item)
pid = item->pid->real;
else
@@ -989,6 +992,9 @@ int dump_cgroups(void)
CgroupEntry cg = CGROUP_ENTRY__INIT;
int ret = -1;
+ if (opts.unprivileged)
+ return 0;
+
BUG_ON(!criu_cgset || !root_cgset);
/*
diff --git a/criu/config.c b/criu/config.c
index c078848ec..9ba79c8ef 100644
--- a/criu/config.c
+++ b/criu/config.c
@@ -700,6 +700,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd,
{ "lsm-mount-context", required_argument, 0, 1099 },
{ "network-lock", required_argument, 0, 1100 },
BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode),
+ BOOL_OPT("unprivileged", &opts.unprivileged),
{},
};
diff --git a/criu/cr-check.c b/criu/cr-check.c
index b90e6a9bf..b54c79387 100644
--- a/criu/cr-check.c
+++ b/criu/cr-check.c
@@ -21,6 +21,7 @@
#include <sys/prctl.h>
#include <sched.h>
#include <sys/mount.h>
+#include <sys/utsname.h>
#include "../soccr/soccr.h"
@@ -515,6 +516,14 @@ static int check_ipc(void)
{
int ret;
+ /*
+ * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however
+ * for non-root users access() runs with an empty set of caps and will therefore always
+ * fail.
+ */
+ if (opts.uid)
+ return 0;
+
ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK);
if (!ret)
return 0;
@@ -1039,10 +1048,14 @@ static int check_tcp(void)
}
val = 1;
- ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val));
- if (ret < 0) {
- pr_perror("Can't turn TCP repair mode ON");
- goto out;
+ if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) {
+ ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val));
+ if (ret < 0) {
+ pr_perror("Can't turn TCP repair mode ON");
+ goto out;
+ }
+ } else {
+ pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n");
}
optlen = sizeof(val);
@@ -1394,9 +1407,6 @@ int cr_check(void)
struct ns_id *ns;
int ret = 0;
- if (!is_root_user())
- return -1;
-
root_item = alloc_pstree_item();
if (root_item == NULL)
return -1;
@@ -1666,36 +1676,43 @@ static int pr_set_dumpable(int value)
int check_caps(void)
{
- struct proc_status_creds creds;
- int exit_code = -1;
-
- if (parse_pid_status(PROC_SELF, &creds.s, NULL))
+ /* Read out effective capabilities and store in opts.cap_eff. */
+ if (set_opts_cap_eff())
goto out;
- memcpy(&opts.cap_eff, &creds.cap_eff, sizeof(u32) * PROC_CAP_SIZE);
-
+ /*
+ * No matter if running as root or not. CRIU always needs
+ * at least these capabilities.
+ */
if (!has_cap_checkpoint_restore(opts.cap_eff))
goto out;
/* For some things we need to know if we are running as root. */
opts.uid = geteuid();
- if (opts.uid) {
- /*
- * At his point we know we are running as non-root with the necessary
- * capabilities available. Now we have to make the process dumpable
- * so that /proc/self is not owned by root.
- */
- if (pr_set_dumpable(1))
- return -1;
+ if (!opts.uid) {
+ /* CRIU is running as root. No further checks are necessary. */
+ return 0;
}
- exit_code = 0;
-out:
- if (exit_code) {
- pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n");
- pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0);
+ if (!opts.unprivileged) {
+ pr_msg("Running as non-root requires '--unprivileged'\n");
+ pr_msg("Please consult the documentation for limitations when running as non-root\n");
+ return -1;
}
- return exit_code;
+ /*
+ * At his point we know we are running as non-root with the necessary
+ * capabilities available. Now we have to make the process dumpable
+ * so that /proc/self is not owned by root.
+ */
+ if (pr_set_dumpable(1))
+ return -1;
+
+ return 0;
+out:
+ pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n");
+ pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0);
+
+ return -1;
}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index cd8705822..d7d3d8edb 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1809,6 +1809,9 @@ static int restore_task_with_children(void *_arg)
goto err;
}
+ if (set_opts_cap_eff())
+ goto err;
+
/* Wait prepare_userns */
if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0)
goto err;
diff --git a/criu/cr-service.c b/criu/cr-service.c
index 1d9f0aca3..73c48f5a6 100644
--- a/criu/cr-service.c
+++ b/criu/cr-service.c
@@ -14,6 +14,7 @@
#include <sys/stat.h>
#include <arpa/inet.h>
#include <sched.h>
+#include <sys/prctl.h>
#include "version.h"
#include "crtools.h"
@@ -409,6 +410,12 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file);
}
+ if (req->has_unprivileged)
+ opts.unprivileged = req->unprivileged;
+
+ if (check_caps())
+ return 1;
+
if (kerndat_init())
return 1;
diff --git a/criu/crtools.c b/criu/crtools.c
index 8bcbe8e38..ac05bc821 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -185,6 +185,9 @@ int main(int argc, char *argv[], char *envp[])
return cr_service_work(atoi(argv[optind + 1]));
}
+ if (check_caps())
+ return 1;
+
if (opts.imgs_dir == NULL)
SET_CHAR_OPTS(imgs_dir, ".");
@@ -414,6 +417,8 @@ usage:
" --network-lock METHOD\n"
" network locking/unlocking method; argument\n"
" can be 'nftables' or 'iptables' (default).\n"
+ " --unprivileged accept limitations when running as non-root\n"
+ " consult documentation for further details\n"
"\n"
"* External resources support:\n"
" --external RES dump objects from this list as external resources:\n"
diff --git a/criu/fdstore.c b/criu/fdstore.c
index 6a7f73a59..03afa9f17 100644
--- a/criu/fdstore.c
+++ b/criu/fdstore.c
@@ -13,6 +13,8 @@
#include "rst-malloc.h"
#include "log.h"
#include "util.h"
+#include "cr_options.h"
+#include "util-caps.h"
/* clang-format off */
static struct fdstore_desc {
@@ -27,6 +29,8 @@ int fdstore_init(void)
uint32_t buf[2] = { INT_MAX / 2, INT_MAX / 2 };
struct sockaddr_un addr;
unsigned int addrlen;
+ int rcv_opt_name;
+ int snd_opt_name;
struct stat st;
int sk, ret;
@@ -49,8 +53,16 @@ int fdstore_init(void)
return -1;
}
- if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 ||
- setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) {
+ if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) {
+ rcv_opt_name = SO_RCVBUFFORCE;
+ snd_opt_name = SO_SNDBUFFORCE;
+ } else {
+ rcv_opt_name = SO_RCVBUF;
+ snd_opt_name = SO_SNDBUF;
+ }
+
+ if (setsockopt(sk, SOL_SOCKET, snd_opt_name, &buf[0], sizeof(buf[0])) < 0 ||
+ setsockopt(sk, SOL_SOCKET, rcv_opt_name, &buf[1], sizeof(buf[1])) < 0) {
pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE");
close(sk);
return -1;
diff --git a/criu/files.c b/criu/files.c
index 8a2250e19..38dc076d2 100644
--- a/criu/files.c
+++ b/criu/files.c
@@ -21,7 +21,7 @@
#include "image.h"
#include "common/list.h"
#include "rst-malloc.h"
-#include "util-pie.h"
+#include "util-caps.h"
#include "common/lock.h"
#include "sockets.h"
#include "pstree.h"
@@ -1346,10 +1346,35 @@ static int fchroot(int fd)
return chroot(".");
}
+static int need_chroot(int saved_root)
+{
+ struct stat saved_root_stat, cur_root_stat;
+ int psd;
+
+ if (fstat(saved_root, &saved_root_stat) == -1) {
+ pr_perror("Failed to stat saved root dir");
+ return -1;
+ }
+
+ psd = open_pid_proc(PROC_SELF);
+ if (psd < 0) {
+ pr_perror("Failed to open PROC_SELF");
+ return -1;
+ }
+
+ if (fstatat(psd, "root", &cur_root_stat, 0) == -1) {
+ pr_perror("Failed to stat current root dir");
+ return -1;
+ }
+
+ return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev;
+}
+
int restore_fs(struct pstree_item *me)
{
int dd_root = -1, dd_cwd = -1, ret, err = -1;
struct rst_info *ri = rsti(me);
+ bool do_chroot = true;
/*
* First -- open both descriptors. We will not
@@ -1369,14 +1394,23 @@ int restore_fs(struct pstree_item *me)
}
/*
+ * In unprivileged mode chroot() may fail if we don't have
+ * sufficient privileges, therefore only do it if the process
+ * is actually chrooted.
+ */
+ if (opts.unprivileged)
+ do_chroot = need_chroot(dd_root);
+
+ /*
* Now do chroot/chdir. Chroot goes first as it calls chdir into
* dd_root so we'd need to fix chdir after it anyway.
*/
-
- ret = fchroot(dd_root);
- if (ret < 0) {
- pr_perror("Can't change root");
- goto out;
+ if (do_chroot) {
+ ret = fchroot(dd_root);
+ if (ret < 0) {
+ pr_perror("Can't change root");
+ goto out;
+ }
}
ret = fchdir(dd_cwd);
diff --git a/criu/image.c b/criu/image.c
index 353de48e8..3c2127ac6 100644
--- a/criu/image.c
+++ b/criu/image.c
@@ -226,7 +226,8 @@ int prepare_inventory(InventoryEntry *he)
if (get_task_ids(&crt.i))
return -1;
- he->has_root_cg_set = true;
+ if (!opts.unprivileged)
+ he->has_root_cg_set = true;
if (dump_task_cgroup(NULL, &he->root_cg_set, NULL))
return -1;
diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h
index 6e85dff0a..eacaa03a6 100644
--- a/criu/include/cr_options.h
+++ b/criu/include/cr_options.h
@@ -2,6 +2,7 @@
#define __CR_OPTIONS_H__
#include <stdbool.h>
+#include <sys/capability.h>
#include "common/config.h"
#include "common/list.h"
#include "int.h"
@@ -223,8 +224,14 @@ struct cr_options {
* CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN
*/
uid_t uid;
- /* This contains the value from /proc/pid/status: CapEff */
- u32 cap_eff[CR_CAP_SIZE];
+ /* This contains the value from capget()->effective */
+ u32 cap_eff[_LINUX_CAPABILITY_U32S_3];
+ /*
+ * If CRIU should be running as non-root with the help of
+ * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should
+ * explicitly request it as it comes with many limitations.
+ */
+ int unprivileged;
};
extern struct cr_options opts;
diff --git a/criu/include/util.h b/criu/include/util.h
index 4e29c079e..3a0403113 100644
--- a/criu/include/util.h
+++ b/criu/include/util.h
@@ -386,6 +386,8 @@ extern int mount_detached_fs(const char *fsname);
extern char *get_legacy_iptables_bin(bool ipv6);
+extern int set_opts_cap_eff(void);
+
extern ssize_t read_all(int fd, void *buf, size_t size);
extern ssize_t write_all(int fd, const void *buf, size_t size);
diff --git a/criu/namespaces.c b/criu/namespaces.c
index 7356fe8c2..286073ff6 100644
--- a/criu/namespaces.c
+++ b/criu/namespaces.c
@@ -28,6 +28,7 @@
#include "cgroup.h"
#include "fdstore.h"
#include "kerndat.h"
+#include "util-caps.h"
#include "protobuf.h"
#include "util.h"
@@ -1623,10 +1624,12 @@ int collect_namespaces(bool for_dump)
int prepare_userns_creds(void)
{
- /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */
- if (setuid(0) || setgid(0) || setgroups(0, NULL)) {
- pr_perror("Unable to initialize id-s");
- return -1;
+ if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) {
+ /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */
+ if (setuid(0) || setgid(0) || setgroups(0, NULL)) {
+ pr_perror("Unable to initialize id-s");
+ return -1;
+ }
}
/*
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index f80b68359..0e98cb3da 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -184,7 +184,7 @@ static int lsm_set_label(char *label, char *type, int procfd)
return 0;
}
-static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type)
+static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type, uid_t uid)
{
CredsEntry *ce = &args->creds;
int b, i, ret;
@@ -211,10 +211,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ
* lose caps bits when changing xids.
*/
- ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
- if (ret) {
- pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret);
- return -1;
+ if (!uid) {
+ ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
+ if (ret) {
+ pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret);
+ return -1;
+ }
}
/*
@@ -252,10 +254,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ
* special state any longer.
*/
- ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
- if (ret) {
- pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret);
- return -1;
+ if (!uid) {
+ ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
+ if (ret) {
+ pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret);
+ return -1;
+ }
}
/*
@@ -634,7 +638,7 @@ long __export_restore_thread(struct thread_restore_args *args)
if (restore_seccomp(args))
BUG();
- ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type);
+ ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type, args->ta->uid);
ret = ret || restore_dumpable_flag(&args->ta->mm);
ret = ret || restore_pdeath_sig(args);
if (ret)
@@ -1915,7 +1919,7 @@ long __export_restore_task(struct task_restore_args *args)
* turning off TCP repair is CAP_SYS_NED_ADMIN protected,
* thus restore* creds _after_ all of the above.
*/
- ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type);
+ ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type, args->uid);
ret = ret || restore_dumpable_flag(&args->mm);
ret = ret || restore_pdeath_sig(args->t);
ret = ret || restore_child_subreaper(args->child_subreaper);
diff --git a/criu/timens.c b/criu/timens.c
index 5803fc359..66c0c02a4 100644
--- a/criu/timens.c
+++ b/criu/timens.c
@@ -5,6 +5,7 @@
#include "proc_parse.h"
#include "namespaces.h"
#include "timens.h"
+#include "cr_options.h"
#include "protobuf.h"
#include "images/timens.pb-c.h"
@@ -57,6 +58,9 @@ int prepare_timens(int id)
struct timespec ts;
struct timespec prev_moff = {}, prev_boff = {};
+ if (opts.unprivileged)
+ return 0;
+
img = open_image(CR_FD_TIMENS, O_RSTR, id);
if (!img)
return -1;
diff --git a/criu/util.c b/criu/util.c
index 060ca3bd4..b3b2b6659 100644
--- a/criu/util.c
+++ b/criu/util.c
@@ -41,6 +41,7 @@
#include "namespaces.h"
#include "criu-log.h"
#include "syscall.h"
+#include "util-caps.h"
#include "clone-noasan.h"
#include "cr_options.h"
@@ -1426,6 +1427,9 @@ void rlimit_unlimit_nofile(void)
{
struct rlimit new;
+ if (opts.unprivileged && !has_cap_sys_resource(opts.cap_eff))
+ return;
+
new.rlim_cur = kdat.sysctl_nr_open;
new.rlim_max = kdat.sysctl_nr_open;
@@ -2064,3 +2068,21 @@ out:
xfree(free_path);
return mp_path;
}
+
+int set_opts_cap_eff(void)
+{
+ struct __user_cap_header_struct cap_header;
+ struct __user_cap_data_struct cap_data[_LINUX_CAPABILITY_U32S_3];
+ int i;
+
+ cap_header.version = _LINUX_CAPABILITY_VERSION_3;
+ cap_header.pid = getpid();
+
+ if (capget(&cap_header, &cap_data[0]))
+ return -1;
+
+ for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++)
+ memcpy(&opts.cap_eff[i], &cap_data[i].effective, sizeof(u32));
+
+ return 0;
+}
diff --git a/images/rpc.proto b/images/rpc.proto
index 3cf431639..afd2c7b43 100644
--- a/images/rpc.proto
+++ b/images/rpc.proto
@@ -139,6 +139,7 @@ message criu_opts {
optional criu_network_lock_method network_lock = 64 [default = IPTABLES];
optional bool mntns_compat_mode = 65;
optional bool skip_file_rwx_check = 66;
+ optional bool unprivileged = 67;
/* optional bool check_mounts = 128; */
}
diff --git a/lib/c/criu.c b/lib/c/criu.c
index 8171f7a12..fc8159999 100644
--- a/lib/c/criu.c
+++ b/lib/c/criu.c
@@ -566,6 +566,17 @@ void criu_set_skip_file_rwx_check(bool skip_file_rwx_check)
criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check);
}
+void criu_local_set_unprivileged(criu_opts *opts, bool unprivileged)
+{
+ opts->rpc->has_unprivileged = true;
+ opts->rpc->unprivileged = unprivileged;
+}
+
+void criu_set_unprivileged(bool unprivileged)
+{
+ criu_local_set_unprivileged(global_opts, unprivileged);
+}
+
void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master)
{
opts->rpc->has_orphan_pts_master = true;
diff --git a/lib/c/criu.h b/lib/c/criu.h
index c32a8a646..28a083d88 100644
--- a/lib/c/criu.h
+++ b/lib/c/criu.h
@@ -79,6 +79,7 @@ void criu_set_weak_sysctls(bool val);
void criu_set_evasive_devices(bool evasive_devices);
void criu_set_shell_job(bool shell_job);
void criu_set_skip_file_rwx_check(bool skip_file_rwx_check);
+void criu_set_unprivileged(bool unprivileged);
void criu_set_orphan_pts_master(bool orphan_pts_master);
void criu_set_file_locks(bool file_locks);
void criu_set_track_mem(bool track_mem);