From 3b5f5c7d485964500215ba74b874195b60df85fd Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 12 Aug 2022 11:56:53 -0700 Subject: non-root: enable non-root checkpoint/restore This commit enables checkpointing and restoring of applications as non-root. First goal was to enable checkpoint and restore of the env00 and pthread00 test case. This uses the information from opts.unprivileged and opts.cap_eff to skip certain code paths which do not work as non-root. Co-authored-by: Adrian Reber Signed-off-by: Younes Manton --- criu/cgroup.c | 6 ++++ criu/config.c | 1 + criu/cr-check.c | 71 +++++++++++++++++++++++++++++------------------ criu/cr-restore.c | 3 ++ criu/cr-service.c | 7 +++++ criu/crtools.c | 5 ++++ criu/fdstore.c | 16 +++++++++-- criu/files.c | 46 ++++++++++++++++++++++++++---- criu/image.c | 3 +- criu/include/cr_options.h | 11 ++++++-- criu/include/util.h | 2 ++ criu/namespaces.c | 11 +++++--- criu/pie/restorer.c | 26 +++++++++-------- criu/timens.c | 4 +++ criu/util.c | 22 +++++++++++++++ images/rpc.proto | 1 + lib/c/criu.c | 11 ++++++++ lib/c/criu.h | 1 + 18 files changed, 194 insertions(+), 53 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 325df6a1d..d886ce9f2 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -734,6 +734,9 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ unsigned int n_ctls = 0; struct cg_set *cs; + if (opts.unprivileged) + return 0; + if (item) pid = item->pid->real; else @@ -989,6 +992,9 @@ int dump_cgroups(void) CgroupEntry cg = CGROUP_ENTRY__INIT; int ret = -1; + if (opts.unprivileged) + return 0; + BUG_ON(!criu_cgset || !root_cgset); /* diff --git a/criu/config.c b/criu/config.c index c078848ec..9ba79c8ef 100644 --- a/criu/config.c +++ b/criu/config.c @@ -700,6 +700,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", 
&opts.mntns_compat_mode), + BOOL_OPT("unprivileged", &opts.unprivileged), {}, }; diff --git a/criu/cr-check.c b/criu/cr-check.c index b90e6a9bf..b54c79387 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "../soccr/soccr.h" @@ -515,6 +516,14 @@ static int check_ipc(void) { int ret; + /* + * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however + * for non-root users access() runs with an empty set of caps and will therefore always + * fail. + */ + if (opts.uid) + return 0; + ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; @@ -1039,10 +1048,14 @@ static int check_tcp(void) } val = 1; - ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); - if (ret < 0) { - pr_perror("Can't turn TCP repair mode ON"); - goto out; + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); + if (ret < 0) { + pr_perror("Can't turn TCP repair mode ON"); + goto out; + } + } else { + pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n"); } optlen = sizeof(val); @@ -1394,9 +1407,6 @@ int cr_check(void) struct ns_id *ns; int ret = 0; - if (!is_root_user()) - return -1; - root_item = alloc_pstree_item(); if (root_item == NULL) return -1; @@ -1666,36 +1676,43 @@ static int pr_set_dumpable(int value) int check_caps(void) { - struct proc_status_creds creds; - int exit_code = -1; - - if (parse_pid_status(PROC_SELF, &creds.s, NULL)) + /* Read out effective capabilities and store in opts.cap_eff. */ + if (set_opts_cap_eff()) goto out; - memcpy(&opts.cap_eff, &creds.cap_eff, sizeof(u32) * PROC_CAP_SIZE); - + /* + * No matter if running as root or not. CRIU always needs + * at least these capabilities. + */ if (!has_cap_checkpoint_restore(opts.cap_eff)) goto out; /* For some things we need to know if we are running as root. 
*/ opts.uid = geteuid(); - if (opts.uid) { - /* - * At his point we know we are running as non-root with the necessary - * capabilities available. Now we have to make the process dumpable - * so that /proc/self is not owned by root. - */ - if (pr_set_dumpable(1)) - return -1; + if (!opts.uid) { + /* CRIU is running as root. No further checks are necessary. */ + return 0; } - exit_code = 0; -out: - if (exit_code) { - pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); - pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + if (!opts.unprivileged) { + pr_msg("Running as non-root requires '--unprivileged'\n"); + pr_msg("Please consult the documentation for limitations when running as non-root\n"); + return -1; } - return exit_code; + /* + * At this point we know we are running as non-root with the necessary + * capabilities available. Now we have to make the process dumpable + * so that /proc/self is not owned by root. + */ + if (pr_set_dumpable(1)) + return -1; + + return 0; +out: + pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); + pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + + return -1; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index cd8705822..d7d3d8edb 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1809,6 +1809,9 @@ static int restore_task_with_children(void *_arg) goto err; } + if (set_opts_cap_eff()) + goto err; + /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; diff --git a/criu/cr-service.c b/criu/cr-service.c index 1d9f0aca3..73c48f5a6 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "version.h" #include "crtools.h" @@ -409,6 +410,12 @@ static int setup_opts_from_req(int sk, CriuOpts *req) pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); } + if
(req->has_unprivileged) + opts.unprivileged = req->unprivileged; + + if (check_caps()) + return 1; + if (kerndat_init()) return 1; diff --git a/criu/crtools.c b/criu/crtools.c index 8bcbe8e38..ac05bc821 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -185,6 +185,9 @@ int main(int argc, char *argv[], char *envp[]) return cr_service_work(atoi(argv[optind + 1])); } + if (check_caps()) + return 1; + if (opts.imgs_dir == NULL) SET_CHAR_OPTS(imgs_dir, "."); @@ -414,6 +417,8 @@ usage: " --network-lock METHOD\n" " network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" + " --unprivileged accept limitations when running as non-root\n" + " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/fdstore.c b/criu/fdstore.c index 6a7f73a59..03afa9f17 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -13,6 +13,8 @@ #include "rst-malloc.h" #include "log.h" #include "util.h" +#include "cr_options.h" +#include "util-caps.h" /* clang-format off */ static struct fdstore_desc { @@ -27,6 +29,8 @@ int fdstore_init(void) uint32_t buf[2] = { INT_MAX / 2, INT_MAX / 2 }; struct sockaddr_un addr; unsigned int addrlen; + int rcv_opt_name; + int snd_opt_name; struct stat st; int sk, ret; @@ -49,8 +53,16 @@ int fdstore_init(void) return -1; } - if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + rcv_opt_name = SO_RCVBUFFORCE; + snd_opt_name = SO_SNDBUFFORCE; + } else { + rcv_opt_name = SO_RCVBUF; + snd_opt_name = SO_SNDBUF; + } + + if (setsockopt(sk, SOL_SOCKET, snd_opt_name, &buf[0], sizeof(buf[0])) < 0 || + setsockopt(sk, SOL_SOCKET, rcv_opt_name, &buf[1], sizeof(buf[1])) < 0) { pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); close(sk); return -1; diff --git 
a/criu/files.c b/criu/files.c index 8a2250e19..38dc076d2 100644 --- a/criu/files.c +++ b/criu/files.c @@ -21,7 +21,7 @@ #include "image.h" #include "common/list.h" #include "rst-malloc.h" -#include "util-pie.h" +#include "util-caps.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" @@ -1346,10 +1346,35 @@ static int fchroot(int fd) return chroot("."); } +static int need_chroot(int saved_root) +{ + struct stat saved_root_stat, cur_root_stat; + int psd; + + if (fstat(saved_root, &saved_root_stat) == -1) { + pr_perror("Failed to stat saved root dir"); + return -1; + } + + psd = open_pid_proc(PROC_SELF); + if (psd < 0) { + pr_perror("Failed to open PROC_SELF"); + return -1; + } + + if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { + pr_perror("Failed to stat current root dir"); + return -1; + } + + return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; +} + int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); + bool do_chroot = true; /* * First -- open both descriptors. We will not @@ -1368,15 +1393,24 @@ int restore_fs(struct pstree_item *me) goto out; } + /* + * In unprivileged mode chroot() may fail if we don't have + * sufficient privileges, therefore only do it if the process + * is actually chrooted. + */ + if (opts.unprivileged) + do_chroot = need_chroot(dd_root); + /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. 
*/ - - ret = fchroot(dd_root); - if (ret < 0) { - pr_perror("Can't change root"); - goto out; + if (do_chroot) { + ret = fchroot(dd_root); + if (ret < 0) { + pr_perror("Can't change root"); + goto out; + } } ret = fchdir(dd_cwd); diff --git a/criu/image.c b/criu/image.c index 353de48e8..3c2127ac6 100644 --- a/criu/image.c +++ b/criu/image.c @@ -226,7 +226,8 @@ int prepare_inventory(InventoryEntry *he) if (get_task_ids(&crt.i)) return -1; - he->has_root_cg_set = true; + if (!opts.unprivileged) + he->has_root_cg_set = true; if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) return -1; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 6e85dff0a..eacaa03a6 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -2,6 +2,7 @@ #define __CR_OPTIONS_H__ #include +#include #include "common/config.h" #include "common/list.h" #include "int.h" @@ -223,8 +224,14 @@ struct cr_options { * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN */ uid_t uid; - /* This contains the value from /proc/pid/status: CapEff */ - u32 cap_eff[CR_CAP_SIZE]; + /* This contains the value from capget()->effective */ + u32 cap_eff[_LINUX_CAPABILITY_U32S_3]; + /* + * If CRIU should be running as non-root with the help of + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should + * explicitly request it as it comes with many limitations. 
+ */ + int unprivileged; }; extern struct cr_options opts; diff --git a/criu/include/util.h b/criu/include/util.h index 4e29c079e..3a0403113 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -386,6 +386,8 @@ extern int mount_detached_fs(const char *fsname); extern char *get_legacy_iptables_bin(bool ipv6); +extern int set_opts_cap_eff(void); + extern ssize_t read_all(int fd, void *buf, size_t size); extern ssize_t write_all(int fd, const void *buf, size_t size); diff --git a/criu/namespaces.c b/criu/namespaces.c index 7356fe8c2..286073ff6 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -28,6 +28,7 @@ #include "cgroup.h" #include "fdstore.h" #include "kerndat.h" +#include "util-caps.h" #include "protobuf.h" #include "util.h" @@ -1623,10 +1624,12 @@ int collect_namespaces(bool for_dump) int prepare_userns_creds(void) { - /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ - if (setuid(0) || setgid(0) || setgroups(0, NULL)) { - pr_perror("Unable to initialize id-s"); - return -1; + if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) { + /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + pr_perror("Unable to initialize id-s"); + return -1; + } } /* diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index f80b68359..0e98cb3da 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -184,7 +184,7 @@ static int lsm_set_label(char *label, char *type, int procfd) return 0; } -static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type) +static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type, uid_t uid) { CredsEntry *ce = &args->creds; int b, i, ret; @@ -211,10 +211,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * lose caps bits when changing xids. 
*/ - ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); - if (ret) { - pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); + if (ret) { + pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); + return -1; + } } /* @@ -252,10 +254,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * special state any longer. */ - ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); - if (ret) { - pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); + return -1; + } } /* @@ -634,7 +638,7 @@ long __export_restore_thread(struct thread_restore_args *args) if (restore_seccomp(args)) BUG(); - ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type); + ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type, args->ta->uid); ret = ret || restore_dumpable_flag(&args->ta->mm); ret = ret || restore_pdeath_sig(args); if (ret) @@ -1915,7 +1919,7 @@ long __export_restore_task(struct task_restore_args *args) * turning off TCP repair is CAP_SYS_NED_ADMIN protected, * thus restore* creds _after_ all of the above. 
*/ - ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type); + ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type, args->uid); ret = ret || restore_dumpable_flag(&args->mm); ret = ret || restore_pdeath_sig(args->t); ret = ret || restore_child_subreaper(args->child_subreaper); diff --git a/criu/timens.c b/criu/timens.c index 5803fc359..66c0c02a4 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -5,6 +5,7 @@ #include "proc_parse.h" #include "namespaces.h" #include "timens.h" +#include "cr_options.h" #include "protobuf.h" #include "images/timens.pb-c.h" @@ -57,6 +58,9 @@ int prepare_timens(int id) struct timespec ts; struct timespec prev_moff = {}, prev_boff = {}; + if (opts.unprivileged) + return 0; + img = open_image(CR_FD_TIMENS, O_RSTR, id); if (!img) return -1; diff --git a/criu/util.c b/criu/util.c index 060ca3bd4..b3b2b6659 100644 --- a/criu/util.c +++ b/criu/util.c @@ -41,6 +41,7 @@ #include "namespaces.h" #include "criu-log.h" #include "syscall.h" +#include "util-caps.h" #include "clone-noasan.h" #include "cr_options.h" @@ -1426,6 +1427,9 @@ void rlimit_unlimit_nofile(void) { struct rlimit new; + if (opts.unprivileged && !has_cap_sys_resource(opts.cap_eff)) + return; + new.rlim_cur = kdat.sysctl_nr_open; new.rlim_max = kdat.sysctl_nr_open; @@ -2064,3 +2068,21 @@ out: xfree(free_path); return mp_path; } + +int set_opts_cap_eff(void) +{ + struct __user_cap_header_struct cap_header; + struct __user_cap_data_struct cap_data[_LINUX_CAPABILITY_U32S_3]; + int i; + + cap_header.version = _LINUX_CAPABILITY_VERSION_3; + cap_header.pid = getpid(); + + if (capget(&cap_header, &cap_data[0])) + return -1; + + for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++) + memcpy(&opts.cap_eff[i], &cap_data[i].effective, sizeof(u32)); + + return 0; +} diff --git a/images/rpc.proto b/images/rpc.proto index 3cf431639..afd2c7b43 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -139,6 +139,7 @@ message criu_opts { optional criu_network_lock_method 
network_lock = 64 [default = IPTABLES]; optional bool mntns_compat_mode = 65; optional bool skip_file_rwx_check = 66; + optional bool unprivileged = 67; /* optional bool check_mounts = 128; */ } diff --git a/lib/c/criu.c b/lib/c/criu.c index 8171f7a12..fc8159999 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -566,6 +566,17 @@ void criu_set_skip_file_rwx_check(bool skip_file_rwx_check) criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check); } +void criu_local_set_unprivileged(criu_opts *opts, bool unprivileged) +{ + opts->rpc->has_unprivileged = true; + opts->rpc->unprivileged = unprivileged; +} + +void criu_set_unprivileged(bool unprivileged) +{ + criu_local_set_unprivileged(global_opts, unprivileged); +} + void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master) { opts->rpc->has_orphan_pts_master = true; diff --git a/lib/c/criu.h b/lib/c/criu.h index c32a8a646..28a083d88 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -79,6 +79,7 @@ void criu_set_weak_sysctls(bool val); void criu_set_evasive_devices(bool evasive_devices); void criu_set_shell_job(bool shell_job); void criu_set_skip_file_rwx_check(bool skip_file_rwx_check); +void criu_set_unprivileged(bool unprivileged); void criu_set_orphan_pts_master(bool orphan_pts_master); void criu_set_file_locks(bool file_locks); void criu_set_track_mem(bool track_mem); -- cgit v1.2.3