#include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include "files.h" #include "file-ids.h" #include "files-reg.h" #include "file-lock.h" #include "image.h" #include "common/list.h" #include "rst-malloc.h" #include "util-caps.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" #include "tty.h" #include "pipes.h" #include "fifo.h" #include "eventfd.h" #include "eventpoll.h" #include "fsnotify.h" #include "sk-packet.h" #include "mount.h" #include "signalfd.h" #include "memfd.h" #include "namespaces.h" #include "tun.h" #include "timerfd.h" #include "imgset.h" #include "fs-magic.h" #include "fdinfo.h" #include "cr_options.h" #include "autofs.h" #include "parasite.h" #include "parasite-syscall.h" #include "string.h" #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" #include "protobuf.h" #include "util.h" #include "images/fs.pb-c.h" #include "images/ext-file.pb-c.h" #include "plugin.h" #define FDESC_HASH_SIZE 64 static struct hlist_head file_desc_hash[FDESC_HASH_SIZE]; /* file_desc's, which fle is not owned by a process, that is able to open them */ static LIST_HEAD(fake_master_head); static u32 max_file_desc_id = 0; static void init_fdesc_hash(void) { int i; for (i = 0; i < FDESC_HASH_SIZE; i++) INIT_HLIST_HEAD(&file_desc_hash[i]); } void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops) { INIT_LIST_HEAD(&d->fd_info_head); INIT_LIST_HEAD(&d->fake_master_list); INIT_HLIST_NODE(&d->hash); d->id = id; d->ops = ops; } int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops) { file_desc_init(d, id, ops); hlist_add_head(&d->hash, &file_desc_hash[id % FDESC_HASH_SIZE]); if (id > max_file_desc_id) max_file_desc_id = id; return 0; /* this is to make tail-calls in collect_one_foo look nice */ } struct file_desc *find_file_desc_raw(int type, u32 id) { struct file_desc *d; struct hlist_head *chain; chain = &file_desc_hash[id % FDESC_HASH_SIZE]; hlist_for_each_entry(d, chain, hash) if ((d->id == id) && (d->ops->type == type || type == FD_TYPES__UND)) /* * Warning -- old CRIU might generate matching IDs * for different file types! So any code that uses * FD_TYPES__UND for fdesc search MUST make sure it's * dealing with the merged files images where all * descs are forced to have different IDs. */ return d; return NULL; } static inline struct file_desc *find_file_desc(FdinfoEntry *fe) { return find_file_desc_raw(fe->type, fe->id); } u32 find_unused_file_desc_id(void) { return max_file_desc_id + 1; } struct fdinfo_list_entry *find_used_fd(struct pstree_item *task, int fd) { struct list_head *head; struct fdinfo_list_entry *fle; head = &rsti(task)->fds; list_for_each_entry_reverse(fle, head, ps_list) { if (fle->fe->fd == fd) return fle; /* List is ordered, so let's stop */ if (fle->fe->fd < fd) break; } return NULL; } static void collect_task_fd(struct fdinfo_list_entry *new_fle, struct rst_info *ri) { struct fdinfo_list_entry *fle; /* * fles in fds list are ordered by fd. Fds are restored from img files * in ascending order, so it is faster to insert them from the end of * the list. */ list_for_each_entry_reverse(fle, &ri->fds, ps_list) { if (fle->fe->fd < new_fle->fe->fd) break; } list_add(&new_fle->ps_list, &fle->ps_list); } unsigned int find_unused_fd(struct pstree_item *task, int hint_fd) { struct list_head *head; struct fdinfo_list_entry *fle; int fd = 0, prev_fd; if ((hint_fd >= 0) && (!find_used_fd(task, hint_fd))) { fd = hint_fd; goto out; } prev_fd = service_fd_min_fd(task) - 1; head = &rsti(task)->fds; list_for_each_entry_reverse(fle, head, ps_list) { fd = fle->fe->fd; if (prev_fd > fd) { fd++; goto out; } prev_fd = fd - 1; } BUG(); out: return fd; } int find_unused_fd_pid(pid_t pid) { struct pstree_item *task; task = pstree_item_by_virt(pid); if (!task) { pr_err("Invalid pid:%d\n", pid); return -1; } return find_unused_fd(task, -1); } int set_fds_event(pid_t virt) { struct pstree_item *item; bool is_set; item = pstree_item_by_virt(virt); BUG_ON(!item); is_set = !!test_and_set_bit_le(FDS_EVENT_BIT, &item->task_st_le_bits); if (!is_set) futex_wake(&item->task_st); return 0; } void clear_fds_event(void) { clear_bit_le(FDS_EVENT_BIT, ¤t->task_st_le_bits); } void wait_fds_event(void) { futex_t *f = ¤t->task_st; int value; value = htole32(FDS_EVENT); futex_wait_if_cond(f, value, &); clear_fds_event(); } struct fdinfo_list_entry *try_file_master(struct file_desc *d) { if (list_empty(&d->fd_info_head)) return NULL; return list_first_entry(&d->fd_info_head, struct fdinfo_list_entry, desc_list); } struct fdinfo_list_entry *file_master(struct file_desc *d) { struct fdinfo_list_entry *fle; fle = try_file_master(d); if (!fle) { pr_err("Empty list on file desc id %#x(%d)\n", d->id, d->ops ? d->ops->type : -1); BUG(); } return fle; } void show_saved_files(void) { int i; struct file_desc *fd; pr_info("File descs:\n"); for (i = 0; i < FDESC_HASH_SIZE; i++) hlist_for_each_entry(fd, &file_desc_hash[i], hash) { struct fdinfo_list_entry *le; pr_info(" `- type %d ID %#x\n", fd->ops->type, fd->id); list_for_each_entry(le, &fd->fd_info_head, desc_list) pr_info(" `- FD %d pid %d\n", le->fe->fd, le->pid); } } /* * Workaround for the OverlayFS bug present before Kernel 4.2 * * This is here only to support the Linux Kernel between versions * 3.18 and 4.2. After that, this workaround is not needed anymore, * but it will work properly on both a kernel with and without the bug. * * When a process has a file open in an OverlayFS directory, * the information in /proc//fd/ and /proc//fdinfo/ * is wrong. We can't even rely on stat()-ing /proc//fd/ since * this will show us the wrong filesystem type. * * So we grab that information from the mountinfo table instead. This is done * every time fill_fdlink is called. See lookup_overlayfs for more details. * */ static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link) { struct mount_info *m; if (!link) return 0; m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino, p->mnt_id); if (IS_ERR(m)) return -1; if (!m) return 0; p->mnt_id = m->mnt_id; /* * If the bug is present, the file path from /proc//fd * does not include the mountpoint, so we prepend it ourselves. */ if (strcmp("./", m->ns_mountpoint) != 0) { char buf[PATH_MAX]; int n; strlcpy(buf, link->name, PATH_MAX); n = snprintf(link->name, PATH_MAX, "%s/%s", m->ns_mountpoint, buf + 2); if (n >= PATH_MAX) { pr_err("Not enough space to replace %s\n", buf); return -1; } } return 0; } /* * The gen_id thing is used to optimize the comparison of shared files. * If two files have different gen_ids, then they are different for sure. * If it matches, we don't know it and have to call sys_kcmp(). * * The kcmp-ids.c engine does this trick, see comments in it for more info. */ uint32_t make_gen_id(uint32_t st_dev, uint32_t st_ino, uint64_t pos) { uint32_t pos_hi = pos >> 32; uint32_t pos_low = pos & 0xffffffff; return st_dev ^ st_ino ^ pos_hi ^ pos_low; } int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, FdinfoEntry *e) { int ret = -1; e->type = ops->type; e->id = make_gen_id((uint32_t)p->stat.st_dev, (uint32_t)p->stat.st_ino, (uint64_t)p->pos); e->fd = p->fd; e->flags = p->fd_flags; ret = fd_id_generate(p->pid, e, p); if (ret == 1) /* new ID generated */ ret = ops->dump(lfd, e->id, p); else /* Remove locks generated by the fd before going to the next */ discard_dup_locks_tail(p->pid, e->fd); return ret; } int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link) { int len; link->name[0] = '.'; len = read_fd_link(lfd, &link->name[1], sizeof(link->name) - 1); if (len < 0) { pr_err("Can't read link for pid %d fd %d\n", p->pid, p->fd); return -1; } link->len = len + 1; if (opts.overlayfs) if (fixup_overlayfs((struct fd_parms *)p, link) < 0) return -1; return 0; } static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, struct fd_opts *opts, struct fd_parms *p) { int ret; struct statfs fsbuf; struct fdinfo_common fdinfo = { .mnt_id = -1, .owner = owner_pid->ns[0].virt }; if (fstat(lfd, &p->stat) < 0) { pr_perror("Can't stat fd %d", lfd); return -1; } if (fstatfs(lfd, &fsbuf) < 0) { pr_perror("Can't statfs fd %d", lfd); return -1; } if (parse_fdinfo_pid(owner_pid->real, fd, FD_TYPES__UND, &fdinfo)) return -1; p->fs_type = fsbuf.f_type; p->fd = fd; p->pos = fdinfo.pos; /* * The kernel artificially adds the O_CLOEXEC flag on the file pointer * flags by looking at the flags on the file descriptor (see kernel * code fs/proc/fd.c). FD_CLOEXEC is a file descriptor property, which * is saved in fd_flags. */ p->flags = fdinfo.flags & ~O_CLOEXEC; p->mnt_id = fdinfo.mnt_id; p->pid = owner_pid->real; p->fd_flags = opts->flags; fown_entry__init(&p->fown); pr_info("%d fdinfo %d: pos: %#16" PRIx64 " flags: %16o/%#x\n", owner_pid->real, fd, p->pos, p->flags, (int)p->fd_flags); if (p->flags & O_PATH) ret = 0; else ret = fcntl(lfd, F_GETSIG, 0); if (ret < 0) { pr_perror("Can't get owner signum on %d", lfd); return -1; } p->fown.signum = ret; if (opts->fown.pid == 0) return 0; p->fown.pid = opts->fown.pid; p->fown.pid_type = opts->fown.pid_type; p->fown.uid = opts->fown.uid; p->fown.euid = opts->fown.euid; return 0; } static const struct fdtype_ops *get_misc_dev_ops(int minor) { switch (minor) { case TUN_MINOR: return &tunfile_dump_ops; case AUTOFS_MINOR: return ®file_dump_ops; }; return NULL; } static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor) { const struct fdtype_ops *ops = NULL; /* * If /dev/kmsg is opened in write-only mode the file position * should not be set up upon restore, kernel doesn't allow that. */ if (minor == 11 && (p->flags & O_ACCMODE) == O_WRONLY && p->pos == 0) p->pos = -1ULL; ops = ®file_dump_ops; return ops; } static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) { struct fd_link *link_old = p->link; int maj = major(p->stat.st_rdev); const struct fdtype_ops *ops; struct fd_link link; int err; switch (maj) { case MEM_MAJOR: ops = get_mem_dev_ops(p, minor(p->stat.st_rdev)); break; case MISC_MAJOR: ops = get_misc_dev_ops(minor(p->stat.st_rdev)); if (ops) break; /* fallthrough */ default: { char more[32]; if (is_tty(p->stat.st_rdev, p->stat.st_dev)) { if (fill_fdlink(lfd, p, &link)) return -1; p->link = &link; ops = &tty_dump_ops; break; } sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev)); err = dump_unsupp_fd(p, lfd, "chr", more, e); p->link = link_old; return err; } } err = do_dump_gen_file(p, lfd, ops, e); p->link = link_old; return err; } static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, struct parasite_ctl *ctl, FdinfoEntry *e, struct parasite_drain_fd *dfds) { struct fd_parms p = FD_PARMS_INIT; const struct fdtype_ops *ops; struct fd_link link; if (fill_fd_params(pid, fd, lfd, opts, &p) < 0) { pr_err("Can't get stat on %d\n", fd); return -1; } if (note_file_lock(pid, fd, lfd, &p)) return -1; /* Lease can be set only on regular file */ if (S_ISREG(p.stat.st_mode)) { int ret = correct_file_leases_type(pid, fd, lfd); if (ret < 0) return ret; } p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ p.dfds = dfds; /* epoll needs to verify if target fd exist */ if (S_ISSOCK(p.stat.st_mode)) return dump_socket(&p, lfd, e); if (S_ISCHR(p.stat.st_mode)) return dump_chrdev(&p, lfd, e); if (p.fs_type == ANON_INODE_FS_MAGIC) { char link[32]; if (read_fd_link(lfd, link, sizeof(link)) < 0) return -1; if (is_eventfd_link(link)) ops = &eventfd_dump_ops; else if (is_eventpoll_link(link)) ops = &eventpoll_dump_ops; else if (is_inotify_link(link)) ops = &inotify_dump_ops; else if (is_fanotify_link(link)) ops = &fanotify_dump_ops; else if (is_signalfd_link(link)) ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; #endif else return dump_unsupp_fd(&p, lfd, "anon", link, e); return do_dump_gen_file(&p, lfd, ops, e); } if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; p.link = &link; /* TODO: Dump for hugetlb fd when memfd hugetlb is not supported */ if (is_memfd(p.stat.st_dev) || (kdat.has_memfd_hugetlb && is_hugetlb_dev(p.stat.st_dev, NULL))) ops = &memfd_dump_ops; else if (link.name[1] == '/') ops = ®file_dump_ops; else if (check_ns_proc(&link)) ops = &nsfile_dump_ops; else return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e); return do_dump_gen_file(&p, lfd, ops, e); } if (S_ISFIFO(p.stat.st_mode)) { if (p.fs_type == PIPEFS_MAGIC) ops = &pipe_dump_ops; else ops = &fifo_dump_ops; return do_dump_gen_file(&p, lfd, ops, e); } /* * For debug purpose -- at least show the link * file pointing to when reporting unsupported file. * On error simply empty string here. */ if (fill_fdlink(lfd, &p, &link)) memzero(&link, sizeof(link)); return dump_unsupp_fd(&p, lfd, "unknown", link.name + 1, e); } int dump_my_file(int lfd, u32 *id, int *type) { struct pid me = {}; struct fd_opts fdo = {}; FdinfoEntry e = FDINFO_ENTRY__INIT; me.real = getpid(); me.ns[0].virt = -1; /* FIXME */ if (dump_one_file(&me, lfd, lfd, &fdo, NULL, &e, NULL)) return -1; *id = e.id; *type = e.type; return 0; } int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds) { int *lfds = NULL; struct cr_img *img = NULL; struct fd_opts *opts = NULL; int i, ret = -1; int off, nr_fds = min((int)PARASITE_MAX_FDS, dfds->nr_fds); pr_info("\n"); pr_info("Dumping opened files (pid: %d)\n", item->pid->real); pr_info("----------------------------------------\n"); lfds = xmalloc(nr_fds * sizeof(int)); if (!lfds) goto err; opts = xmalloc(nr_fds * sizeof(struct fd_opts)); if (!opts) goto err; img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id); if (!img) goto err; ret = 0; /* Don't fail if nr_fds == 0 */ for (off = 0; ret == 0 && off < dfds->nr_fds; off += nr_fds) { if (nr_fds + off > dfds->nr_fds) nr_fds = dfds->nr_fds - off; ret = parasite_drain_fds_seized(ctl, dfds, nr_fds, off, lfds, opts); if (ret) goto err; for (i = 0; i < nr_fds; i++) { FdinfoEntry e = FDINFO_ENTRY__INIT; ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds); if (ret) break; ret = pb_write_one(img, &e, PB_FDINFO); if (ret) break; } for (i = 0; i < nr_fds; i++) close(lfds[i]); } pr_info("----------------------------------------\n"); err: if (img) close_image(img); xfree(opts); xfree(lfds); return ret; } static int predump_one_fd(int pid, int fd) { const struct fdtype_ops *ops; char link[PATH_MAX], t[32]; int ret = 0; snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd); ret = readlink(t, link, sizeof(link)); if (ret < 0) { pr_perror("Can't read link of fd %d", fd); return -1; } else if ((size_t)ret == sizeof(link)) { pr_err("Buffer for read link of fd %d is too small\n", fd); return -1; } link[ret] = 0; ret = 0; if (is_inotify_link(link)) ops = &inotify_dump_ops; else if (is_fanotify_link(link)) ops = &fanotify_dump_ops; else goto out; pr_debug("Pre-dumping %d's %d fd\n", pid, fd); ret = ops->pre_dump(pid, fd); out: return ret; } int predump_task_files(int pid) { struct dirent *de; DIR *fd_dir; int ret = -1; pr_info("Pre-dump fds for %d)\n", pid); fd_dir = opendir_proc(pid, "fd"); if (!fd_dir) return -1; while ((de = readdir(fd_dir))) { if (dir_dots(de)) continue; if (predump_one_fd(pid, atoi(de->d_name))) goto out; } ret = 0; out: closedir(fd_dir); return ret; } int restore_fown(int fd, FownEntry *fown) { struct f_owner_ex owner; uid_t uids[3]; if (fown->signum) { if (fcntl(fd, F_SETSIG, fown->signum)) { pr_perror("Can't set signal"); return -1; } } /* May be untouched */ if (!fown->pid) return 0; if (getresuid(&uids[0], &uids[1], &uids[2])) { pr_perror("Can't get current UIDs"); return -1; } if (setresuid(fown->uid, fown->euid, uids[2])) { pr_perror("Can't set UIDs"); return -1; } owner.type = fown->pid_type; owner.pid = fown->pid; if (fcntl(fd, F_SETOWN_EX, &owner)) { pr_perror("Can't setup %d file owner pid", fd); return -1; } if (setresuid(uids[0], uids[1], uids[2])) { pr_perror("Can't revert UIDs back"); return -1; } if (prctl(PR_SET_DUMPABLE, 1, 0)) pr_perror("Unable to set PR_SET_DUMPABLE"); return 0; } int rst_file_params(int fd, FownEntry *fown, int flags) { if (set_fd_flags(fd, flags) < 0) return -1; if (restore_fown(fd, fown) < 0) return -1; return 0; } static struct fdinfo_list_entry *alloc_fle(int pid, FdinfoEntry *fe) { struct fdinfo_list_entry *fle; fle = shmalloc(sizeof(*fle)); if (!fle) return NULL; fle->pid = pid; fle->fe = fe; fle->received = 0; fle->fake = 0; fle->stage = FLE_INITIALIZED; fle->task = pstree_item_by_virt(pid); if (!fle->task) { pr_err("Can't find task with pid %d\n", pid); shfree_last(fle); return NULL; } return fle; } static void __collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc) { struct fdinfo_list_entry *le; list_for_each_entry_reverse(le, &fdesc->fd_info_head, desc_list) if (pid_rst_prio_eq(le->pid, new_le->pid)) break; list_add(&new_le->desc_list, &le->desc_list); } static void collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc, bool force_master) { new_le->desc = fdesc; if (!force_master) __collect_desc_fle(new_le, fdesc); else { /* Link as first entry */ list_add(&new_le->desc_list, &fdesc->fd_info_head); } } struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, struct rst_info *rst_info, struct file_desc *fdesc, bool fake, bool force_master) { struct fdinfo_list_entry *new_le; new_le = alloc_fle(pid, e); if (new_le) { new_le->fake = (!!fake); collect_desc_fle(new_le, fdesc, force_master); collect_task_fd(new_le, rst_info); } return new_le; } int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) { struct file_desc *fdesc; pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", pid, e->fd, e->id); fdesc = find_file_desc(e); if (fdesc == NULL) { pr_err("No file for fd %d id %#x\n", e->fd, e->id); return -1; } if (!collect_fd_to(pid, e, rst_info, fdesc, fake, false)) return -1; return 0; } FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags) { FdinfoEntry *e; e = shmalloc(sizeof(*e)); if (!e) return NULL; fdinfo_entry__init(e); e->id = old->id; e->type = old->type; e->fd = fd; e->flags = flags; return e; } int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, int fd, unsigned flags) { FdinfoEntry *e; e = dup_fdinfo(ple->fe, fd, flags); if (!e) return -1; return collect_fd(vpid(task), e, rsti(task), false); } int prepare_fd_pid(struct pstree_item *item) { int ret = 0; struct cr_img *img; pid_t pid = vpid(item); struct rst_info *rst_info = rsti(item); INIT_LIST_HEAD(&rst_info->fds); if (item->ids == NULL) /* zombie */ return 0; if (rsti(item)->fdt && rsti(item)->fdt->pid != vpid(item)) return 0; img = open_image(CR_FD_FDINFO, O_RSTR, item->ids->files_id); if (!img) return -1; while (1) { FdinfoEntry *e; ret = pb_read_one_eof(img, &e, PB_FDINFO); if (ret <= 0) break; if (e->fd >= kdat.sysctl_nr_open) { ret = -1; pr_err("Too big FD number to restore %d\n", e->fd); break; } ret = collect_fd(pid, e, rst_info, false); if (ret < 0) { fdinfo_entry__free_unpacked(e, NULL); break; } } close_image(img); return ret; } #define SETFL_MASK (O_APPEND | O_ASYNC | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) int set_fd_flags(int fd, int flags) { int ret; ret = fcntl(fd, F_GETFL, 0); if (ret < 0) goto err; flags = (SETFL_MASK & flags) | (ret & ~SETFL_MASK); ret = fcntl(fd, F_SETFL, flags); if (ret < 0) goto err; /* Let's check, that now actual flags contains those we need */ ret = fcntl(fd, F_GETFL, 0); if (ret < 0) goto err; if (ret != flags) { pr_err("fcntl call on fd %d (flags %#o) succeeded, " "but some flags were dropped: %#o\n", fd, flags, ret); return -1; } return 0; err: pr_perror("fcntl call on fd %d (flags %x) failed", fd, flags); return -1; } struct fd_open_state { char *name; int (*cb)(int, struct fdinfo_list_entry *); }; static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%" PRIx64, pid, criu_run_id); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } static bool task_fle(struct pstree_item *task, struct fdinfo_list_entry *fle) { struct fdinfo_list_entry *tmp; list_for_each_entry(tmp, &rsti(task)->fds, ps_list) if (fle == tmp) return true; return false; } static int plant_fd(struct fdinfo_list_entry *fle, int fd) { BUG_ON(fle->received); fle->received = 1; return reopen_fd_as(fle->fe->fd, fd); } static int recv_fd_from_peer(struct fdinfo_list_entry *fle) { struct fdinfo_list_entry *tmp; int fd, ret, tsock; if (fle->received) return 0; tsock = get_service_fd(TRANSPORT_FD_OFF); do { ret = __recv_fds(tsock, &fd, 1, (void *)&tmp, sizeof(struct fdinfo_list_entry *), MSG_DONTWAIT); if (ret == -EAGAIN || ret == -EWOULDBLOCK) return 1; else if (ret) return -1; pr_info("Further fle=%p, pid=%d\n", tmp, fle->pid); if (!task_fle(current, tmp)) { pr_err("Unexpected fle %p, pid=%d\n", tmp, vpid(current)); return -1; } if (plant_fd(tmp, fd)) return -1; } while (tmp != fle); return 0; } static int send_fd_to_peer(int fd, struct fdinfo_list_entry *fle) { struct sockaddr_un saddr; int len, sock, ret; sock = get_service_fd(TRANSPORT_FD_OFF); transport_name_gen(&saddr, &len, fle->pid); pr_info("\t\tSend fd %d to %s\n", fd, saddr.sun_path + 1); ret = send_fds(sock, &saddr, len, &fd, 1, (void *)&fle, sizeof(struct fdinfo_list_entry *)); if (ret < 0) return -1; return set_fds_event(fle->pid); } /* * Helpers to scatter file_desc across users for those files, that * create two descriptors from a single system call at once (e.g. * ... or better i.e. -- pipes, socketpairs and ttys) */ int recv_desc_from_peer(struct file_desc *d, int *fd) { struct fdinfo_list_entry *fle; fle = file_master(d); *fd = fle->fe->fd; return recv_fd_from_peer(fle); } int send_desc_to_peer(int fd, struct file_desc *d) { return send_fd_to_peer(fd, file_master(d)); } static int send_fd_to_self(int fd, struct fdinfo_list_entry *fle) { int dfd = fle->fe->fd; if (fd == dfd) return 0; BUG_ON(dfd == get_service_fd(TRANSPORT_FD_OFF)); pr_info("\t\t\tGoing to dup %d into %d\n", fd, dfd); if (dup2(fd, dfd) != dfd) { pr_perror("Can't dup local fd %d -> %d", fd, dfd); return -1; } if (fcntl(dfd, F_SETFD, fle->fe->flags) == -1) { pr_perror("Unable to set file descriptor flags"); return -1; } fle->received = 1; return 0; } static int serve_out_fd(int pid, int fd, struct file_desc *d) { int ret; struct fdinfo_list_entry *fle; pr_info("\t\tCreate fd for %d\n", fd); list_for_each_entry(fle, &d->fd_info_head, desc_list) { if (pid == fle->pid) ret = send_fd_to_self(fd, fle); else ret = send_fd_to_peer(fd, fle); if (ret) { pr_err("Can't sent fd %d to %d\n", fd, fle->pid); goto out; } } ret = 0; out: return ret; } int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) { struct file_desc *d = fle->desc; pid_t pid = fle->pid; if (reopen_fd_as(fle->fe->fd, new_fd)) return -1; if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { pr_perror("Unable to set file descriptor flags"); return -1; } BUG_ON(fle->stage != FLE_INITIALIZED); fle->stage = FLE_OPEN; if (serve_out_fd(pid, fle->fe->fd, d)) return -1; return 0; } static int open_fd(struct fdinfo_list_entry *fle) { struct file_desc *d = fle->desc; struct fdinfo_list_entry *fle_m; int new_fd = -1, ret; fle_m = file_master(d); if (fle != fle_m) { BUG_ON(fle->stage != FLE_INITIALIZED); ret = receive_fd(fle); if (ret != 0) return ret; goto out; } /* * Open method returns the following values: * 0 -- restore is successfully finished; * 1 -- restore is in process or can't be started * yet, because of it depends on another fles, * so the method should be called once again; * -1 -- restore failed. * In case of 0 and 1 return values, new_fd may * be not negative. In this case it contains newly * opened file descriptor, which may be served out. * For every fle, new_fd is populated only once. * See setup_and_serve_out() BUG_ON for the details. */ ret = d->ops->open(d, &new_fd); if (ret != -1 && new_fd >= 0) { if (setup_and_serve_out(fle, new_fd) < 0) return -1; } out: if (ret == 0) fle->stage = FLE_RESTORED; return ret; } static int receive_fd(struct fdinfo_list_entry *fle) { int ret; pr_info("\tReceive fd for %d\n", fle->fe->fd); ret = recv_fd_from_peer(fle); if (ret != 0) { if (ret != 1) pr_err("Can't get fd=%d, pid=%d\n", fle->fe->fd, fle->pid); return ret; } if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { pr_perror("Unable to set file descriptor flags"); return -1; } return 0; } static void close_fdinfos(struct list_head *list) { struct fdinfo_list_entry *fle; list_for_each_entry(fle, list, ps_list) close(fle->fe->fd); } static int open_fdinfos(struct pstree_item *me) { struct list_head *list = &rsti(me)->fds; struct fdinfo_list_entry *fle, *tmp; LIST_HEAD(completed); LIST_HEAD(fake); bool progress, again; int st, ret = 0; do { progress = again = false; clear_fds_event(); list_for_each_entry_safe(fle, tmp, list, ps_list) { st = fle->stage; BUG_ON(st == FLE_RESTORED); ret = open_fd(fle); if (ret == -1) { pr_err("Unable to open fd=%d id=%#x\n", fle->fe->fd, fle->fe->id); goto splice; } if (st != fle->stage || ret == 0) progress = true; if (ret == 0) { /* * We delete restored items from fds list, * so open() methods may base on this feature * and reduce number of fles in their checks. */ list_del(&fle->ps_list); if (!fle->fake) list_add(&fle->ps_list, &completed); else list_add(&fle->ps_list, &fake); } if (ret == 1) again = true; } if (!progress && again) wait_fds_event(); } while (again || progress); BUG_ON(!list_empty(list)); /* * Fake fles may be used for restore other * file types, so their closing is delayed. */ close_fdinfos(&fake); splice: list_splice(&fake, list); list_splice(&completed, list); return ret; } int close_old_fds(void) { DIR *dir; struct dirent *de; int fd, ret; /** * Close previous /proc/self/ service fd, as we don't want to reuse it * from a different task. Also there can be some junk fd in it's place * after we've moved our service fds (e.g. from other task of parents * shared fdtable), we need to close it before opendir_proc() below. */ __close_service_fd(PROC_SELF_FD_OFF); dir = opendir_proc(PROC_SELF, "fd"); if (dir == NULL) return -1; while ((de = readdir(dir))) { if (dir_dots(de)) continue; ret = sscanf(de->d_name, "%d", &fd); if (ret != 1) { pr_err("Can't parse %s\n", de->d_name); closedir(dir); close_pid_proc(); return -1; } if ((!is_any_service_fd(fd)) && (dirfd(dir) != fd)) close_safe(&fd); } closedir(dir); close_pid_proc(); return 0; } int prepare_fds(struct pstree_item *me) { u32 ret = 0; pr_info("Opening fdinfo-s\n"); /* * This must be done after forking to allow child * to get the cgroup fd so it can move into the * correct /tasks file if it is in a different cgroup * set than its parent */ sfds_protected = false; close_service_fd(CGROUP_YARD); sfds_protected = true; if (rsti(me)->fdt) { struct fdt *fdt = rsti(me)->fdt; /* * Wait all tasks, who share a current fd table. * We should be sure, that nobody use any file * descriptor while fdtable is being restored. */ futex_inc_and_wake(&fdt->fdt_lock); futex_wait_while_lt(&fdt->fdt_lock, fdt->nr); if (fdt->pid != vpid(me)) { pr_info("File descriptor table is shared with %d\n", fdt->pid); futex_wait_until(&fdt->fdt_lock, fdt->nr + 1); goto out; } } BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) futex_inc_and_wake(&rsti(me)->fdt->fdt_lock); out: return ret; } static int fchroot(int fd) { /* * There's no such thing in syscalls. We can emulate * it using fchdir() */ if (fchdir(fd) < 0) { pr_perror("Can't chdir to proc"); return -1; } pr_debug("Going to chroot into /proc/self/fd/%d\n", fd); return chroot("."); } static int need_chroot(int saved_root) { struct stat saved_root_stat, cur_root_stat; int psd; if (fstat(saved_root, &saved_root_stat) == -1) { pr_perror("Failed to stat saved root dir"); return -1; } psd = open_pid_proc(PROC_SELF); if (psd < 0) { pr_perror("Failed to open PROC_SELF"); return -1; } if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { pr_perror("Failed to stat current root dir"); return -1; } return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; } int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); bool do_chroot = true; /* * First -- open both descriptors. We will not * be able to open the cwd one after we chroot. */ dd_root = open_reg_fd(ri->root); if (dd_root < 0) { pr_err("Can't open root\n"); goto out; } dd_cwd = open_reg_fd(ri->cwd); if (dd_cwd < 0) { pr_err("Can't open cwd\n"); goto out; } /* * In unprivileged mode chroot() may fail if we don't have * sufficient privileges, therefore only do it if the process * is actually chrooted. */ if (opts.unprivileged) do_chroot = need_chroot(dd_root); /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. */ if (do_chroot) { ret = fchroot(dd_root); if (ret < 0) { pr_perror("Can't change root"); goto out; } } ret = fchdir(dd_cwd); if (ret < 0) { pr_perror("Can't change cwd"); goto out; } if (ri->has_umask) { pr_info("Restoring umask to %o\n", ri->umask); umask(ri->umask); } err = 0; out: if (dd_cwd >= 0) close(dd_cwd); if (dd_root >= 0) close(dd_root); return err; } int prepare_fs_pid(struct pstree_item *item) { pid_t pid = vpid(item); struct rst_info *ri = rsti(item); struct cr_img *img; FsEntry *fe; int ret = -1; img = open_image(CR_FD_FS, O_RSTR, pid); if (!img) goto out; ret = pb_read_one_eof(img, &fe, PB_FS); close_image(img); if (ret <= 0) goto out; ri->cwd = collect_special_file(fe->cwd_id); if (!ri->cwd) { pr_err("Can't find task cwd file\n"); goto out_f; } ri->root = collect_special_file(fe->root_id); if (!ri->root) { pr_err("Can't find task root file\n"); goto out_f; } ri->has_umask = fe->has_umask; ri->umask = fe->umask; ret = 0; out_f: fs_entry__free_unpacked(fe, NULL); out: return ret; } int shared_fdt_prepare(struct pstree_item *item) { struct pstree_item *parent = item->parent; struct fdt *fdt; if (!rsti(parent)->fdt) { fdt = shmalloc(sizeof(*rsti(item)->fdt)); if (fdt == NULL) return -1; rsti(parent)->fdt = fdt; futex_init(&fdt->fdt_lock); fdt->nr = 1; fdt->pid = vpid(parent); } else fdt = rsti(parent)->fdt; rsti(item)->fdt = fdt; rsti(item)->service_fd_id = fdt->nr; fdt->nr++; return 0; } /* * Inherit fd support. * * There are cases where a process's file descriptor cannot be restored * from the checkpointed image. For example, a pipe file descriptor with * one end in the checkpointed process and the other end in a separate * process (that was not part of the checkpointed process tree) cannot be * restored because after checkpoint the pipe would be broken and removed. * * There are also cases where the user wants to use a new file during * restore instead of the original file in the checkpointed image. For * example, the user wants to change the log file of a process from * /path/to/oldlog to /path/to/newlog. * * In these cases, criu's caller should set up a new file descriptor to be * inherited by the restored process and specify it with the --inherit-fd * command line option. The argument of --inherit-fd has the format * fd[%d]:%s, where %d tells criu which of its own file descriptor to use * for restoring file identified by %s. * * As a debugging aid, if the argument has the format debug[%d]:%s, it tells * criu to write out the string after colon to the file descriptor %d. This * can be used to leave a "restore marker" in the output stream of the process. * * It's important to note that inherit fd support breaks applications * that depend on the state of the file descriptor being inherited. So, * consider inherit fd only for specific use cases that you know for sure * won't break the application. * * For examples please visit http://criu.org/Category:HOWTO. */ struct inherit_fd { struct list_head inh_list; char *inh_id; /* file identifier */ int inh_fd; /* criu's descriptor to inherit */ int inh_fd_id; }; int inh_fd_max = -1; int inherit_fd_parse(char *optarg) { char *cp = NULL; int n = -1; int fd = -1; int dbg = 0; /* * Parse the argument. */ if (!strncmp(optarg, "fd", 2)) cp = &optarg[2]; else if (!strncmp(optarg, "debug", 5)) { cp = &optarg[5]; dbg = 1; } if (cp) { n = sscanf(cp, "[%d]:", &fd); cp = strchr(optarg, ':'); } if (n != 1 || fd < 0 || !cp || !cp[1]) { pr_err("Invalid inherit fd argument: %s\n", optarg); return -1; } /* * If the argument is a debug string, write it to fd. * Otherwise, add it to the inherit fd list. */ cp++; if (dbg) { n = strlen(cp); if (write(fd, cp, n) != n) { pr_err("Can't write debug message %s to inherit fd %d\n", cp, fd); return -1; } return 0; } return inherit_fd_add(fd, cp); } int inherit_fd_add(int fd, char *key) { struct inherit_fd *inh; struct stat sbuf; if (fstat(fd, &sbuf) == -1) { pr_perror("Can't fstat inherit fd %d", fd); return -1; } inh = xmalloc(sizeof *inh); if (inh == NULL) return -1; if (fd > inh_fd_max) inh_fd_max = fd; inh->inh_id = xstrdup(key); if (inh->inh_id == NULL) { xfree(inh); return -1; } inh->inh_fd = fd; list_add_tail(&inh->inh_list, &opts.inherit_fds); return 0; } /* * Log the inherit fd list. Called for diagnostics purposes * after the log file is initialized. */ void inherit_fd_log(void) { struct inherit_fd *inh; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { pr_info("File %s will be restored from inherit fd %d\n", inh->inh_id, inh->inh_fd); } } int inherit_fd_move_to_fdstore(void) { struct inherit_fd *inh; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { inh->inh_fd_id = fdstore_add(inh->inh_fd); if (inh->inh_fd_id < 0) return -1; close_safe(&inh->inh_fd); } return 0; } /* * Look up the inherit fd list by a file identifier. */ int inherit_fd_lookup_id(char *id) { int ret; struct inherit_fd *inh; ret = -1; list_for_each_entry(inh, &opts.inherit_fds, inh_list) { if (!strcmp(inh->inh_id, id)) { ret = fdstore_get(inh->inh_fd_id); pr_debug("Found id %s (fd %d) in inherit fd list\n", id, ret); break; } } return ret; } bool inherited_fd(struct file_desc *d, int *fd_p) { char buf[PATH_MAX], *id_str; int i_fd; if (!d->ops->name) return false; id_str = d->ops->name(d, buf, sizeof(buf)); i_fd = inherit_fd_lookup_id(id_str); if (i_fd < 0) return false; if (fd_p == NULL) return true; *fd_p = i_fd; pr_info("File %s will be restored from fd %d dumped " "from inherit fd %d\n", id_str, *fd_p, i_fd); return true; } int open_transport_socket(void) { pid_t pid = vpid(current); struct sockaddr_un saddr; int sock, slen, ret = -1; sock = socket(PF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); if (sock < 0) { pr_perror("Can't create socket"); goto out; } transport_name_gen(&saddr, &slen, pid); if (bind(sock, (struct sockaddr *)&saddr, slen) < 0) { pr_perror("Can't bind transport socket %s", saddr.sun_path + 1); close(sock); goto out; } if (install_service_fd(TRANSPORT_FD_OFF, sock) < 0) goto out; ret = 0; out: return ret; } static int collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base, struct collect_image_info *cinfo) { if (fe->id != id) { pr_err("ID mismatch %u != %u\n", fe->id, id); return -1; } return collect_entry(base, cinfo); } static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) { int ret = 0; FileEntry *fe; fe = pb_msg(base, FileEntry); switch (fe->type) { default: pr_err("Unknown file type %d\n", fe->type); return -1; case FD_TYPES__REG: ret = collect_one_file_entry(fe, fe->reg->id, &fe->reg->base, ®_file_cinfo); break; case FD_TYPES__INETSK: ret = collect_one_file_entry(fe, fe->isk->id, &fe->isk->base, &inet_sk_cinfo); break; case FD_TYPES__NS: ret = collect_one_file_entry(fe, fe->nsf->id, &fe->nsf->base, &nsfile_cinfo); break; case FD_TYPES__PACKETSK: ret = collect_one_file_entry(fe, fe->psk->id, &fe->psk->base, &packet_sk_cinfo); break; case FD_TYPES__NETLINKSK: ret = collect_one_file_entry(fe, fe->nlsk->id, &fe->nlsk->base, &netlink_sk_cinfo); break; case FD_TYPES__EVENTFD: ret = collect_one_file_entry(fe, fe->efd->id, &fe->efd->base, &eventfd_cinfo); break; case FD_TYPES__EVENTPOLL: ret = collect_one_file_entry(fe, fe->epfd->id, &fe->epfd->base, &epoll_cinfo); break; case FD_TYPES__SIGNALFD: ret = collect_one_file_entry(fe, fe->sgfd->id, &fe->sgfd->base, &signalfd_cinfo); break; case FD_TYPES__TUNF: ret = collect_one_file_entry(fe, fe->tunf->id, &fe->tunf->base, &tunfile_cinfo); break; case FD_TYPES__TIMERFD: ret = collect_one_file_entry(fe, fe->tfd->id, &fe->tfd->base, &timerfd_cinfo); break; case FD_TYPES__INOTIFY: ret = collect_one_file_entry(fe, fe->ify->id, &fe->ify->base, &inotify_cinfo); break; case FD_TYPES__FANOTIFY: ret = collect_one_file_entry(fe, fe->ffy->id, &fe->ffy->base, &fanotify_cinfo); break; case FD_TYPES__EXT: ret = collect_one_file_entry(fe, fe->ext->id, &fe->ext->base, &ext_file_cinfo); break; case FD_TYPES__UNIXSK: ret = collect_one_file_entry(fe, fe->usk->id, &fe->usk->base, &unix_sk_cinfo); break; case FD_TYPES__FIFO: ret = collect_one_file_entry(fe, fe->fifo->id, &fe->fifo->base, &fifo_cinfo); break; case FD_TYPES__PIPE: ret = collect_one_file_entry(fe, fe->pipe->id, &fe->pipe->base, &pipe_cinfo); break; case FD_TYPES__TTY: ret = collect_one_file_entry(fe, fe->tty->id, &fe->tty->base, &tty_cinfo); break; case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); break; #endif } return ret; } struct collect_image_info files_cinfo = { .fd_type = CR_FD_FILES, .pb_type = PB_FILE, .priv_size = 0, .collect = collect_one_file, .flags = COLLECT_NOFREE, }; int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); return collect_image(&files_cinfo); }