#include #include #include #include #include #include #include #include #include #include #include #include #include "types.h" #include "common/list.h" #include "util.h" #include "mount.h" #include "filesystems.h" #include "mman.h" #include "cpu.h" #include "file-lock.h" #include "pstree.h" #include "fsnotify.h" #include "posix-timer.h" #include "kerndat.h" #include "vdso.h" #include "vma.h" #include "mem.h" #include "bfd.h" #include "proc_parse.h" #include "fdinfo.h" #include "parasite.h" #include "cr_options.h" #include "sysfs_parse.h" #include "seccomp.h" #include "string.h" #include "namespaces.h" #include "cgroup.h" #include "cgroup-props.h" #include "timerfd.h" #include "path.h" #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" #include "plugin.h" #include #ifndef SIGEV_SIGNAL #define SIGEV_SIGNAL 0 /* notify via signal */ #endif #ifndef SIGEV_NONE #define SIGEV_NONE 1 /* other notification: meaningless */ #endif #ifndef SIGEV_THREAD #define SIGEV_THREAD 2 /* deliver via thread creation */ #endif #ifndef SIGEV_THREAD_ID #define SIGEV_THREAD_ID 4 /* deliver to thread */ #endif #define BUF_SIZE 4096 /* Good enough value - can be changed */ struct buffer { char buf[BUF_SIZE]; char end; /* '\0' */ }; static struct buffer __buf; static char *buf = __buf.buf; /* * This is how AIO ring buffers look like in proc */ #define AIO_FNAME "/[aio]" /* check the @line starts with "%lx-%lx" format */ static bool __is_vma_range_fmt(char *line) { #define ____is_vma_addr_char(__c) (((__c) <= '9' && (__c) >= '0') || ((__c) <= 'f' && (__c) >= 'a')) while (*line && ____is_vma_addr_char(*line)) line++; if (*line++ != '-') return false; while (*line && ____is_vma_addr_char(*line)) line++; if (*line++ != ' ') return false; return true; #undef ____is_vma_addr_char } bool is_vma_range_fmt(char *line) { return __is_vma_range_fmt(line); } bool handle_vma_plugin(int *fd, struct stat *stat) { int ret; ret = run_plugins(HANDLE_DEVICE_VMA, *fd, stat); if (ret < 0) { pr_perror("handle_device_vma plugin failed"); return false; } return true; } static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { char *tok; if (!buf[0]) return; tok = strtok(buf, " \n"); if (!tok) return; #define _vmflag_match(_t, _s) (_t[0] == _s[0] && _t[1] == _s[1]) do { /* mmap() block */ if (_vmflag_match(tok, "gd")) *flags |= MAP_GROWSDOWN; else if (_vmflag_match(tok, "lo")) *flags |= MAP_LOCKED; else if (_vmflag_match(tok, "nr")) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; /* madvise() block */ if (_vmflag_match(tok, "sr")) *madv |= (1ul << MADV_SEQUENTIAL); else if (_vmflag_match(tok, "rr")) *madv |= (1ul << MADV_RANDOM); else if (_vmflag_match(tok, "dc")) *madv |= (1ul << MADV_DONTFORK); else if (_vmflag_match(tok, "dd")) *madv |= (1ul << MADV_DONTDUMP); else if (_vmflag_match(tok, "mg")) *madv |= (1ul << MADV_MERGEABLE); else if (_vmflag_match(tok, "hg")) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) *io_pf = 1; /* * Anything else is just ignored. */ } while ((tok = strtok(NULL, " \n"))); #undef _vmflag_match } void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { __parse_vmflags(buf, flags, madv, io_pf); } static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) { int io_pf = 0; __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf); /* * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP */ if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) vma_area->e->has_madv = true; } static inline int is_anon_shmem_map(dev_t dev) { return kdat.shmem_dev == dev; } struct vma_file_info { int dev_maj; int dev_min; unsigned long ino; struct vma_area *vma; }; static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b) { return ((a->ino ^ b->ino) | (a->dev_maj ^ b->dev_maj) | (a->dev_min ^ b->dev_min)) == 0; } static int vma_get_mapfile_flags(struct vma_area *vma, DIR *mfd, char *path) { struct stat stat; if (fstatat(dirfd(mfd), path, &stat, AT_SYMLINK_NOFOLLOW) < 0) { if (errno == ENOENT) { /* Just mapping w/o map_files link */ return 0; } pr_perror("Failed fstatat on map %" PRIx64 "", vma->e->start); return -1; } switch (stat.st_mode & 0600) { case 0200: vma->e->fdflags = O_WRONLY; break; case 0400: vma->e->fdflags = O_RDONLY; break; case 0600: vma->e->fdflags = O_RDWR; break; } vma->e->has_fdflags = true; return 0; } static int vma_stat(struct vma_area *vma, int fd) { vma->vmst = xmalloc(sizeof(struct stat)); if (!vma->vmst) return -1; /* * For AUFS support, we need to check if the symbolic link * points to a branch. If it does, we cannot fstat() its file * descriptor because it would return a different dev/ino than * the real file. If fixup_aufs_vma_fd() returns positive, * it means that it has stat()'ed using the full pathname. * Zero return means that the symbolic link does not point to * a branch and we can do fstat() below. */ if (opts.aufs) { int ret; ret = fixup_aufs_vma_fd(vma, fd); if (ret < 0) return -1; if (ret > 0) return 0; } if (fstat(fd, vma->vmst) < 0) { pr_perror("Failed fstat on map %" PRIx64 "", vma->e->start); return -1; } return 0; } static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct vma_file_info *vfi, int *vm_file_fd, const char *path) { int fd, hugetlb_flag = 0; dev_t vfi_dev; /* * Kernel prohibits reading map_files for users. The * best we can do here is fill stat using the information * from smaps file and ... hope for the better :\ * * Here we'll miss AIO-s and sockets :( */ if (fname[0] == '\0') { /* * Another bad thing is that kernel first checks * for permission access to ANY map_files link, * then checks for its existence. So we have to * check for file path being empty to "emulate" * the ENOENT case. */ if (vfi->dev_maj != 0 || vfi->dev_min != 0 || vfi->ino != 0) { pr_err("Strange file mapped at %lx [%s]:%d.%d.%ld\n", (unsigned long)vma->e->start, fname, vfi->dev_maj, vfi->dev_min, vfi->ino); return -1; } return 0; } else if (fname[0] != '/') { /* * This should be some kind of * special mapping like [heap], [vdso] * and such, the caller should take care * of the @fname and vma status. */ return 0; } vfi_dev = makedev(vfi->dev_maj, vfi->dev_min); if (is_memfd(vfi_dev)) { char tmp[PATH_MAX]; strlcpy(tmp, fname, PATH_MAX); strip_deleted(tmp, strlen(tmp)); /* * The error EPERM will be shown in the following pr_perror(). * It comes from the previous open() call. */ pr_perror("Can't open mapped [%s]", tmp); /* * TODO Perhaps we could do better than failing and dump the * memory like what is being done in shmem.c */ return -1; } if (is_hugetlb_dev(vfi_dev, &hugetlb_flag) || is_anon_shmem_map(vfi_dev)) { if (!(vma->e->flags & MAP_SHARED)) vma->e->status |= VMA_ANON_PRIVATE; else vma->e->status |= VMA_ANON_SHARED; vma->e->flags |= MAP_ANONYMOUS; vma->e->shmid = vfi->ino; vma->e->flags |= hugetlb_flag; if (!strncmp(fname, "/SYSV", 5)) { vma->e->status |= VMA_AREA_SYSVIPC; } else if (vma->e->flags & MAP_SHARED) { if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; } return 0; } pr_info("Failed to open map_files/%s, try to go via [%s] path\n", path, fname); fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("Can't open mapped [%s]", fname); return -1; } if (vma_stat(vma, fd)) { close(fd); return -1; } if (vma->vmst->st_dev != vfi_dev || vma->vmst->st_ino != vfi->ino) { pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); close(fd); return -1; } *vm_file_fd = fd; return 0; } static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, struct vma_file_info *vfi, struct vma_file_info *prev_vfi, int *vm_file_fd) { char path[32]; int flags; /* Figure out if it's file mapping */ snprintf(path, sizeof(path), "%" PRIx64 "-%" PRIx64, vma->e->start, vma->e->end); if (vma_get_mapfile_flags(vma, mfd, path)) return -1; if (prev_vfi->vma && vfi_equal(vfi, prev_vfi)) { struct vma_area *prev = prev_vfi->vma; /* * If vfi is equal (!) and negative @vm_file_fd -- * we have nothing to borrow for sure. */ if (*vm_file_fd < 0) return 0; pr_debug("vma %" PRIx64 " borrows vfi from previous %" PRIx64 "\n", vma->e->start, prev->e->start); if (prev->e->status & VMA_AREA_SOCKET) vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; /* * FIXME -- in theory there can be vmas that have * dev:ino match, but live in different mount * namespaces. However, we only borrow files for * subsequent vmas. These are _very_ likely to * have files from the same namespaces. */ vma->file_borrowed = true; return 0; } close_safe(vm_file_fd); /* * Note that we "open" it in dumper process space * so later we might refer to it via /proc/self/fd/vm_file_fd * if needed. */ flags = O_PATH; if (vfi->dev_maj == 0) /* * Opening with O_PATH omits calling kernel ->open * method, thus for some special files their type * detection might be broken. Thus we open those with * the O_RDONLY to potentially get ENXIO and check * it below. */ flags = O_RDONLY; *vm_file_fd = openat(dirfd(mfd), path, flags); if (*vm_file_fd < 0) { if (errno == ENOENT) /* Just mapping w/o map_files link */ return 0; if (errno == ENXIO) { struct stat buf; if (fstatat(dirfd(mfd), path, &buf, 0)) return -1; if (S_ISSOCK(buf.st_mode)) { pr_info("Found socket mapping @%" PRIx64 "\n", vma->e->start); vma->vm_socket_id = buf.st_ino; vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; return 0; } if ((buf.st_mode & S_IFMT) == 0 && !strncmp(fname, AIO_FNAME, sizeof(AIO_FNAME) - 1)) { /* AIO ring, let's try */ close_safe(vm_file_fd); vma->e->status = VMA_AREA_AIORING; return 0; } pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); return -1; } if (errno == EPERM && !opts.aufs) return vma_get_mapfile_user(fname, vma, vfi, vm_file_fd, path); pr_perror("Can't open map_files"); return -1; } return vma_stat(vma, *vm_file_fd); } int parse_self_maps_lite(struct vm_area_list *vms) { struct vma_area *prev = NULL; struct bfd maps; char *buf; vm_area_list_init(vms); maps.fd = open_proc(PROC_SELF, "maps"); if (maps.fd < 0) return -1; if (bfdopenr(&maps)) return -1; while (1) { struct vma_area *vma; char *end; unsigned long s, e; buf = breadline(&maps); if (!buf) break; if (IS_ERR(buf)) goto err; s = strtoul(buf, &end, 16); e = strtoul(end + 1, NULL, 16); if (prev && prev->e->end == s) /* * This list is needed for one thing only -- to * get the idea of what parts of current address * space are busy. So merge them altogether. */ prev->e->end = e; else { vma = alloc_vma_area(); if (!vma) goto err; vma->e->start = s; vma->e->end = e; list_add_tail(&vma->list, &vms->h); vms->nr++; prev = vma; } pr_debug("Parsed %" PRIx64 "-%" PRIx64 " vma\n", prev->e->start, prev->e->end); } bclose(&maps); return 0; err: bclose(&maps); return -1; } static inline int handle_vdso_vma(struct vma_area *vma) { vma->e->status |= VMA_AREA_REGULAR; if ((vma->e->prot & VDSO_PROT) == VDSO_PROT) vma->e->status |= VMA_AREA_VDSO; return 0; } static inline int handle_vvar_vma(struct vma_area *vma) { vma->e->status |= VMA_AREA_REGULAR; if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) vma->e->status |= VMA_AREA_VVAR; return 0; } static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_path, DIR *map_files_dir, struct vma_file_info *vfi, struct vma_file_info *prev_vfi, int *vm_file_fd) { if (vma_get_mapfile(file_path, vma_area, map_files_dir, vfi, prev_vfi, vm_file_fd)) goto err_bogus_mapfile; if (vma_area->e->status != 0) return 0; if (!strcmp(file_path, "[vsyscall]") || !strcmp(file_path, "[vectors]")) { vma_area->e->status |= VMA_AREA_VSYSCALL; } else if (!strcmp(file_path, "[vdso]")) { if (handle_vdso_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[vvar]")) { if (handle_vvar_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; } else { vma_area->e->status = VMA_AREA_REGULAR; } /* * Some mapping hints for restore, we save this on * disk and restore might need to analyze it. */ if (vma_area->file_borrowed) { struct vma_area *prev = prev_vfi->vma; /* * Pick-up flags that might be set in the branch below. * Status is copied as-is as it should be zero here, * and have full match with the previous. */ vma_area->e->flags |= (prev->e->flags & MAP_ANONYMOUS); vma_area->e->status = prev->e->status; vma_area->e->shmid = prev->e->shmid; vma_area->vmst = prev->vmst; vma_area->mnt_id = prev->mnt_id; if (!(vma_area->e->status & VMA_AREA_SYSVIPC)) { vma_area->e->status &= ~(VMA_FILE_PRIVATE | VMA_FILE_SHARED); if (vma_area->e->flags & MAP_PRIVATE) vma_area->e->status |= VMA_FILE_PRIVATE; else vma_area->e->status |= VMA_FILE_SHARED; } } else if (*vm_file_fd >= 0) { struct stat *st_buf = vma_area->vmst; int hugetlb_flag = 0; if (S_ISREG(st_buf->st_mode)) { /* regular file mapping -- supported */; pr_debug("Found regular file mapping, OK\n"); } else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) { /* devzero mapping -- also makes sense */; pr_debug("Found devzero mapping, OK\n"); } else if (handle_vma_plugin(vm_file_fd, st_buf)) { pr_info("Found device file mapping, plugin is available\n"); vma_area->e->status |= VMA_EXT_PLUGIN; } else { /* non-regular mapping with no supporting plugin */ pr_err("Can't handle non-regular mapping on %d's map %" PRIx64 "\n", pid, vma_area->e->start); goto err; } if ((is_anon_shmem_map(st_buf->st_dev) || is_hugetlb_dev(st_buf->st_dev, NULL)) && !strncmp(file_path, "/SYSV", 5)) { vma_area->e->flags |= MAP_ANONYMOUS; vma_area->e->status |= VMA_ANON_SHARED; vma_area->e->shmid = st_buf->st_ino; if (!(vma_area->e->flags & MAP_SHARED)) goto err_bogus_mapping; pr_info("path: %s\n", file_path); vma_area->e->status |= VMA_AREA_SYSVIPC; } else { /* We dump memfd backed mapping, both normal and hugepage anonymous share * mapping using memfd approach when possible. */ if (is_memfd(st_buf->st_dev) || is_anon_shmem_map(st_buf->st_dev) || can_dump_with_memfd_hugetlb(st_buf->st_dev, &hugetlb_flag, file_path, vma_area)) { vma_area->e->status |= VMA_AREA_MEMFD; vma_area->e->flags |= hugetlb_flag; if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; } else if (is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag)) { vma_area->e->flags |= hugetlb_flag; vma_area->e->flags |= MAP_ANONYMOUS; if (vma_area->e->flags & MAP_SHARED) { vma_area->e->status |= VMA_ANON_SHARED; vma_area->e->shmid = st_buf->st_ino; } else { vma_area->e->status |= VMA_ANON_PRIVATE; } close_safe(vm_file_fd); return 0; } if (vma_area->e->flags & MAP_PRIVATE) vma_area->e->status |= VMA_FILE_PRIVATE; else vma_area->e->status |= VMA_FILE_SHARED; } /* * We cannot use the mnt_id value provided by the kernel * for vm_file_fd if it is an AUFS file (the value is * wrong). In such a case, fixup_aufs_vma_fd() has set * mnt_id to -1 to mimic pre-3.15 kernels that didn't * have mnt_id. */ if (vma_area->mnt_id != -1 && get_fd_mntid(*vm_file_fd, &vma_area->mnt_id)) return -1; } else { /* * No file but mapping -- anonymous one. */ if (vma_area->e->flags & MAP_SHARED) { vma_area->e->status |= VMA_ANON_SHARED; vma_area->e->shmid = vfi->ino; } else { vma_area->e->status |= VMA_ANON_PRIVATE; } vma_area->e->flags |= MAP_ANONYMOUS; } return 0; err: return -1; err_bogus_mapping: pr_err("Bogus mapping 0x%" PRIx64 "-0x%" PRIx64 " (flags: %#x vm_file_fd: %d)\n", vma_area->e->start, vma_area->e->end, vma_area->e->flags, *vm_file_fd); goto err; err_bogus_mapfile: pr_perror("Can't open %d's mapfile link %" PRIx64, pid, vma_area->e->start); goto err; } static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area_list, unsigned long *prev_end, struct vma_file_info *vfi, struct vma_file_info *prev_vfi) { if (vma_area->e->status & VMA_EXT_PLUGIN) { /* Unsupported VMAs that provide special plugins for * backup can be treated as regular VMAs and criu * should only save their metadata in the dump files. * There can be several special backup plugins hooks * that might run at different stages during checkpoint * and restore. */ pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n", vma_area->e->start, vma_area->e->end); } else if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, vma_area->e->end); return -1; } /* Add a guard page only if here is enough space for it */ if (vma_has_guard_gap_hidden(vma_area) && *prev_end < vma_area->e->start) vma_area->e->start -= PAGE_SIZE; /* Guard page */ *prev_end = vma_area->e->end; list_add_tail(&vma_area->list, &vma_area_list->h); vma_area_list->nr++; if (vma_area_is_private(vma_area, kdat.task_size)) { unsigned long pages; pages = vma_area_len(vma_area) / PAGE_SIZE; vma_area_list->nr_priv_pages += pages; vma_area_list->nr_priv_pages_longest = max(vma_area_list->nr_priv_pages_longest, pages); } else if (vma_area_is(vma_area, VMA_ANON_SHARED)) { unsigned long pages; pages = vma_area_len(vma_area) / PAGE_SIZE; vma_area_list->nr_shared_pages_longest = max(vma_area_list->nr_shared_pages_longest, pages); } *prev_vfi = *vfi; prev_vfi->vma = vma_area; return 0; } /* * On s390 we have old kernels where the global task size assumption of * criu does not work. See also compel_task_size() for s390. */ static int task_size_check(pid_t pid, VmaEntry *entry) { #ifdef __s390x__ if (entry->end <= kdat.task_size) return 0; pr_err("Can't dump high memory region %lx-%lx of task %d because kernel commit ee71d16d22bb is missing\n", entry->start, entry->end, pid); return -1; #else return 0; #endif } int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_filemap) { struct vma_area *vma_area = NULL; unsigned long start, end, pgoff, prev_end = 0; char r, w, x, s; int ret = -1, vm_file_fd = -1; struct vma_file_info vfi; struct vma_file_info prev_vfi = {}; DIR *map_files_dir = NULL; struct bfd f; vm_area_list_init(vma_area_list); f.fd = open_proc(pid, "smaps"); if (f.fd < 0) goto err_n; if (bfdopenr(&f)) goto err_n; map_files_dir = opendir_proc(pid, "map_files"); if (!map_files_dir) /* old kernel? */ goto err; while (1) { int num, path_off; bool eof; char *str; str = breadline(&f); if (IS_ERR(str)) goto err; eof = (str == NULL); if (!eof && !__is_vma_range_fmt(str)) { if (!strncmp(str, "VmFlags: ", 9)) { BUG_ON(!vma_area); parse_vma_vmflags(&str[9], vma_area); continue; } else continue; } if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) goto err; if (eof) break; vma_area = alloc_vma_area(); if (!vma_area) goto err; num = sscanf(str, "%lx-%lx %c%c%c%c %lx %x:%x %lu %n", &start, &end, &r, &w, &x, &s, &pgoff, &vfi.dev_maj, &vfi.dev_min, &vfi.ino, &path_off); if (num < 10) { pr_err("Can't parse: %s\n", str); goto err; } vma_area->e->start = start; vma_area->e->end = end; vma_area->e->pgoff = pgoff; vma_area->e->prot = PROT_NONE; if (task_size_check(pid, vma_area->e)) goto err; if (r == 'r') vma_area->e->prot |= PROT_READ; if (w == 'w') vma_area->e->prot |= PROT_WRITE; if (x == 'x') vma_area->e->prot |= PROT_EXEC; if (s == 's') vma_area->e->flags = MAP_SHARED; else if (s == 'p') vma_area->e->flags = MAP_PRIVATE; else { pr_err("Unexpected VMA met (%c)\n", s); goto err; } if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; if (vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) || vma_entry_is(vma_area->e, VMA_FILE_SHARED)) { if (dump_filemap && dump_filemap(vma_area, vm_file_fd)) goto err; } else if (vma_entry_is(vma_area->e, VMA_AREA_AIORING)) vma_area_list->nr_aios++; } vma_area = NULL; ret = 0; err: bclose(&f); err_n: close_safe(&vm_file_fd); if (map_files_dir) closedir(map_files_dir); xfree(vma_area); return ret; } int parse_pid_stat(pid_t pid, struct proc_pid_stat *s) { char *tok, *p; int fd; int n; fd = open_proc(pid, "stat"); if (fd < 0) return -1; n = read(fd, buf, BUF_SIZE); close(fd); if (n < 1) { pr_err("stat for %d is corrupted\n", pid); return -1; } memset(s, 0, sizeof(*s)); tok = strchr(buf, ' '); if (!tok) goto err; *tok++ = '\0'; if (*tok != '(') goto err; s->pid = atoi(buf); p = strrchr(tok + 1, ')'); if (!p) goto err; *tok = '\0'; *p = '\0'; strlcpy(s->comm, tok + 1, sizeof(s->comm)); n = sscanf(p + 1, " %c %d %d %d %d %d %u %lu %lu %lu %lu " "%lu %lu %ld %ld %ld %ld %d %d %llu %lu %ld %lu %lu %lu %lu " "%lu %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld " "%lu %lu %lu %lu %lu %lu %lu %d", &s->state, &s->ppid, &s->pgid, &s->sid, &s->tty_nr, &s->tty_pgrp, &s->flags, &s->min_flt, &s->cmin_flt, &s->maj_flt, &s->cmaj_flt, &s->utime, &s->stime, &s->cutime, &s->cstime, &s->priority, &s->nice, &s->num_threads, &s->zero0, &s->start_time, &s->vsize, &s->mm_rss, &s->rsslim, &s->start_code, &s->end_code, &s->start_stack, &s->esp, &s->eip, &s->sig_pending, &s->sig_blocked, &s->sig_ignored, &s->sig_handled, &s->wchan, &s->zero1, &s->zero2, &s->exit_signal, &s->task_cpu, &s->rt_priority, &s->policy, &s->delayacct_blkio_ticks, &s->gtime, &s->cgtime, &s->start_data, &s->end_data, &s->start_brk, &s->arg_start, &s->arg_end, &s->env_start, &s->env_end, &s->exit_code); if (n < 50) goto err; return 0; err: pr_err("Parsing %d's stat failed (#fields do not match)\n", pid); return -1; } int prepare_loginuid(unsigned int value) { int fd, ret = 0; char buf[11]; /* 4294967295 is maximum for u32 */ fd = open_proc_rw(PROC_SELF, "loginuid"); if (fd < 0) return -1; snprintf(buf, 11, "%u", value); if (write(fd, buf, 11) < 0) { pr_warn("Write %s to /proc/self/loginuid failed: %s\n", buf, strerror(errno)); ret = -1; } close(fd); return ret; } unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent) { int fd; ssize_t num; *err = 0; fd = __open_proc(pid, (ignore_noent) ? ENOENT : 0, O_RDONLY, "loginuid"); if (fd < 0) goto out; num = read(fd, buf, 10); close(fd); if (num < 0) { pr_perror("Unable to read /proc/%d/loginuid", pid); goto out; } buf[num] = '\0'; return strtol(buf, NULL, 10); out: *err = -1; return INVALID_UID; /* unset value */ } int parse_pid_oom_score_adj(pid_t pid, int *err) { int fd; ssize_t num; *err = 0; fd = open_proc(pid, "oom_score_adj"); if (fd < 0) goto out; num = read(fd, buf, 10); close(fd); if (num < 0) { pr_perror("Unable to read /proc/%d/oom_score_adj", pid); goto out; } buf[num] = '\0'; return strtol(buf, NULL, 10); out: *err = -1; return 0; } static int ids_parse(char *str, unsigned int *arr) { char *end; arr[0] = strtol(str, &end, 10); arr[1] = strtol(end + 1, &end, 10); arr[2] = strtol(end + 1, &end, 10); arr[3] = strtol(end + 1, &end, 10); if (*end) return -1; else return 0; } static int cap_parse(char *str, unsigned int *res) { int i, ret; for (i = 0; i < PROC_CAP_SIZE; i++) { ret = sscanf(str, "%08x", &res[PROC_CAP_SIZE - 1 - i]); if (ret != 1) return -1; str += 8; } return 0; } int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) { struct proc_status_creds *cr = container_of(ss, struct proc_status_creds, s); struct bfd f; int done = 0; int ret = -1; char *str; bool parsed_seccomp = false; int expected_done; f.fd = open_proc(pid, "status"); if (f.fd < 0) return -1; cr->s.sigpnd = 0; cr->s.shdpnd = 0; cr->s.sigblk = 0; cr->s.seccomp_mode = SECCOMP_MODE_DISABLED; if (bfdopenr(&f)) return -1; while (done < 14) { str = breadline(&f); if (str == NULL) break; if (IS_ERR(str)) goto err_parse; if (!strncmp(str, "State:", 6)) { cr->s.state = str[7]; done++; continue; } if (!strncmp(str, "PPid:", 5)) { if (sscanf(str, "PPid:\t%d", &cr->s.ppid) != 1) { pr_err("Unable to parse: %s\n", str); goto err_parse; } done++; continue; } if (!strncmp(str, "NSpid:", 6)) { /* Get a thread ID in the thread PID namespace. */ char *last; last = strrchr(str, '\t'); if (!last || sscanf(last, "%d", &cr->s.vpid) != 1) { pr_err("Unable to parse: %s\n", str); goto err_parse; } done++; continue; } if (!strncmp(str, "Uid:", 4)) { if (ids_parse(str + 5, cr->uids)) goto err_parse; done++; continue; } if (!strncmp(str, "Gid:", 4)) { if (ids_parse(str + 5, cr->gids)) goto err_parse; done++; continue; } if (!strncmp(str, "CapInh:", 7)) { if (cap_parse(str + 8, cr->cap_inh)) goto err_parse; done++; continue; } if (!strncmp(str, "CapEff:", 7)) { if (cap_parse(str + 8, cr->cap_eff)) goto err_parse; done++; continue; } if (!strncmp(str, "CapPrm:", 7)) { if (cap_parse(str + 8, cr->cap_prm)) goto err_parse; done++; continue; } if (!strncmp(str, "CapBnd:", 7)) { if (cap_parse(str + 8, cr->cap_bnd)) goto err_parse; done++; continue; } if (!strncmp(str, "Seccomp:", 8)) { if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) { goto err_parse; } parsed_seccomp = true; done++; continue; } if (!strncmp(str, "ShdPnd:", 7)) { unsigned long long sigpnd; if (sscanf(str + 7, "%llx", &sigpnd) != 1) goto err_parse; cr->s.shdpnd |= sigpnd; done++; continue; } if (!strncmp(str, "SigPnd:", 7)) { unsigned long long sigpnd; if (sscanf(str + 7, "%llx", &sigpnd) != 1) goto err_parse; cr->s.sigpnd |= sigpnd; done++; continue; } if (!strncmp(str, "SigBlk:", 7)) { unsigned long long sigblk = 0; if (sscanf(str + 7, "%llx", &sigblk) != 1) goto err_parse; cr->s.sigblk |= sigblk; done++; continue; } } /* seccomp and nspids are optional */ expected_done = (parsed_seccomp ? 12 : 11); if (kdat.has_nspid) expected_done++; if (done == expected_done) ret = 0; err_parse: if (ret) pr_err("Error parsing proc status file\n"); bclose(&f); return ret; } struct opt2flag { char *opt; unsigned flag; }; static bool sb_opt_cb(char *opt, char *unknown, size_t *uoff) { unsigned int id; if (sscanf(opt, "gid=%d", &id) == 1) { *uoff += sprintf(unknown + *uoff, "gid=%d", userns_gid(id)); unknown[*uoff] = ','; (*uoff)++; return true; } else if (sscanf(opt, "uid=%d", &id) == 1) { *uoff += sprintf(unknown + *uoff, "uid=%d", userns_uid(id)); unknown[*uoff] = ','; (*uoff)++; return true; } return false; } static int do_opt2flag(char *opt, unsigned *flags, const struct opt2flag *opts, char *unknown, bool (*cb)(char *opt, char *unknown, size_t *uoff)) { int i; char *end; size_t uoff = 0; while (1) { end = strchr(opt, ','); if (end) *end = '\0'; for (i = 0; opts[i].opt != NULL; i++) if (!strcmp(opts[i].opt, opt)) { (*flags) |= opts[i].flag; break; } if (opts[i].opt == NULL && cb && !cb(opt, unknown, &uoff)) { if (!unknown) { pr_err("Unknown option [%s]\n", opt); return -1; } strcpy(unknown + uoff, opt); uoff += strlen(opt); unknown[uoff] = ','; uoff++; } if (!end) { if (uoff) uoff--; if (unknown) unknown[uoff] = '\0'; break; } else opt = end + 1; } return 0; } static int parse_mnt_flags(char *opt, unsigned *flags) { static const struct opt2flag mnt_opt2flag[] = { { "rw", 0, }, { "ro", MS_RDONLY, }, { "nosuid", MS_NOSUID, }, { "nodev", MS_NODEV, }, { "noexec", MS_NOEXEC, }, { "noatime", MS_NOATIME, }, { "nodiratime", MS_NODIRATIME, }, { "relatime", MS_RELATIME, }, {}, }; if (do_opt2flag(opt, flags, mnt_opt2flag, NULL, NULL)) return -1; /* Otherwise the kernel assumes RELATIME by default */ if ((*flags & (MS_RELATIME | MS_NOATIME)) == 0) *flags |= MS_STRICTATIME; return 0; } static int parse_sb_opt(char *opt, unsigned *flags, char *uopt) { static const struct opt2flag sb_opt2flag[] = { { "rw", 0, }, { "ro", MS_RDONLY, }, { "sync", MS_SYNC, }, { "dirsync", MS_DIRSYNC, }, { "mad", MS_MANDLOCK, }, {}, }; return do_opt2flag(opt, flags, sb_opt2flag, uopt, sb_opt_cb); } static int parse_mnt_opt(char *str, struct mount_info *mi, int *off) { char *istr = str, *end; while (1) { end = strchr(str, ' '); if (!end) { pr_err("Error parsing mount options\n"); return -1; } *end = '\0'; if (!strncmp(str, "-", 1)) break; else if (!strncmp(str, "shared:", 7)) { mi->flags |= MS_SHARED; mi->shared_id = atoi(str + 7); } else if (!strncmp(str, "master:", 7)) { mi->flags |= MS_SLAVE; mi->master_id = atoi(str + 7); } else if (!strncmp(str, "propagate_from:", 15)) { /* skip */; } else if (!strncmp(str, "unbindable", 11)) mi->flags |= MS_UNBINDABLE; else { pr_err("Unknown option [%s]\n", str); return -1; } str = end + 1; } *off = end - istr + 1; return 0; } /* * mountinfo contains mangled paths. space, tab and back slash were replaced * with usual octal escape. This function replaces these symbols back. */ static void cure_path(char *path) { int i, len, off = 0; if (strchr(path, '\\') == NULL) /* fast path */ return; len = strlen(path); for (i = 0; i < len; i++) { if (!strncmp(path + i, "\\040", 4)) { path[i - off] = ' '; goto replace; } else if (!strncmp(path + i, "\\011", 4)) { path[i - off] = '\t'; goto replace; } else if (!strncmp(path + i, "\\134", 4)) { path[i - off] = '\\'; goto replace; } if (off) path[i - off] = path[i]; continue; replace: off += 3; i += 3; } path[len - off] = 0; } static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) { unsigned int kmaj, kmin; int ret, n, len; char *sub, *opt = NULL; char link_path[PATH_MAX]; new->mountpoint = xmalloc(PATH_MAX); if (new->mountpoint == NULL) goto err; new->mountpoint[0] = '.'; ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; cure_path(new->mountpoint); cure_path(new->root); len = strlen(new->root); if (len >= PATH_MAX - 1) { pr_err("new root path (%s) exceeds %d\n", new->root, PATH_MAX); goto err; } strcpy(link_path, new->root); if (strip_deleted(link_path, len)) { strcpy(new->root, link_path); new->deleted = true; } new->mountpoint = xrealloc(new->mountpoint, strlen(new->mountpoint) + 1); if (!new->mountpoint) goto err; new->ns_mountpoint = new->mountpoint; new->is_ns_root = is_root(new->ns_mountpoint + 1); new->s_dev = new->s_dev_rt = MKKDEV(kmaj, kmin); new->flags = 0; if (parse_mnt_flags(opt, &new->flags)) goto err; free(opt); /* we are going to reallocate/reuse this buffer */ opt = NULL; str += n; if (parse_mnt_opt(str, new, &n)) goto err; str += n; ret = sscanf(str, "%ms %ms %ms", fsname, &new->source, &opt); if (ret == 2) { /* src may be empty */ opt = new->source; new->source = xstrdup(""); if (new->source == NULL) goto err; } else if (ret != 3) goto err; cure_path(new->source); new->fsname = xstrdup(*fsname); if (!new->fsname) goto err; /* * The kernel reports "subtypes" sometimes and the valid * type-vs-subtype delimiter is the dot symbol. We disregard * any subtypes for the purpose of finding the fstype. */ sub = strchr(*fsname, '.'); if (sub) *sub = 0; new->fstype = find_fstype_by_name(*fsname); new->options = xmalloc(strlen(opt) + 1); if (!new->options) goto err; if (parse_sb_opt(opt, &new->sb_flags, new->options)) goto err; ret = 0; ret: xfree(opt); return ret; err: ret = -1; goto ret; } static LIST_HEAD(skip_mount_list); struct str_node { struct list_head node; char string[]; }; bool add_skip_mount(const char *mountpoint) { struct str_node *skip = xmalloc(sizeof(struct str_node) + strlen(mountpoint) + 1); if (!skip) return false; strcpy(skip->string, mountpoint); list_add(&skip->node, &skip_mount_list); return true; } static bool should_skip_mount(char *mountpoint) { struct str_node *pos; list_for_each_entry(pos, &skip_mount_list, node) { if (is_same_path(mountpoint, pos->string)) return true; } return false; } int parse_timens_offsets(struct timespec *boff, struct timespec *moff) { int exit_code = -1; FILE *f; f = fopen_proc(PROC_SELF, "timens_offsets"); if (!f) { pr_perror("Unable to open /proc/self/timens_offsets"); return exit_code; } while (fgets(buf, BUF_SIZE, f)) { int64_t sec, nsec; char clockid[10]; if (sscanf(buf, "%9s %" PRId64 " %" PRId64 "\n", clockid, &sec, &nsec) != 3) { pr_err("Unable to parse: %s\n", buf); goto out; } clockid[sizeof(clockid) - 1] = 0; if (strcmp(clockid, "monotonic") == 0 || strcmp(clockid, __stringify(CLOCK_MONOTONIC)) == 0) { moff->tv_sec = sec; moff->tv_nsec = nsec; continue; } if (strcmp(clockid, "boottime") == 0 || strcmp(clockid, __stringify(CLOCK_BOOTTIME)) == 0) { boff->tv_sec = sec; boff->tv_nsec = nsec; continue; } pr_err("Unknown clockid: %s\n", clockid); goto out; } exit_code = 0; out: fclose(f); return exit_code; } static int get_mountinfo_sdev_from_mntid(int mnt_id, unsigned int *sdev) { int exit_code = -1; FILE *f; f = fopen_proc(PROC_SELF, "mountinfo"); if (!f) return -1; while (fgets(buf, BUF_SIZE, f)) { unsigned int kmaj, kmin; int id; if (sscanf(buf, "%i %*i %u:%u", &id, &kmaj, &kmin) != 3) { pr_err("Failed to parse mountinfo line %s\n", buf); goto err; } if (id == mnt_id) { *sdev = MKKDEV(kmaj, kmin); exit_code = 0; break; } } err: fclose(f); return exit_code; } /* This works even on btrfs where stat does not show right sdev */ int get_sdev_from_fd(int fd, unsigned int *sdev, bool parse_mountinfo) { struct mount_info *mi; int ret, mnt_id; ret = get_fd_mntid(fd, &mnt_id); if (ret < 0) return -1; /* Simple case mnt_id is in dumped mntns */ mi = lookup_mnt_id(mnt_id); if (mi) { *sdev = mi->s_dev_rt; return 0; } if (!parse_mountinfo) return -1; /* Complex case mnt_id is in mntns created by criu */ return get_mountinfo_sdev_from_mntid(mnt_id, sdev); } struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) { struct mount_info *list = NULL; FILE *f; f = fopen_proc(pid, "mountinfo"); if (!f) return NULL; while (fgets(buf, BUF_SIZE, f)) { struct mount_info *new; int ret = -1; char *fsname = NULL; new = mnt_entry_alloc(false); if (!new) goto end; new->nsid = nsid; ret = parse_mountinfo_ent(buf, new, &fsname); if (ret < 0) { pr_err("Bad format in %d mountinfo: '%s'\n", pid, buf); goto end; } /* * Drop this mountpoint early, so that lookup_mnt_id/etc will * fail loudly at "dump" stage if an opened file or another mnt * depends on this one. */ if (for_dump && should_skip_mount(new->ns_mountpoint)) { pr_info("\tskip %s @ %s\n", fsname, new->ns_mountpoint); mnt_entry_free(new); new = NULL; goto end; } pr_info("\ttype %s source %s mnt_id %d s_dev %#x %s @ %s flags %#x options %s\n", fsname, new->source, new->mnt_id, new->s_dev, new->root, new->ns_mountpoint, new->flags, new->options); if (new->fstype->parse) { ret = new->fstype->parse(new); if (ret < 0) { pr_err("Failed to parse FS specific data on %s\n", service_mountpoint(new)); mnt_entry_free(new); new = NULL; goto end; } if (ret > 0) { pr_info("\tskipping fs mounted at %s\n", service_mountpoint(new) + 1); mnt_entry_free(new); new = NULL; ret = 0; goto end; } } end: if (fsname) free(fsname); if (new) mntinfo_add_list_before(&list, new); if (ret) goto err; } out: fclose(f); return list; err: while (list) { struct mount_info *next = list->next; mnt_entry_free(list); list = next; } goto out; } static char nybble(const char n) { if (n >= '0' && n <= '9') return n - '0'; else if (n >= 'A' && n <= 'F') return n - ('A' - 10); else if (n >= 'a' && n <= 'f') return n - ('a' - 10); return 0; } static void parse_fhandle_encoded(char *tok, FhEntry *fh) { char *d = (char *)fh->handle; int i = 0; memzero(d, pb_repeated_size(fh, handle)); while (*tok == ' ') tok++; while (*tok) { if (i >= pb_repeated_size(fh, handle)) break; d[i++] = (nybble(tok[0]) << 4) | nybble(tok[1]); if (tok[1]) tok += 2; else break; } } static int parse_timerfd(struct bfd *f, char *str, TimerfdEntry *tfy) { /* * Format is * clockid: 0 * ticks: 0 * settime flags: 01 * it_value: (0, 49406829) * it_interval: (1, 0) */ if (sscanf(str, "clockid: %d", &tfy->clockid) != 1) goto parse_err; if (verify_timerfd(tfy) < 0) goto parse_err; str = breadline(f); if (IS_ERR_OR_NULL(str)) goto nodata; if (sscanf(str, "ticks: %llu", (unsigned long long *)&tfy->ticks) != 1) goto parse_err; str = breadline(f); if (IS_ERR_OR_NULL(str)) goto nodata; if (sscanf(str, "settime flags: 0%o", &tfy->settime_flags) != 1) goto parse_err; str = breadline(f); if (IS_ERR_OR_NULL(str)) goto nodata; if (sscanf(str, "it_value: (%llu, %llu)", (unsigned long long *)&tfy->vsec, (unsigned long long *)&tfy->vnsec) != 2) goto parse_err; str = breadline(f); if (IS_ERR_OR_NULL(str)) goto nodata; if (sscanf(str, "it_interval: (%llu, %llu)", (unsigned long long *)&tfy->isec, (unsigned long long *)&tfy->insec) != 2) goto parse_err; return 0; parse_err: return -1; nodata: pr_err("No data left in proc file while parsing timerfd\n"); goto parse_err; } typedef struct bpfmap_fmt { char *fmt; void *value; /* * If newer kernels are adding additional entries, these entries need * to be marked as optional in the protobuf definition and the parsing * must be able to ignore it if running on an older kernel. */ protobuf_c_boolean *optional; } bpfmap_fmt; static int parse_bpfmap(struct bfd *f, char *str, BpfmapFileEntry *bpf) { /* * Format is: * * uint32_t map_type * uint32_t key_size * uint32_t value_size * uint32_t max_entries * uint32_t map_flags * uint64_t map_extra * uint64_t memlock * uint32_t map_id * boolean frozen */ /* This needs to be in the same order as in the fdinfo entry. */ bpfmap_fmt map[] = { { "map_type: %u", &bpf->map_type, NULL }, { "key_size: %u", &bpf->key_size, NULL }, { "value_size: %u", &bpf->value_size, NULL }, { "max_entries: %u", &bpf->max_entries, NULL }, { "map_flags: %" PRIx32 "", &bpf->map_flags, NULL }, { "map_extra: %" PRIx64 "", &bpf->map_extra, &bpf->has_map_extra }, { "memlock: %" PRIu64 "", &bpf->memlock, NULL }, { "map_id: %u", &bpf->map_id, NULL }, { "frozen: %d", &bpf->frozen, NULL }, }; size_t n = sizeof(map) / sizeof(bpfmap_fmt); int i; for (i = 0; i < n; i++) { if (sscanf(str, map[i].fmt, map[i].value) != 1) { if (map[i].optional) continue; return -1; } if (i == n - 1) break; str = breadline(f); if (IS_ERR_OR_NULL(str)) { pr_err("No data left in proc file while parsing bpfmap\n"); return -1; } } if (bpf->has_map_extra && bpf->map_extra) pr_warn("Non-zero value for fdinfo map_extra entry found. This will not be restored.\n"); return 0; } #define fdinfo_field(str, field) !strncmp(str, field ":", sizeof(field)) static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked); static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) { struct bfd f; char *str; bool entry_met = false; int ret, exit_code = -1; f.fd = open_proc(pid, "fdinfo/%d", fd); if (f.fd < 0) return -1; if (bfdopenr(&f)) return -1; while (1) { str = breadline(&f); if (!str) break; if (IS_ERR(str)) goto out; if (fdinfo_field(str, "pos") || fdinfo_field(str, "flags") || fdinfo_field(str, "mnt_id")) { unsigned long long val; struct fdinfo_common *fdinfo = arg; if (type != FD_TYPES__UND) continue; ret = sscanf(str, "%*s %lli", &val); if (ret != 1) goto parse_err; if (fdinfo_field(str, "pos")) fdinfo->pos = val; else if (fdinfo_field(str, "flags")) fdinfo->flags = val; else if (fdinfo_field(str, "mnt_id")) fdinfo->mnt_id = val; entry_met = true; continue; } if (fdinfo_field(str, "lock")) { struct file_lock *fl; struct fdinfo_common *fdinfo = arg; char *flock_status = str + sizeof("lock:\t") - 1; if (type != FD_TYPES__UND) continue; /* * The lock status can be empty when the owner of the * lock is invisible from our PID namespace. * This unfortunate behavior is fixed in kernels v4.19 * and up (see commit 1cf8e5de40). */ if (flock_status[0] == '\0') continue; fl = alloc_file_lock(); if (!fl) { pr_perror("Alloc file lock failed!"); goto out; } if (parse_file_lock_buf(flock_status, fl, 0)) { xfree(fl); goto parse_err; } pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n", fl->fl_id, fl->fl_kind, fl->fl_ltype, fl->fl_owner, fl->maj, fl->min, fl->i_no, fl->start, fl->end); if (fl->fl_kind == FL_UNKNOWN) { pr_err("Unknown file lock!\n"); xfree(fl); goto out; } fl->real_owner = fdinfo->owner; fl->fl_holder = pid; fl->owners_fd = fd; list_add_tail(&fl->list, &file_lock_list); } if (type == FD_TYPES__UND) continue; if (fdinfo_field(str, "eventfd-count")) { EventfdFileEntry *efd = arg; if (type != FD_TYPES__EVENTFD) goto parse_err; ret = sscanf(str, "eventfd-count: %" PRIx64, &efd->counter); if (ret != 1) goto parse_err; entry_met = true; continue; } if (fdinfo_field(str, "clockid")) { TimerfdEntry *tfe = arg; if (type != FD_TYPES__TIMERFD) goto parse_err; ret = parse_timerfd(&f, str, tfe); if (ret) goto parse_err; entry_met = true; continue; } if (fdinfo_field(str, "tfd")) { EventpollFileEntry *epfe = arg; EventpollTfdEntry *e; int i; if (type != FD_TYPES__EVENTPOLL) goto parse_err; e = xmalloc(sizeof(EventpollTfdEntry)); if (!e) goto out; eventpoll_tfd_entry__init(e); ret = sscanf(str, "tfd: %d events: %x data: %llx" " pos:%lli ino:%lx sdev:%x", &e->tfd, &e->events, (long long *)&e->data, (long long *)&e->pos, (long *)&e->inode, &e->dev); if (ret < 3 || ret > 6) { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } else if (ret == 3) { e->has_dev = false; e->has_inode = false; e->has_pos = false; } else if (ret == 6) { e->has_dev = true; e->has_inode = true; e->has_pos = true; } else if (ret < 6) { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } i = epfe->n_tfd++; if (xrealloc_safe(&epfe->tfd, epfe->n_tfd * sizeof(EventpollTfdEntry *))) goto out; epfe->tfd[i] = e; entry_met = true; continue; } if (fdinfo_field(str, "sigmask")) { SignalfdEntry *sfd = arg; if (type != FD_TYPES__SIGNALFD) goto parse_err; ret = sscanf(str, "sigmask: %llx", (unsigned long long *)&sfd->sigmask); if (ret != 1) goto parse_err; entry_met = true; continue; } if (fdinfo_field(str, "fanotify flags")) { FanotifyFileEntry *fe = arg; if (type != FD_TYPES__FANOTIFY) goto parse_err; ret = sscanf(str, "fanotify flags:%x event-flags:%x", &fe->faflags, &fe->evflags); if (ret != 2) goto parse_err; entry_met = true; continue; } if (fdinfo_field(str, "fanotify ino")) { void *buf, *ob; FanotifyFileEntry *fe = arg; FanotifyMarkEntry *me; int hoff = 0, i; if (type != FD_TYPES__FANOTIFY) goto parse_err; ob = buf = xmalloc(sizeof(FanotifyMarkEntry) + sizeof(FanotifyInodeMarkEntry) + sizeof(FhEntry) + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); if (!buf) goto out; me = xptr_pull(&buf, FanotifyMarkEntry); fanotify_mark_entry__init(me); me->ie = xptr_pull(&buf, FanotifyInodeMarkEntry); fanotify_inode_mark_entry__init(me->ie); me->ie->f_handle = xptr_pull(&buf, FhEntry); fh_entry__init(me->ie->f_handle); me->ie->f_handle->n_handle = FH_ENTRY_SIZES__min_entries; me->ie->f_handle->handle = xptr_pull_s(&buf, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); ret = sscanf(str, "fanotify ino:%" PRIx64 " sdev:%x mflags:%x mask:%x ignored_mask:%x " "fhandle-bytes:%x fhandle-type:%x f_handle: %n", &me->ie->i_ino, &me->s_dev, &me->mflags, &me->mask, &me->ignored_mask, &me->ie->f_handle->bytes, &me->ie->f_handle->type, &hoff); if (ret != 7 || hoff == 0) { xfree(ob); goto parse_err; } parse_fhandle_encoded(str + hoff, me->ie->f_handle); me->type = MARK_TYPE__INODE; i = fe->n_mark++; if (xrealloc_safe(&fe->mark, fe->n_mark * sizeof(FanotifyMarkEntry *))) { xfree(ob); goto out; } fe->mark[i] = me; entry_met = true; continue; } if (fdinfo_field(str, "fanotify mnt_id")) { void *buf, *ob; FanotifyFileEntry *fe = arg; FanotifyMarkEntry *me; int i; if (type != FD_TYPES__FANOTIFY) goto parse_err; ob = buf = xmalloc(sizeof(FanotifyMarkEntry) + sizeof(FanotifyMountMarkEntry)); if (!buf) goto out; me = xptr_pull(&buf, FanotifyMarkEntry); fanotify_mark_entry__init(me); me->me = xptr_pull(&buf, FanotifyMountMarkEntry); fanotify_mount_mark_entry__init(me->me); ret = sscanf(str, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x", &me->me->mnt_id, &me->mflags, &me->mask, &me->ignored_mask); if (ret != 4) { xfree(ob); goto parse_err; } me->type = MARK_TYPE__MOUNT; i = fe->n_mark++; if (xrealloc_safe(&fe->mark, fe->n_mark * sizeof(FanotifyMarkEntry *))) { xfree(ob); goto out; } fe->mark[i] = me; entry_met = true; continue; } if (fdinfo_field(str, "inotify wd")) { void *buf, *ob; InotifyFileEntry *ie = arg; InotifyWdEntry *ify; int hoff, i; if (type != FD_TYPES__INOTIFY) goto parse_err; ob = buf = xmalloc(sizeof(InotifyWdEntry) + sizeof(FhEntry) + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); if (!buf) goto out; ify = xptr_pull(&buf, InotifyWdEntry); inotify_wd_entry__init(ify); ify->f_handle = xptr_pull(&buf, FhEntry); fh_entry__init(ify->f_handle); ify->f_handle->n_handle = FH_ENTRY_SIZES__min_entries; ify->f_handle->handle = xptr_pull_s(&buf, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); ret = sscanf(str, "inotify wd:%x ino:%" PRIx64 " sdev:%x " "mask:%x ignored_mask:%x " "fhandle-bytes:%x fhandle-type:%x " "f_handle: %n", &ify->wd, &ify->i_ino, &ify->s_dev, &ify->mask, &ify->ignored_mask, &ify->f_handle->bytes, &ify->f_handle->type, &hoff); if (ret != 7) { xfree(ob); goto parse_err; } parse_fhandle_encoded(str + hoff, ify->f_handle); i = ie->n_wd++; if (xrealloc_safe(&ie->wd, ie->n_wd * sizeof(InotifyWdEntry *))) { xfree(ob); goto out; } ie->wd[i] = ify; entry_met = true; continue; } if (fdinfo_field(str, "map_type")) { BpfmapFileEntry *bpf = arg; if (type != FD_TYPES__BPFMAP) goto parse_err; ret = parse_bpfmap(&f, str, bpf); if (ret) goto parse_err; entry_met = true; continue; } } exit_code = 0; if (entry_met) goto out; /* * An eventpoll/inotify file may have no target fds set thus * resulting in no tfd: lines in proc. This is normal. */ if (type == FD_TYPES__EVENTPOLL || type == FD_TYPES__INOTIFY) goto out; pr_err("No records of type %d found in fdinfo file\n", type); parse_err: exit_code = -1; pr_perror("%s: error parsing [%s] for %d", __func__, str, type); out: bclose(&f); return exit_code; } int parse_fdinfo_pid(int pid, int fd, int type, void *arg) { return parse_fdinfo_pid_s(pid, fd, type, arg); } int parse_fdinfo(int fd, int type, void *arg) { return parse_fdinfo_pid_s(PROC_SELF, fd, type, arg); } int get_fd_mntid(int fd, int *mnt_id) { struct fdinfo_common fdinfo = { .mnt_id = -1 }; if (parse_fdinfo(fd, FD_TYPES__UND, &fdinfo)) return -1; *mnt_id = fdinfo.mnt_id; return 0; } static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) { int num; char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } if (num < 10) { pr_err("Invalid file lock info (%d): %s\n", num, buf); return -1; } if (!strcmp(fl_flag, "POSIX")) fl->fl_kind = FL_POSIX; else if (!strcmp(fl_flag, "FLOCK")) fl->fl_kind = FL_FLOCK; else if (!strcmp(fl_flag, "OFDLCK")) fl->fl_kind = FL_OFD; else if (!strcmp(fl_flag, "LEASE")) fl->fl_kind = FL_LEASE; else fl->fl_kind = FL_UNKNOWN; if (fl->fl_kind == FL_LEASE && !strcmp(fl_type, "BREAKING")) { fl->fl_ltype |= LEASE_BREAKING; } if (!strcmp(fl_type, "MSNFS")) { fl->fl_ltype |= LOCK_MAND; if (!strcmp(fl_option, "READ")) { fl->fl_ltype |= LOCK_READ; } else if (!strcmp(fl_option, "RW")) { fl->fl_ltype |= LOCK_RW; } else if (!strcmp(fl_option, "WRITE")) { fl->fl_ltype |= LOCK_WRITE; } else { pr_err("Unknown lock option!\n"); return -1; } } else { if (!strcmp(fl_option, "UNLCK")) { fl->fl_ltype |= F_UNLCK; } else if (!strcmp(fl_option, "WRITE")) { fl->fl_ltype |= F_WRLCK; } else if (!strcmp(fl_option, "READ")) { fl->fl_ltype |= F_RDLCK; } else { pr_err("Unknown lock option!\n"); return -1; } } return 0; } static bool pid_in_pstree(pid_t pid) { return pstree_item_by_real(pid) != NULL; } int parse_file_locks(void) { struct file_lock *fl; FILE *fl_locks; int exit_code = -1; bool is_blocked; if (kdat.has_fdinfo_lock) return 0; fl_locks = fopen_proc(PROC_GEN, "locks"); if (!fl_locks) return -1; while (fgets(buf, BUF_SIZE, fl_locks)) { is_blocked = strstr(buf, "->") != NULL; fl = alloc_file_lock(); if (!fl) { pr_perror("Alloc file lock failed!"); goto err; } if (parse_file_lock_buf(buf, fl, is_blocked)) { xfree(fl); goto err; } pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n", fl->fl_id, fl->fl_kind, fl->fl_ltype, fl->fl_owner, fl->maj, fl->min, fl->i_no, fl->start, fl->end); if (fl->fl_kind == FL_UNKNOWN) { pr_err("Unknown file lock: %s!\n", buf); xfree(fl); goto err; } if (is_blocked) { /* * All target processes are stopped in this moment and * can't wait any locks. */ pr_debug("Skip blocked processes\n"); xfree(fl); continue; } if ((fl->fl_kind == FL_POSIX) && !pid_in_pstree(fl->fl_owner)) { /* * We only care about tasks which are taken * into dump, so we only collect file locks * belong to these tasks. */ xfree(fl); continue; } list_add_tail(&fl->list, &file_lock_list); } exit_code = 0; err: fclose(fl_locks); return exit_code; } void free_posix_timers(struct proc_posix_timers_stat *st) { while (!list_empty(&st->timers)) { struct proc_posix_timer *timer; timer = list_first_entry(&st->timers, struct proc_posix_timer, list); list_del(&timer->list); xfree(timer); } } int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args) { int exit_code = -1; int pid_t; int i = 0; struct bfd f; char *s; char sigpid[7]; char tidpid[4]; struct proc_posix_timer *timer = NULL; INIT_LIST_HEAD(&args->timers); args->timer_n = 0; f.fd = open_proc(pid, "timers"); if (f.fd < 0) return -1; if (bfdopenr(&f)) return -1; while (1) { char pbuf[17]; /* 16 + eol */ s = breadline(&f); if (!s) break; if (IS_ERR(s)) goto err; switch (i % 4) { case 0: timer = xzalloc(sizeof(struct proc_posix_timer)); if (timer == NULL) goto err; if (sscanf(s, "ID: %ld", &timer->spt.it_id) != 1) goto err; break; case 1: if (sscanf(s, "signal: %d/%16s", &timer->spt.si_signo, pbuf) != 2) goto err; break; case 2: if (sscanf(s, "notify: %6[a-z]/%3[a-z].%d\n", sigpid, tidpid, &pid_t) != 3) goto err; break; case 3: if (sscanf(s, "ClockID: %d\n", &timer->spt.clock_id) != 1) goto err; timer->spt.sival_ptr = NULL; if (sscanf(pbuf, "%p", &timer->spt.sival_ptr) != 1 && strcmp(pbuf, "(null)")) { pr_err("Unable to parse '%s'\n", pbuf); goto err; } if (tidpid[0] == 't') { timer->spt.it_sigev_notify = SIGEV_THREAD_ID; timer->spt.notify_thread_id = pid_t; } else { switch (sigpid[0]) { case 's': timer->spt.it_sigev_notify = SIGEV_SIGNAL; break; case 't': timer->spt.it_sigev_notify = SIGEV_THREAD; break; default: timer->spt.it_sigev_notify = SIGEV_NONE; break; } } list_add(&timer->list, &args->timers); timer = NULL; args->timer_n++; break; } i++; } exit_code = 0; out: bclose(&f); return exit_code; err: xfree(timer); free_posix_timers(args); pr_perror("Parse error in posix timers proc file!"); goto out; } int parse_threads(int pid, struct pid **_t, int *_n) { struct dirent *de; DIR *dir; struct pid *t = NULL; int nr = 1; if (*_t) t = *_t; dir = opendir_proc(pid, "task"); if (!dir) return -1; while ((de = readdir(dir))) { struct pid *tmp; /* We expect numbers only here */ if (de->d_name[0] == '.') continue; if (*_t == NULL) { tmp = xrealloc(t, nr * sizeof(struct pid)); if (!tmp) { xfree(t); closedir(dir); return -1; } t = tmp; t[nr - 1].ns[0].virt = -1; } t[nr - 1].real = atoi(de->d_name); t[nr - 1].state = TASK_THREAD; nr++; } closedir(dir); if (*_t == NULL) { *_t = t; *_n = nr - 1; } else BUG_ON(nr - 1 != *_n); return 0; } int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n) { while (fgets(buf, BUF_SIZE, f)) { struct cg_ctl *ncc, *cc; char *name, *path = NULL, *e; ncc = xmalloc(sizeof(*cc)); if (!ncc) goto err; /* * Typical output (':' is a separator here) * * 4:cpu,cpuacct:/ * 3:cpuset:/ * 2:name=systemd:/user.slice/user-1000.slice/session-1.scope */ name = strchr(buf, ':'); if (!name) { pr_err("Failed parsing cgroup %s\n", buf); xfree(ncc); goto err; } path = strchr(++name, ':'); if (!path) { pr_err("Failed parsing cgroup %s\n", buf); xfree(ncc); goto err; } e = strchr(name, '\n'); *path++ = '\0'; if (e) *e = '\0'; /* * Controllers and their props might be * configured the way some of them are * not taken into the image for migration * sake or container specifics. */ if (cgp_should_skip_controller(name)) { pr_debug("cg-prop: Skipping controller %s\n", name); xfree(ncc); continue; } ncc->name = xstrdup(name); ncc->path = xstrdup(path); ncc->cgns_prefix = 0; if (!ncc->name || !ncc->path) { xfree(ncc->name); xfree(ncc->path); xfree(ncc); goto err; } list_for_each_entry(cc, retl, l) if (strcmp(cc->name, name) >= 0) break; list_add_tail(&ncc->l, &cc->l); (*n)++; } return 0; err: put_ctls(retl); return -1; } int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n) { FILE *f; int ret; LIST_HEAD(internal); unsigned int n_internal = 0; struct cg_ctl *intern, *ext; f = fopen_proc(pid, "cgroup"); if (!f) return -1; ret = parse_cgroup_file(f, retl, n); fclose(f); if (ret < 0) return -1; /* No parasite args, we're dumping criu's cg set, so we don't need to * try and parse the "internal" cgroup set to find namespace * boundaries. */ if (!args) return 0; f = fmemopen(args->contents, strlen(args->contents), "r"); if (!f) { pr_perror("couldn't fmemopen cgroup buffer %s", args->contents); return -1; } ret = parse_cgroup_file(f, &internal, &n_internal); fclose(f); if (ret < 0) { pr_err("couldn't parse internal cgroup file\n"); return -1; } /* Here's where we actually compute the cgns prefix. Consider a task * in /foo/bar which has unshared its namespace at /foo. The internal * path is /bar, but the external path is /foo/bar, and the cgns * prefix is /foo. The algorithm is: * * // no cg ns unshare in this case * if (internal == external) * continue; * idx = find_suffix_pos(external, internal) * cgns_prefix = external[:idx] */ list_for_each_entry(intern, &internal, l) { list_for_each_entry(ext, retl, l) { char *pos; if (strcmp(ext->name, intern->name)) continue; /* If the cgroup namespace was unshared at / (or there * is no cgroup namespace relative to criu), the paths * are equal and we don't need to set a prefix. */ if (!strcmp(ext->path, intern->path)) continue; /* +1 here to chop off the leading / */ pos = ext->path + strlen(ext->path) - strlen(intern->path + 1); if (strcmp(pos, intern->path + 1)) { pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path, ext->path); ret = -1; goto out; } ext->cgns_prefix = pos - ext->path; if (ext->path[ext->cgns_prefix - 1] == '/') ext->cgns_prefix--; } } out: put_ctls(&internal); return ret; } void put_ctls(struct list_head *l) { struct cg_ctl *c, *n; list_for_each_entry_safe(c, n, l, l) { xfree(c->name); xfree(c->path); xfree(c); } INIT_LIST_HEAD(l); } /* Parse and create all the real controllers. This does not include things with * the "name=" prefix, e.g. systemd. */ int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups) { int exit_code = -1; FILE *f; f = fopen_proc(PROC_SELF, "cgroup"); if (f == NULL) return -1; while (fgets(buf, BUF_SIZE, f)) { struct cg_controller *nc = NULL; char *controllers, *off; controllers = strchr(buf, ':'); if (!controllers) { pr_err("Unable to parse \"%s\"\n", buf); goto err; } controllers++; off = strchr(controllers, ':'); if (!off) { pr_err("Unable to parse \"%s\"\n", buf); goto err; } *off = '\0'; if (cgp_should_skip_controller(controllers)) { pr_debug("cg-prop: Skipping controller %s\n", controllers); continue; } while (1) { off = strchr(controllers, ','); if (off) *off = '\0'; if (!strncmp("name=", controllers, 5)) goto skip; if (!nc) { nc = new_controller(controllers); if (!nc) goto err; list_add_tail(&nc->l, cgroups); (*n_cgroups)++; } else { void *m; char *n; nc->n_controllers++; m = xrealloc(nc->controllers, sizeof(char *) * nc->n_controllers); if (!m) goto err; nc->controllers = m; n = xstrdup(controllers); if (!n) goto err; nc->controllers[nc->n_controllers - 1] = n; } skip: if (!off) break; controllers = off + 1; } } exit_code = 0; err: fclose(f); return exit_code; } /* * If an OverlayFS mountpoint is found in the mountinfo table, * we enable opts.overlayfs, which is a workaround for the * OverlayFS Kernel bug. * * See fixup_overlayfs for details. */ int overlayfs_parse(struct mount_info *new) { opts.overlayfs = true; return 0; } /* * AUFS callback function to "fix up" the root pathname. * See sysfs_parse.c for details. */ int aufs_parse(struct mount_info *new) { int ret = 0; if (!strcmp(new->ns_mountpoint, "./")) { opts.aufs = true; ret = parse_aufs_branches(new); } return ret; } int parse_children(pid_t pid, pid_t **_c, int *_n) { pid_t *ch = NULL; int nr = 0; DIR *dir; struct dirent *de; struct bfd f; dir = opendir_proc(pid, "task"); if (dir == NULL) return -1; while ((de = readdir(dir))) { char *pos, *end; if (dir_dots(de)) continue; f.fd = open_proc(pid, "task/%s/children", de->d_name); if (f.fd < 0) goto err; if (bfdopenr(&f)) goto err; while (1) { pid_t val, *tmp; pos = breadchr(&f, ' '); if (IS_ERR(pos)) goto err_close; if (pos == NULL) break; val = strtol(pos, &end, 0); if (*end != 0 && *end != ' ') { pr_err("Unable to parse %s\n", end); goto err_close; } tmp = xrealloc(ch, (nr + 1) * sizeof(pid_t)); if (!tmp) goto err_close; ch = tmp; ch[nr] = val; nr++; } bclose(&f); } *_c = ch; *_n = nr; closedir(dir); return 0; err_close: bclose(&f); err: closedir(dir); xfree(ch); return -1; } #define CSEC_PER_SEC 100 int parse_uptime(uint64_t *upt) { unsigned long sec, csec; FILE *f; f = fopen("/proc/uptime", "r"); if (!f) { pr_perror("Failed to fopen /proc/uptime"); return -1; } if (fscanf(f, "%lu.%2lu", &sec, &csec) != 2) { pr_perror("Failed to parse /proc/uptime"); fclose(f); return -1; } *upt = sec * USEC_PER_SEC + csec * (USEC_PER_SEC / CSEC_PER_SEC); fclose(f); return 0; }