diff options
author | Cyrill Gorcunov <gorcunov@gmail.com> | 2011-09-23 12:00:45 +0400 |
---|---|---|
committer | Cyrill Gorcunov <gorcunov@gmail.com> | 2011-09-23 12:00:45 +0400 |
commit | 523de236244946a0de127dfc9954369963819ef7 (patch) | |
tree | b6001e027216b31c278d2ab15ef72ce7d58c3c9a |
Initial commit (init)
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
53 files changed, 12644 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..1a537e27d --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +*.o +*.d +*.img +*.bin +*.elf +*.out +cscope* +tags +TAGS diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..8972f07d7 --- /dev/null +++ b/Makefile @@ -0,0 +1,171 @@ +ifeq ($(strip $(V)),) + E = @echo + Q = @ +else + E = @\# + Q = +endif +export E Q + +FIND := find +CSCOPE := cscope +TAGS := ctags +RM := rm +LD := ld +HEXDUMP := hexdump +CC := gcc +ECHO := echo +NM := nm +AWK := awk +SH := sh + +CFLAGS += -I./include +CFLAGS += -O0 -ggdb3 + +LIBS += -lrt + +# Additional ARCH settings for x86 +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ + -e s/arm.*/arm/ -e s/sa110/arm/ \ + -e s/s390x/s390/ -e s/parisc64/parisc/ \ + -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \ + -e s/sh[234].*/sh/ ) + +uname_M := $(shell uname -m | sed -e s/i.86/i386/) +ifeq ($(uname_M),i386) + ARCH := x86 + DEFINES += -DCONFIG_X86_32 +endif +ifeq ($(uname_M),x86_64) + ARCH := x86 + DEFINES += -DCONFIG_X86_64 +endif + +DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_GNU_SOURCE + +ifneq ($(WERROR),0) + WARNINGS += -Werror +endif + +WARNINGS += -Wall -Wno-unused +CFLAGS += $(WARNINGS) $(DEFINES) + +PROGRAM := crtools +TESTEE := testee +TESTEE-TH := testee-threads +TESTEE-STATIC := testee-static + +all: $(PROGRAM) $(TESTEE) $(TESTEE-TH) $(TESTEE-STATIC) + +OBJS += crtools.o +OBJS += parasite-syscall.o +OBJS += cr-dump.o +OBJS += cr-restore.o +OBJS += cr-show.o +OBJS += util.o +OBJS += rbtree.o +OBJS += elf.o + +OBJS-TESTEE += testee.o + +OBJS-TESTEE-TH += testee-threads.o + +OBJS-BLOB += parasite.o + +DEPS := $(patsubst %.o,%.d,$(OBJS)) +DEPS-TESTEE := $(patsubst %.o,%.d,$(OBJS-TESTEE)) +DEPS-TESTEE-TH := $(patsubst %.o,%.d,$(OBJS-TESTEE-TH)) +DEPS-BLOB := $(patsubst %.o,%.d,$(OBJS-BLOB)) + +SRCS-BLOB += $(patsubst %.o,%.c,$(OBJS-BLOB)) + +HEAD-BLOB := $(patsubst %.o,%.h,$(OBJS-BLOB)) +HEAD-BLOB-GEN := $(patsubst 
%.o,%-blob.h,$(OBJS-BLOB)) +HEAD-BIN := $(patsubst %.o,%.bin,$(OBJS-BLOB)) +HEAD-LDS := $(patsubst %.o,%.lds.S,$(OBJS-BLOB)) + +HEAD-IDS := $(patsubst %.h,%_h__,$(subst -,_,$(HEAD-BLOB))) + +$(OBJS-BLOB): $(SRCS-BLOB) $(DEPS-BLOB) + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) -fpic $< -o $@ + +$(HEAD-BIN): $(OBJS-BLOB) $(HEAD-LDS) +%.bin: %.o + $(E) " GEN " $@ + $(Q) $(LD) -T $(patsubst %.bin,%.lds.S,$@) $< -o $@ + $(Q) $(LD) -T $(patsubst %.bin,%-elf.lds.S,$@) $< -o $@.o + +$(HEAD-BLOB): $(DEPS-BLOB) $(HEAD-BIN) +%-blob.h: %.bin +%.h: %.bin + $(E) " GEN " $@ + $(Q) $(SH) gen-offsets.sh \ + $(subst -,_,$(patsubst %.h,%,$@))_h__ \ + $(subst -,_,$(patsubst %.h,%,$@))_blob_offset__ \ + $(subst -,_,$(patsubst %.h,%,$@))_blob \ + $(patsubst %.h,%.o,$@) \ + $(patsubst %.h,%.bin,$@) > $(patsubst %.h,%-blob.h,$@) + +$(OBJS): $(HEAD-BLOB) $(DEPS) +$(OBJS-TESTEE): $(DEPS-TESTEE) +$(OBJS-TESTEE-TH): $(DEPS-TESTEE-TH) +%.o: %.c + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) $< -o $@ + +$(PROGRAM): $(OBJS) + $(E) " LINK " $@ + $(Q) $(CC) $(OBJS) $(LIBS) -o $@ + +$(TESTEE): $(OBJS-TESTEE) + $(E) " LINK " $@ + $(Q) $(CC) $(OBJS-TESTEE) -o $@ + +$(TESTEE-TH): $(OBJS-TESTEE-TH) + $(E) " LINK " $@ + $(Q) $(CC) $(OBJS-TESTEE-TH) -lpthread -o $@ + +$(TESTEE-STATIC).o: testee-static.c + $(Q) gcc -c -static -I./.include -o testee-static.o testee-static.c + +$(TESTEE-STATIC): $(TESTEE-STATIC).o + $(Q) gcc -o testee-static -static testee-static.o + +$(DEPS): +$(DEPS-TESTEE): +$(DEPS-TESTEE-TH): +$(DEPS-BLOB): +%.d: %.c + $(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@ + +clean: + $(E) " CLEAN" + $(Q) rm -f ./*.o + $(Q) rm -f ./*.d + $(Q) rm -f ./*.img + $(Q) rm -f ./*.elf + $(Q) rm -f ./*.out + $(Q) rm -f ./*.bin + $(Q) rm -f ./tags + $(Q) rm -f ./cscope* + $(Q) rm -f ./$(PROGRAM) + $(Q) rm -f ./$(TESTEE) + $(Q) rm -f ./$(TESTEE-STATIC) + $(Q) rm -f ./$(TESTEE-TH) + $(Q) rm -f ./$(HEAD-BLOB) + $(Q) rm -f ./$(HEAD-BLOB-GEN) +.PHONY: clean + +tags: + $(E) " GEN" $@ + $(Q) $(RM) -f tags 
+ $(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a +.PHONY: tags + +cscope: + $(E) " GEN" $@ + $(Q) $(FIND) . -name '*.[hcS]' -print > cscope.files + $(Q) $(CSCOPE) -bkqu +.PHONY: cscope @@ -0,0 +1,15 @@ +crtools +======= + +A utility to checkpoint/restore tasks. + +Some code is borrowed from + + - Linux kernel (http://kernel.org/) + - git (http://git-scm.com/) + - kvm-tools (https://github.com/penberg/linux-kvm.git) + - ptrace-parasite (https://code.google.com/p/ptrace-parasite/) + +Many thanks to these projects. + +Licensed under GPLv2 diff --git a/cr-dump.c b/cr-dump.c new file mode 100644 index 000000000..57036d105 --- /dev/null +++ b/cr-dump.c @@ -0,0 +1,977 @@ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <dirent.h> + +#include <fcntl.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/vfs.h> +#include <sys/ptrace.h> +#include <sys/user.h> +#include <sys/wait.h> + +#include <sys/sendfile.h> + +#include "types.h" +#include "list.h" + +#include "compiler.h" +#include "crtools.h" +#include "syscall.h" +#include "util.h" + +#include "image.h" + +#include "parasite.h" +#include "parasite-syscall.h" +#include "parasite-blob.h" + +#ifndef CONFIG_X86_64 +# error No x86-32 support yet +#endif + +static LIST_HEAD(vma_area_list); +static LIST_HEAD(pstree_list); + +static char big_buffer[PATH_MAX]; +static struct parasite_ctl *parasite_ctl; + +static char loc_buf[PAGE_SIZE]; + +static void free_pstree(void) +{ + struct pstree_item *item, *p; + + list_for_each_entry_safe(item, p, &pstree_list, list) { + xfree(item->children); + xfree(item); + } + + INIT_LIST_HEAD(&pstree_list); +} + +static void free_mappings(void) +{ + struct vma_area *vma_area, *p; + + list_for_each_entry_safe(vma_area, p, &vma_area_list, list) { + if (vma_area->vm_file_fd > 0) + close(vma_area->vm_file_fd); + free(vma_area); + } + + 
INIT_LIST_HEAD(&vma_area_list); +} + +static int collect_mappings(pid_t pid) +{ + struct vma_area *vma_area; + int ret = -1; + + pr_info("\n"); + pr_info("Collecting mappings (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + ret = parse_maps(pid, &vma_area_list); + if (ret) + goto err; + + pr_info_vma_list(&vma_area_list); + + pr_info("----------------------------------------\n"); + +err: + return ret; + +err_bogus_mapping: + pr_error("Bogus mapping %lx-%lx\n", + vma_area->vma.start, + vma_area->vma.end); + goto err; +} + +static int dump_one_reg_file(int type, unsigned long fd_name, int lfd, + bool do_close, unsigned long pos, unsigned int flags, + struct cr_fdset *cr_fdset) +{ + struct fdinfo_entry e; + char fd_str[128]; + int len; + int ret = -1; + + snprintf(fd_str, sizeof(fd_str), "/proc/self/fd/%d", lfd); + len = readlink(fd_str, big_buffer, sizeof(big_buffer) - 1); + if (len < 0) { + pr_perror("Can't readlink %s\n", fd_str); + goto err; + } + + big_buffer[len] = '\0'; + pr_info("Dumping path for %lx fd via self %d [%s]\n", + fd_name, lfd, big_buffer); + + if (do_close) + close(lfd); + + e.type = type; + e.len = len; + e.flags = flags; + e.pos = pos; + e.addr = fd_name; + + pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8x addr: %16lx\n", + type, len, flags, pos, fd_name); + + write_ptr_safe(cr_fdset->desc[CR_FD_FDINFO].fd, &e, err); + write_safe(cr_fdset->desc[CR_FD_FDINFO].fd, big_buffer, e.len, err); + + ret = 0; +err: + return ret; +} + +static int dump_pipe_and_data(int lfd, struct pipe_entry *e, + struct cr_fdset *cr_fdset) +{ + int fd_pipes; + int steal_pipe[2]; + int pipe_size; + int has_bytes; + int ret = -1; + + fd_pipes = cr_fdset->desc[CR_FD_PIPES].fd; + + pr_info("Dumping data from pipe %x\n", e->pipeid); + if (pipe(steal_pipe) < 0) { + pr_perror("Can't create pipe for stealing data\n"); + goto err; + } + + pipe_size = fcntl(lfd, F_GETPIPE_SZ); + if (pipe_size < 0) { + pr_error("Can't obtain piped data 
size\n"); + goto err; + } + + has_bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK); + if (has_bytes < 0) { + if (errno != EAGAIN) { + pr_perror("Can't pick pipe data\n"); + goto err_close; + } else + has_bytes = 0; + } + + e->bytes = has_bytes; + write_ptr_safe(fd_pipes, e, err_close); + + if (has_bytes) { + ret = splice(steal_pipe[0], NULL, fd_pipes, + NULL, has_bytes, 0); + if (ret < 0) { + pr_perror("Can't push pipe data\n"); + goto err_close; + } + } + + ret = 0; + +err_close: + close(steal_pipe[0]); + close(steal_pipe[1]); + +err: + return ret; +} + +static int dump_one_pipe(int fd, int lfd, unsigned int id, unsigned int flags, + struct cr_fdset *cr_fdset) +{ + struct pipe_entry e; + int ret = -1; + + pr_info("Dumping pipe %d/%x flags %x\n", fd, id, flags); + + e.fd = fd; + e.pipeid = id; + e.flags = flags; + + if (flags & O_WRONLY) { + e.bytes = 0; + write_ptr_safe(cr_fdset->desc[CR_FD_PIPES].fd, &e, err); + ret = 0; + } else + ret = dump_pipe_and_data(lfd, &e, cr_fdset); + +err: + if (!ret) + pr_info("Dumped pipe: fd: %8lx pipeid: %8lx flags: %8lx bytes: %8lx\n", + e.fd, e.pipeid, e.flags, e.bytes); + else + pr_error("Dumping pipe %d/%x flags %x\n", fd, id, flags); + + return ret; +} + +static int dump_one_fd(char *pid_fd_dir, int dir, char *fd_name, unsigned long pos, + unsigned int flags, struct cr_fdset *cr_fdset) +{ + struct statfs stfs_buf; + struct stat st_buf; + int fd; + + fd = openat(dir, fd_name, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to openat %s/%d %s\n", pid_fd_dir, dir, fd_name); + return -1; + } + + if (fstat(fd, &st_buf) < 0) { + pr_perror("Can't get stat on %s\n", fd_name); + return -1; + } + + if (S_ISREG(st_buf.st_mode)) + return dump_one_reg_file(FDINFO_FD, atol(fd_name), + fd, 1, pos, flags, cr_fdset); + + if (S_ISFIFO(st_buf.st_mode)) { + if (fstatfs(fd, &stfs_buf) < 0) { + pr_perror("Can't fstatfs on %s\n", fd_name); + return -1; + } + + if (stfs_buf.f_type == PIPEFS_MAGIC) + return dump_one_pipe(atol(fd_name), fd, 
+ st_buf.st_ino, flags, cr_fdset); + } + + if (!strcmp(fd_name, "0")) { + pr_info("... Skipping stdin ...\n"); + return 0; + } + + if (!strcmp(fd_name, "1")) { + pr_info("... Skipping stdout ...\n"); + return 0; + } + + if (!strcmp(fd_name, "2")) { + pr_info("... Skipping stderr ...\n"); + return 0; + } + + if (!strcmp(fd_name, "3")) { + pr_info("... Skipping tty ...\n"); + return 0; + } + + pr_error("Can't dump file %s of that type [%x]\n", fd_name, st_buf.st_mode); + return 1; +} + +static int read_fd_params(pid_t pid, char *fd, unsigned long *pos, unsigned int *flags) +{ + char fd_str[128]; + int ifd; + + snprintf(fd_str, sizeof(fd_str), "/proc/%d/fdinfo/%s", pid, fd); + + ifd = open(fd_str, O_RDONLY); + if (ifd < 0) { + pr_perror("Can't open %s\n", fd_str); + return -1; + } + + read(ifd, big_buffer, sizeof(big_buffer)); + close(ifd); + + sscanf(big_buffer, "pos:\t%li\nflags:\t%o\n", pos, flags); + + pr_info("%s: pos: %16lx flags: %16lx\n", fd_str, *pos, *flags); + + return 0; +} + +static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset) +{ + char pid_fd_dir[64]; + struct dirent *de; + unsigned long pos; + unsigned int flags; + DIR *fd_dir; + + pr_info("\n"); + pr_info("Dumping opened files (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + snprintf(pid_fd_dir, sizeof(pid_fd_dir), "/proc/%d/fd", pid); + fd_dir = opendir(pid_fd_dir); + if (!fd_dir) { + pr_perror("Can't open %s\n", pid_fd_dir); + return -1; + } + + while ((de = readdir(fd_dir))) { + if (de->d_name[0] == '.') + continue; + if (read_fd_params(pid, de->d_name, &pos, &flags)) + return -1; + if (dump_one_fd(pid_fd_dir, dirfd(fd_dir), de->d_name, pos, flags, cr_fdset)) + return -1; + } + + pr_info("----------------------------------------\n"); + + closedir(fd_dir); + return 0; +} + +static int dump_task_mappings(pid_t pid, struct cr_fdset *cr_fdset) +{ + struct vma_area *vma_area; + int ret = -1; + + pr_info("\n"); + pr_info("Dumping mappings (pid: %d)\n", pid); + 
pr_info("----------------------------------------\n"); + + list_for_each_entry(vma_area, &vma_area_list, list) { + + struct vma_entry *vma = &vma_area->vma; + + if (!(vma->status & VMA_AREA_REGULAR)) + continue; + + pr_info_vma(vma_area); + + switch (vma->flags) { + case MAP_SHARED: + case MAP_PRIVATE: + + if ((vma->status & VMA_ANON_SHARED)) { + struct shmem_entry e; + + e.start = vma->start; + e.end = vma->end; + e.shmid = vma_area->shmid; + + pr_info("shmem: s: %16lx e: %16lx shmid: %16lx\n", + e.start, e.end, e.shmid); + + write_ptr_safe(cr_fdset->desc[CR_FD_SHMEM].fd, &e, err); + } else if ((vma->status & VMA_FILE_PRIVATE) || + (vma->status & VMA_FILE_SHARED)) { + + unsigned int flags; + + if (vma->prot & PROT_WRITE && (vma->status & VMA_FILE_SHARED)) + flags = O_RDWR; + else + flags = O_RDONLY; + + ret = dump_one_reg_file(FDINFO_MAP, + vma->start, + vma_area->vm_file_fd, + 0, 0, flags, + cr_fdset); + if (ret) + goto err; + } + break; + default: + pr_panic("Unknown VMA (pid: %d)\n", pid); + goto err; + break; + } + } + + ret = 0; + + pr_info("----------------------------------------\n"); + +err: + return ret; +} + +#define assign_reg(dst, src, e) dst.e = (__typeof__(dst.e))src.e +#define assign_array(dst, src, e) memcpy(&dst.e, &src.e, sizeof(dst.e)) + +static int get_task_personality(pid_t pid, u32 *personality) +{ + FILE *file = NULL; + int ret = -1; + + snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/personality", pid); + file = fopen(loc_buf, "r"); + if (!file) { + perror("Can't open task personality"); + goto err; + } + + if (!fgets(loc_buf, sizeof(loc_buf), file)) { + perror("Can't read task personality"); + goto err; + } + + *personality = atoi(loc_buf); + ret = 0; + +err: + if (file) + fclose(file); + return ret; +} + +static int dump_task_tls(pid_t pid, struct desc_struct *tls_array, int size) +{ + FILE *file = NULL; + int ret = -1; + + if (size != GDT_ENTRY_TLS_ENTRIES) { + pr_error("Wrong TLS storage size: %d\n", size); + goto err; + } + + 
snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/tls", pid); + file = fopen(loc_buf, "r"); + if (!file) { + perror("Can't open task tls"); + goto err; + } + + ret = 0; + while (fgets(loc_buf, sizeof(loc_buf), file)) { + u32 a, b; + if (sscanf(loc_buf, "%x %x", &a, &b) != 2) { + pr_error("Can't parse tls entry: %s\n"); + ret = -1; + goto err; + } + if (ret >= GDT_ENTRY_TLS_ENTRIES) { + pr_error("Too many entries in tls\n"); + ret = -1; + goto err; + } + tls_array[ret].a = a; + tls_array[ret].b = b; + + ret++; + } + + if (ret != GDT_ENTRY_TLS_ENTRIES) { + pr_error("tls returened %i entries instead of %i\n", + ret, GDT_ENTRY_TLS_ENTRIES); + ret = -1; + goto err; + } + + ret = 0; + +err: + if (file) + fclose(file); + return ret; +} + +static int dump_task_core_seized(pid_t pid, struct cr_fdset *cr_fdset) +{ + struct core_entry *core = xzalloc(sizeof(*core)); + user_fpregs_struct_t fpregs = {-1}; + user_regs_struct_t regs = {-1}; + int fd_core = cr_fdset->desc[CR_FD_CORE].fd; + int ret = -1; + + pr_info("\n"); + pr_info("Dumping core (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + if (!core) + goto err; + + lseek(fd_core, MAGIC_OFFSET, SEEK_SET); + + jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_free); + jerr(ptrace(PTRACE_GETFPREGS, pid, NULL, &fpregs), err_free); + + pr_info("Dumping GP/FPU registers ... 
"); + + assign_reg(core->gpregs, regs, r15); + assign_reg(core->gpregs, regs, r14); + assign_reg(core->gpregs, regs, r13); + assign_reg(core->gpregs, regs, r12); + assign_reg(core->gpregs, regs, bp); + assign_reg(core->gpregs, regs, bx); + assign_reg(core->gpregs, regs, r11); + assign_reg(core->gpregs, regs, r10); + assign_reg(core->gpregs, regs, r9); + assign_reg(core->gpregs, regs, r8); + assign_reg(core->gpregs, regs, ax); + assign_reg(core->gpregs, regs, cx); + assign_reg(core->gpregs, regs, dx); + assign_reg(core->gpregs, regs, si); + assign_reg(core->gpregs, regs, di); + assign_reg(core->gpregs, regs, orig_ax); + assign_reg(core->gpregs, regs, ip); + assign_reg(core->gpregs, regs, cs); + assign_reg(core->gpregs, regs, flags); + assign_reg(core->gpregs, regs, sp); + assign_reg(core->gpregs, regs, ss); + assign_reg(core->gpregs, regs, fs_base); + assign_reg(core->gpregs, regs, gs_base); + assign_reg(core->gpregs, regs, ds); + assign_reg(core->gpregs, regs, es); + assign_reg(core->gpregs, regs, fs); + assign_reg(core->gpregs, regs, gs); + + assign_reg(core->fpregs, fpregs, cwd); + assign_reg(core->fpregs, fpregs, swd); + assign_reg(core->fpregs, fpregs, twd); + assign_reg(core->fpregs, fpregs, fop); + assign_reg(core->fpregs, fpregs, rip); + assign_reg(core->fpregs, fpregs, rdp); + assign_reg(core->fpregs, fpregs, mxcsr); + assign_reg(core->fpregs, fpregs, mxcsr_mask); + + assign_array(core->fpregs, fpregs, st_space); + assign_array(core->fpregs, fpregs, xmm_space); + assign_array(core->fpregs, fpregs, padding); + + pr_info("OK\n"); + + pr_info("Obtainting TLS ... "); + ret = dump_task_tls(pid, core->tls_array, ARRAY_SIZE(core->tls_array)); + if (ret) + goto err_free; + pr_info("OK\n"); + + pr_info("Obtainting personality ... "); + ret = get_task_personality(pid, &core->personality); + if (ret) + goto err_free; + pr_info("OK\n"); + + pr_info("Dumping header ... 
"); + core->hdr.version = HEADER_VERSION; + core->hdr.arch = HEADER_ARCH_X86_64; + core->hdr.flags = 0; + + write_ptr_safe(fd_core, core, err_free); + + pr_info("OK\n"); + ret = 0; + +err_free: + free(core); +err: + pr_info("----------------------------------------\n"); + + return ret; +} + +static struct pstree_item *find_children(pid_t pid) +{ + struct pstree_item *item = NULL; + u32 *children = NULL; + u32 nr_allocated = 0; + u32 nr_children = 0; + bool found = false; + FILE *file; + char *tok; + + pr_debug("pid: %d\n", pid); + + snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/status", pid); + file = fopen(loc_buf, "r"); + if (!file) { + perror("Can't open task status"); + goto err; + } + + while ((fgets(loc_buf, sizeof(loc_buf), file))) { + if (strncmp(loc_buf, "Children:", 9)) { + continue; + } else { + found = true; + break; + } + } + + fclose(file), file = NULL; + if (!found) { + pr_error("Children marker is not found\n"); + goto err; + } + + item = xzalloc(sizeof(*item)); + if (!item) + goto err; + + tok = strtok(&loc_buf[10], " \n"); + while (tok) { + u32 child_pid = atoi(tok); + + pr_debug("child_pid: %d\n", child_pid); + + if (nr_allocated <= nr_children) { + nr_allocated += 64; + if (xrealloc_safe((void **)&children, nr_allocated)) { + xfree(children); + xfree(item); + item = NULL; + goto err; + } + } + + children[nr_children++] = child_pid; + tok = strtok(NULL, " \n"); + } + + item->pid = pid; + item->nr_children = nr_children; + item->children = children; + +err: + return item; +} + +static int collect_pstree(pid_t pid) +{ + struct pstree_item *item; + unsigned long i; + int ret = -1; + + item = find_children(pid); + if (!item) + goto err; + + list_add_tail(&item->list, &pstree_list); + + for (i = 0; i < item->nr_children; i++) { + ret = collect_pstree(item->children[i]); + if (ret) + goto err; + } + ret = 0; + +err: + return ret; +} + +static int dump_pstree(pid_t pid, struct cr_fdset *cr_fdset) +{ + struct pstree_item *item; + struct pstree_entry e; + 
unsigned long i; + int ret = -1; + + pr_info("\n"); + pr_info("Dumping pstree (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + list_for_each_entry(item, &pstree_list, list) { + + pr_info("Process: %d (%d children)\n", + item->pid, item->nr_children); + + e.pid = item->pid; + e.nr_children = item->nr_children; + + write_ptr_safe(cr_fdset->desc[CR_FD_PSTREE].fd, &e, err); + + pr_info("Children:"); + for (i = 0; i < item->nr_children; i++) { + pr_info(" %d", item->children[i]); + write_ptr_safe(cr_fdset->desc[CR_FD_PSTREE].fd, + &item->children[i], err); + } + pr_info("\n"); + } + ret = 0; + +err: + pr_info("----------------------------------------\n"); + return ret; +} + +static struct vma_area *find_vma_by_addr(unsigned long addr) +{ + struct vma_area *vma_area; + + list_for_each_entry(vma_area, &vma_area_list, list) { + if (in_vma_area(vma_area, addr)) + return vma_area; + } + + return NULL; +} + +/* kernel expects a special format in core file */ +static int finalize_core(pid_t pid, struct cr_fdset *cr_fdset) +{ + int fd_pages, fd_pages_shmem, fd_core; + unsigned long num, num_anon; + struct vma_area *vma_area; + struct vma_entry ve; + int ret = -1; + u64 va; + + pr_info("\n"); + pr_info("Finalizing core (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + fd_core = cr_fdset->desc[CR_FD_CORE].fd; + fd_pages = cr_fdset->desc[CR_FD_PAGES].fd; + fd_pages_shmem = cr_fdset->desc[CR_FD_PAGES_SHMEM].fd; + + pr_debug("dsc: fd_core %d fd_pages %d fd_pages_shmem %d\n", + fd_core, fd_pages, fd_pages_shmem); + + lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET); + lseek(fd_pages, MAGIC_OFFSET, SEEK_SET); + lseek(fd_pages_shmem, MAGIC_OFFSET, SEEK_SET); + + num = 0; + pr_info("Appending VMAs ... 
"); + + /* All VMAs first */ + + list_for_each_entry(vma_area, &vma_area_list, list) { + ret = write(fd_core, &vma_area->vma, sizeof(vma_area->vma)); + if (ret != sizeof(vma_area->vma)) { + pr_perror("\nUnable to write vma entry (%li written)\n", num); + goto err; + } + num++; + } + + /* Ending marker */ + memset(&ve, 0, sizeof(ve)); + write_ptr_safe(fd_core, &ve, err); + + pr_info("OK (%li written)\n", num); + + num = 0; + num_anon = 0; + + pr_info("Appending pages ... "); + while (1) { + ret = read(fd_pages, &va, sizeof(va)); + if (!ret) + break; + if (ret != sizeof(va)) { + pr_perror("\nUnable to read VA of page (%li written)\n", num); + goto err; + } + + /* Ending marker */ + if (va == 0) { + write_ptr_safe(fd_core, &zero_page_entry, err); + write_ptr_safe(fd_pages_shmem, &zero_page_entry, err); + break; + } + + vma_area = find_vma_by_addr((unsigned long)va); + if (!vma_area) { + pr_panic("\nA page with address %lx is unknown\n", va); + goto err; + } + + /* + * Just in case if someone broke parasite page + * dumper code. 
+ */ + if (!vma_area_has(vma_area, VMA_AREA_REGULAR)) { + pr_panic("\nA page with address %lx has a wrong status\n", va); + goto err; + } + + if (vma_area_has(vma_area, VMA_ANON_PRIVATE) || + vma_area_has(vma_area, VMA_FILE_PRIVATE)) { + ret = write(fd_core, &va, sizeof(va)); + ret += sendfile(fd_core, fd_pages, NULL, PAGE_SIZE); + if (ret != sizeof(va) + PAGE_SIZE) { + pr_perror("\nUnable to write VMA_FILE_PRIVATE|VMA_ANON_PRIVATE " + "page (%li, %li written)\n", + num, num_anon); + goto err; + } + num++; + } else if (vma_area_has(vma_area, VMA_ANON_SHARED)) { + ret = write(fd_pages_shmem, &va, sizeof(va)); + ret += sendfile(fd_pages_shmem, fd_pages, NULL, PAGE_SIZE); + if (ret != sizeof(va) + PAGE_SIZE) { + pr_perror("\nUnable to write VMA_ANON_SHARED " + "page (%li, %li written)\n", + num, num_anon); + goto err; + } + num_anon++; + } else { + /* skip the page */ + lseek(fd_pages, PAGE_SIZE, SEEK_CUR); + } + } + ret = 0; + + pr_info("OK (%li written)\n", num + num_anon); + +err: + pr_info("----------------------------------------\n"); + return ret; + +err_strno: + pr_perror("Error catched\n"); + goto err; +} + +static int dump_one_task(pid_t pid, struct cr_fdset *cr_fdset) +{ + int ret = 0; + + pr_info("========================================\n"); + pr_info("Dumping task (pid: %d)\n", pid); + pr_info("========================================\n"); + + ret = collect_mappings(pid); + if (ret) { + pr_error("Collect mappings (pid: %d) failed with %d\n", pid, ret); + goto err; + } + + ret = seize_task(pid); + if (ret) { + pr_error("Failed to seize task (pid: %d) with %d\n", + pid, ret); + goto err; + } + + ret = dump_task_core_seized(pid, cr_fdset); + if (ret) { + pr_error("Dump core (pid: %d) failed with %d\n", pid, ret); + goto err; + } + + parasite_ctl = parasite_infect_seized(pid, NULL, &vma_area_list); + if (!parasite_ctl) { + pr_error("Can't infect (pid: %d) with parasite\n", pid); + goto err; + } + + ret = parasite_dump_pages_seized(parasite_ctl, 
&vma_area_list, + cr_fdset, CR_FD_PAGES); + if (ret) { + pr_error("Can't dump pages (pid: %d) with parasite\n", pid); + goto err; + } + + ret = parasite_cure_seized(&parasite_ctl, &vma_area_list); + if (ret) { + pr_error("Can't cure (pid: %d) from parasite\n", pid); + goto err; + } + + ret = unseize_task(pid); + if (ret) { + pr_error("Can't unsieze (pid: %d) task\n", pid); + goto err; + } + + ret = dump_task_files(pid, cr_fdset); + if (ret) { + pr_error("Dump files (pid: %d) failed with %d\n", pid, ret); + goto err; + } + + ret = dump_task_mappings(pid, cr_fdset); + if (ret) { + pr_error("Dump mappings (pid: %d) failed with %d\n", pid, ret); + goto err; + } + + ret = finalize_core(pid, cr_fdset); + if (ret) { + pr_error("Finalizing core (pid: %d) failed with %d\n", pid, ret); + goto err; + } + +err: + free_mappings(); + return ret; +} + +int cr_dump_tasks(pid_t pid, bool leader_only, int leave_stopped) +{ + struct cr_fdset *cr_fdset = NULL; + struct pstree_item *item; + int ret = -1; + + if (!leader_only) { + pr_info("========================================\n"); + pr_info("Dumping process group (pid: %d)\n", pid); + pr_info("========================================\n"); + } + + if (collect_pstree(pid)) + goto err; + + list_for_each_entry(item, &pstree_list, list) { + stop_task(item->pid); + if (leader_only) + break; + } + + /* Dump the process tree first */ + cr_fdset = alloc_cr_fdset(pid); + if (!cr_fdset) + goto err; + + if (prep_cr_fdset_for_dump(cr_fdset, CR_FD_DESC_USE(CR_FD_PSTREE))) + goto err; + if (dump_pstree(pid, cr_fdset)) + goto err; + + close_cr_fdset(cr_fdset); + free_cr_fdset(&cr_fdset); + + /* Now all other data */ + list_for_each_entry(item, &pstree_list, list) { + + cr_fdset = alloc_cr_fdset(item->pid); + if (!cr_fdset) + goto err; + if (prep_cr_fdset_for_dump(cr_fdset, CR_FD_DESC_NOPSTREE)) + goto err; + + if (dump_one_task(item->pid, cr_fdset)) + goto err; + + close_cr_fdset(cr_fdset); + free_cr_fdset(&cr_fdset); + + if (leader_only) + break; + } 
+ ret = 0; + +err: + if (!leave_stopped) { + list_for_each_entry(item, &pstree_list, list) { + continue_task(item->pid); + if (leader_only) + break; + } + } + + free_pstree(); + close_cr_fdset(cr_fdset); + free_cr_fdset(&cr_fdset); + return ret; +} diff --git a/cr-restore.c b/cr-restore.c new file mode 100644 index 000000000..99f1ed1b7 --- /dev/null +++ b/cr-restore.c @@ -0,0 +1,1144 @@ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <unistd.h> +#include <errno.h> +#include <dirent.h> +#include <string.h> + +#include <fcntl.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/vfs.h> +#include <sys/ptrace.h> +#include <sys/user.h> +#include <sys/wait.h> + +#include <sched.h> + +#include <sys/sendfile.h> + +#include "compiler.h" +#include "types.h" + +#include "image.h" +#include "util.h" + +#include "crtools.h" + +struct fmap_fd { + struct fmap_fd *next; + unsigned long start; + int fd; +}; + +struct shmem_info { + unsigned long start; + unsigned long end; + unsigned long id; + int pid; + int real_pid; +}; + +struct pipe_info { + unsigned int id; + int pid; + int real_pid; + int read_fd; + int write_fd; + int users; +}; + + +static struct fmap_fd *fmap_fds; + +static struct shmem_info *shmems; +static int nr_shmems; + +static struct pipe_info *pipes; +static int nr_pipes; + +static int restore_task_with_children(int my_pid, char *pstree_path); + +static void show_saved_shmems(void) +{ + int i; + + pr_info("\tSaved shmems:\n"); + + for (i = 0; i < nr_shmems; i++) + pr_info("\t\tstart: %016lx id: %lx pid: %d\n", + shmems[i].start, + shmems[i].id, + shmems[i].pid); +} + +static void show_saved_pipes(void) +{ + int i; + + pr_info("\tSaved pipes:\n"); + for (i = 0; i < nr_pipes; i++) + pr_info("\t\tid: %x -> pid: %d\n", + pipes[i].id, + pipes[i].pid); +} + +static struct shmem_info *search_shmem(unsigned long addr, unsigned long id) +{ + struct shmem_info *si; + int i; + + for (i = 0; i < 
nr_shmems; i++) { + si = shmems + i; + if (si->start <= addr && si->end >= addr && si->id == id) + return si; + } + + return NULL; +} + +static struct pipe_info *search_pipe(unsigned int pipeid) +{ + struct pipe_info *pi; + int i; + + for (i = 0; i < nr_pipes; i++) { + pi = pipes + i; + if (pi->id == pipeid) + return pi; + } + + return NULL; +} + +static void shmem_update_real_pid(int vpid, int rpid) +{ + int i; + + for (i = 0; i < nr_shmems; i++) + if (shmems[i].pid == vpid) + shmems[i].real_pid = rpid; +} + +static int shmem_wait_and_open(struct shmem_info *si) +{ + /* FIXME - not good */ + char path[128]; + unsigned long time = 1000; + + sleep(1); + + while (si->real_pid == 0) + usleep(time); + + sprintf(path, "/proc/%d/map_files/%lx-%lx", + si->real_pid, si->start, si->end); + + while (1) { + int ret = open(path, O_RDWR); + if (ret > 0) + return ret; + + if (ret < 0 && errno != ENOENT) { + perror(" Can't stat shmem"); + return -1; + } + + pr_info("Waiting for [%s] to appear\n", path); + if (time < 20000000) + time <<= 1; + usleep(time); + } +} + +static int try_to_add_shmem(int pid, struct shmem_entry *e) +{ + int i; + + for (i = 0; i < nr_shmems; i++) { + if (shmems[i].start != e->start || + shmems[i].id != e->shmid) + continue; + + if (shmems[i].end != e->end) { + pr_info("Bogus shmem\n"); + return 1; + } + + if (shmems[i].pid > pid) + shmems[i].pid = pid; + + return 0; + } + + if ((nr_shmems + 1) * sizeof(struct shmem_info) >= 4096) { + pr_panic("OOM storing shmems\n"); + return 1; + } + + shmems[nr_shmems].start = e->start; + shmems[nr_shmems].end = e->end; + shmems[nr_shmems].id = e->shmid; + shmems[nr_shmems].pid = pid; + shmems[nr_shmems].real_pid = 0; + + nr_shmems++; + + return 0; +} + +static int try_to_add_pipe(int pid, struct pipe_entry *e, int p_fd) +{ + int i; + + for (i = 0; i < nr_pipes; i++) { + if (pipes[i].id != e->pipeid) + continue; + + if (pipes[i].pid > pid) { + pipes[i].pid = pid; + pipes[i].users++; + } + + return 0; + } + + if 
((nr_pipes + 1) * sizeof(struct pipe_info) >= 4096) { + pr_info("OOM storing pipes\n"); + return 1; + } + + memset(&pipes[nr_pipes], 0, sizeof(pipes[nr_pipes])); + + pipes[nr_pipes].id = e->pipeid; + pipes[nr_pipes].pid = pid; + pipes[nr_pipes].users = 1; + + nr_pipes++; + + return 0; +} + +static int prepare_shmem_pid(int pid) +{ + char path[64]; + int sh_fd; + u32 type = 0; + + sprintf(path, "shmem-%d.img", pid); + sh_fd = open(path, O_RDONLY); + if (sh_fd < 0) { + perror("Can't open shmem info"); + return 1; + } + + read(sh_fd, &type, sizeof(type)); + if (type != SHMEM_MAGIC) { + perror("Bad shmem magic"); + return 1; + } + + while (1) { + struct shmem_entry e; + int ret; + + ret = read(sh_fd, &e, sizeof(e)); + if (ret == 0) + break; + + if (ret != sizeof(e)) { + perror("Can't read shmem entry"); + return 1; + } + + if (try_to_add_shmem(pid, &e)) + return 1; + } + + close(sh_fd); + return 0; +} + +static int prepare_pipes_pid(int pid) +{ + char path[64]; + int p_fd; + u32 type = 0; + + sprintf(path, "pipes-%d.img", pid); + p_fd = open(path, O_RDONLY); + if (p_fd < 0) { + perror("Can't open pipes image"); + return 1; + } + + read(p_fd, &type, sizeof(type)); + if (type != PIPES_MAGIC) { + perror("Bad pipes magin"); + return 1; + } + + while (1) { + struct pipe_entry e; + int ret; + + ret = read(p_fd, &e, sizeof(e)); + if (ret == 0) + break; + if (ret != sizeof(e)) { + fprintf(stderr, "Read pipes for %s failed %d of %li read\n", + path, ret, sizeof(e)); + perror("Can't read pipes entry"); + return 1; + } + + if (try_to_add_pipe(pid, &e, p_fd)) + return 1; + + if (e.bytes) + lseek(p_fd, e.bytes, SEEK_CUR); + } + + close(p_fd); + return 0; +} + +static int prepare_shared(int ps_fd) +{ + pr_info("Preparing info about shared resources\n"); + + nr_shmems = 0; + shmems = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, 0, 0); + if (shmems == MAP_FAILED) { + perror("Can't map shmems"); + return 1; + } + + pipes = mmap(NULL, 4096, PROT_READ | PROT_WRITE, 
MAP_SHARED | MAP_ANON, 0, 0); + if (pipes == MAP_FAILED) { + perror("Can't map pipes"); + return 1; + } + + while (1) { + struct pstree_entry e; + int ret; + + ret = read(ps_fd, &e, sizeof(e)); + if (ret == 0) + break; + + if (ret != sizeof(e)) { + perror("Can't read ps"); + return 1; + } + + if (prepare_shmem_pid(e.pid)) + return 1; + + if (prepare_pipes_pid(e.pid)) + return 1; + + lseek(ps_fd, e.nr_children * sizeof(u32), SEEK_CUR); + } + + lseek(ps_fd, sizeof(u32), SEEK_SET); + + show_saved_shmems(); + show_saved_pipes(); + + return 0; +} + +static struct fmap_fd *pop_fmap_fd(unsigned long start) +{ + struct fmap_fd **p, *r; + + pr_info("Looking for %lx : ", start); + + for (p = &fmap_fds; *p != NULL; p = &(*p)->next) { + if ((*p)->start != start) + continue; + + r = *p; + *p = r->next; + pr_info("found\n"); + + return r; + } + + pr_info("not found\n"); + return NULL; +} + +static int open_fe_fd(struct fdinfo_entry *fe, int fd) +{ + char path[PATH_MAX]; + int tmp; + + if (read(fd, path, fe->len) != fe->len) { + fprintf(stderr, "Error reading path"); + return -1; + } + + path[fe->len] = '\0'; + + tmp = open(path, fe->flags); + if (tmp < 0) { + pr_perror("Can't open file %s", path); + return -1; + } + + lseek(tmp, fe->pos, SEEK_SET); + + return tmp; +} + +static int open_fd(int pid, struct fdinfo_entry *fe, int *cfd) +{ + int fd, tmp; + + if (*cfd == (int)fe->addr) { + tmp = dup(*cfd); + if (tmp < 0) { + perror("Can't dup file"); + return 1; + } + + pr_info("%s: Dup for %d\n", __func__, tmp); + + *cfd = tmp; + } + + tmp = open_fe_fd(fe, *cfd); + if (tmp < 0) + return 1; + + fd = reopen_fd_as((int)fe->addr, tmp); + if (fd < 0) { + perror("Can't dup"); + return 1; + } + + return 0; +} + +static int open_fmap(int pid, struct fdinfo_entry *fe, int fd) +{ + int tmp; + struct fmap_fd *new; + + tmp = open_fe_fd(fe, fd); + if (tmp < 0) + return 1; + + pr_info("%d:\t\tWill map %lx to %d\n", pid, (unsigned long)fe->addr, tmp); + + new = malloc(sizeof(*new)); + new->start = 
fe->addr; + new->fd = tmp; + new->next = fmap_fds; + fmap_fds = new; + + return 0; +} + +static int prepare_fds(int pid) +{ + u32 mag; + char path[64]; + int fdinfo_fd; + + pr_info("%d: Opening files\n", pid); + + sprintf(path, "fdinfo-%d.img", pid); + fdinfo_fd = open(path, O_RDONLY); + if (fdinfo_fd < 0) { + perror("Can't open fdinfo"); + return 1; + } + + read(fdinfo_fd, &mag, 4); + if (mag != FDINFO_MAGIC) { + fprintf(stderr, "Bad file\n"); + return 1; + } + + while (1) { + int ret; + struct fdinfo_entry fe; + + ret = read(fdinfo_fd, &fe, sizeof(fe)); + if (ret == 0) { + close(fdinfo_fd); + return 0; + } + + if (ret < 0) { + perror("Can't read file"); + return 1; + } + if (ret != sizeof(fe)) { + fprintf(stderr, "Error reading\n"); + return 1; + } + + pr_info("\t%d: Got fd for %lx type %d namelen %d\n", pid, + (unsigned long)fe.addr, fe.type, fe.len); + switch (fe.type) { + case FDINFO_FD: + if (open_fd(pid, &fe, &fdinfo_fd)) + return 1; + + break; + case FDINFO_MAP: + if (open_fmap(pid, &fe, fdinfo_fd)) + return 1; + + break; + default: + fprintf(stderr, "Some bullshit in a file\n"); + return 1; + } + } +} + +struct shmem_to_id { + unsigned long addr; + unsigned long end; + unsigned long id; + struct shmem_to_id *next; +}; + +static struct shmem_to_id *my_shmem_ids; + +static unsigned long find_shmem_id(unsigned long addr) +{ + struct shmem_to_id *si; + + for (si = my_shmem_ids; si != NULL; si = si->next) + if (si->addr <= addr && si->end >= addr) + return si->id; + + return 0; +} + +static void save_shmem_id(struct shmem_entry *e) +{ + struct shmem_to_id *si; + + si = malloc(sizeof(*si)); + si->addr = e->start; + si->end = e->end; + si->id = e->shmid; + si->next = my_shmem_ids; + + my_shmem_ids = si; +} + +static int prepare_shmem(int pid) +{ + char path[64]; + int sh_fd; + u32 type = 0; + + sprintf(path, "shmem-%d.img", pid); + sh_fd = open(path, O_RDONLY); + if (sh_fd < 0) { + perror("Can't open shmem info"); + return 1; + } + + read(sh_fd, &type, 
sizeof(type)); + if (type != SHMEM_MAGIC) { + perror("Bad shmem magic"); + return 1; + } + + while (1) { + struct shmem_entry e; + int ret; + + ret = read(sh_fd, &e, sizeof(e)); + if (ret == 0) + break; + if (ret != sizeof(e)) { + perror("Can't read shmem entry"); + return 1; + } + + save_shmem_id(&e); + } + + close(sh_fd); + return 0; +} + +static int try_fixup_file_map(int pid, struct vma_entry *vi, int fd) +{ + struct fmap_fd *fmfd; + + fmfd = pop_fmap_fd(vi->start); + if (fmfd != NULL) { + pr_info("%d: Fixing %lx vma to %d fd\n", pid, vi->start, fmfd->fd); + lseek(fd, -sizeof(*vi), SEEK_CUR); + vi->fd = fmfd->fd; + if (write(fd, vi, sizeof(*vi)) != sizeof(*vi)) { + perror("Can't write img"); + return 1; + } + free(fmfd); + } + + return 0; +} + +static int try_fixup_shared_map(int pid, struct vma_entry *vi, int fd) +{ + struct shmem_info *si; + unsigned long id; + + id = find_shmem_id(vi->start); + if (id == 0) + return 0; + + si = search_shmem(vi->start, id); + pr_info("%d: Search for %016lx shmem %p/%d\n", pid, vi->start, si, si ? 
si->pid : -1); + + if (si == NULL) { + fprintf(stderr, "Can't find my shmem %016lx\n", vi->start); + return 1; + } + + if (si->pid != pid) { + int sh_fd; + + sh_fd = shmem_wait_and_open(si); + pr_info("%d: Fixing %lx vma to %lx/%d shmem -> %d\n", pid, vi->start, si->id, si->pid, sh_fd); + if (fd < 0) { + perror("Can't open shmem"); + return 1; + } + + lseek(fd, -sizeof(*vi), SEEK_CUR); + vi->fd = sh_fd; + if (write(fd, vi, sizeof(*vi)) != sizeof(*vi)) { + perror("Can't write img"); + return 1; + } + } + + return 0; +} + +static int fixup_vma_fds(int pid, int fd) +{ + int offset = sizeof(struct core_entry) + sizeof(u32); + + pr_info("Seek for: %d bytes\n", offset); + lseek(fd, offset, SEEK_SET); + + while (1) { + struct vma_entry vi; + + if (read(fd, &vi, sizeof(vi)) != sizeof(vi)) { + perror("Can't read"); + return 1; + } + + if (vi.start == 0 && vi.end == 0) + return 0; + + if (!(vi.status & VMA_AREA_REGULAR)) + continue; + + if ((vi.status & VMA_FILE_SHARED) || + (vi.status & VMA_FILE_PRIVATE)) { + + pr_info("%d: Fixing %016lx-%016lx %016lx vma\n", pid, vi.start, vi.end, vi.pgoff); + if (try_fixup_file_map(pid, &vi, fd)) + return 1; + } + + if (vi.status & VMA_ANON_SHARED) { + if (try_fixup_shared_map(pid, &vi, fd)) + return 1; + } + } +} + +static inline int should_restore_page(int pid, unsigned long vaddr) +{ + struct shmem_info *si; + unsigned long id; + + id = find_shmem_id(vaddr); + if (id == 0) + return 1; + + si = search_shmem(vaddr, id); + return si->pid == pid; +} + +static char zpage[PAGE_SIZE]; + +static int fixup_pages_data(int pid, int fd) +{ + char path[128]; + int shfd; + u32 mag; + u64 vaddr; + + sprintf(path, "pages-shmem-%d.img", pid); + shfd = open(path, O_RDONLY); + if (shfd < 0) { + perror("Can't open shmem image"); + return 1; + } + + read(shfd, &mag, sizeof(mag)); + if (mag != PAGES_MAGIC) { + fprintf(stderr, "Bad shmem image\n"); + return 1; + } + + /* Find out the last page, which is zero one */ + lseek(fd, -sizeof(struct page_entry), 
SEEK_END); + read(fd, &vaddr, sizeof(vaddr)); + if (vaddr != 0) { + pr_info("SHIT %lx\n", (unsigned long)vaddr); + return 1; + } + lseek(fd, -sizeof(struct page_entry), SEEK_END); + + while (1) { + int ret; + + ret = read(shfd, &vaddr, sizeof(vaddr)); + if (ret == 0) + break; + + if (ret < 0 || ret != sizeof(vaddr)) { + perror("Can't read vaddr"); + return 1; + } + + if (vaddr == 0) + break; + + if (!should_restore_page(pid, vaddr)) { + lseek(shfd, PAGE_SIZE, SEEK_CUR); + continue; + } + + write(fd, &vaddr, sizeof(vaddr)); + sendfile(fd, shfd, NULL, PAGE_SIZE); + } + + close(shfd); + vaddr = 0; + write(fd, &vaddr, sizeof(vaddr)); + write(fd, zpage, sizeof(zpage)); + + return 0; +} + +static int prepare_image_maps(int fd, int pid) +{ + pr_info("%d: Fixing maps before executing image\n", pid); + + if (fixup_vma_fds(pid, fd)) + return 1; + + if (fixup_pages_data(pid, fd)) + return 1; + + //close(fd); + return 0; +} + +static int execute_image(int pid) +{ + char path[128], elf_path[128]; + int fd, fd_new; + struct stat buf; + + sprintf(path, "core-%d.img", pid); + fd = open(path, O_RDONLY); + if (fd < 0) { + perror("Can't open exec image"); + return 1; + } + + if (fstat(fd, &buf)) { + perror("Can't stat"); + return 1; + } + + sprintf(path, "core-%d.img.out", pid); + unlink(path); + + fd_new = open(path, O_RDWR | O_CREAT | O_EXCL, 0700); + if (fd_new < 0) { + perror("Can't open new image"); + return 1; + } + + sprintf(elf_path, "core-%d.elf", pid); + unlink(elf_path); + + pr_info("%d: Preparing execution image %s (%li bytes)\n", pid, path, buf.st_size); + if (sendfile(fd_new, fd, NULL, buf.st_size) != buf.st_size) { + pr_perror("sendfile failed\n"); + return 1; + } + close(fd); + + if (fchmod(fd_new, 0700)) { + perror("Can't prepare exec image"); + return 1; + } + + if (fstat(fd_new, &buf)) { + perror("Can't stat"); + return 1; + } + + pr_info("fd_new: %li bytes\n", buf.st_size); + + if (prepare_image_maps(fd_new, pid)) + return 1; + + sync(); + + if 
(convert_to_elf(elf_path, fd_new)) + return 1; + + sync(); + close(fd_new); + + pr_info("%d/%d EXEC ELF-IMAGE\n", pid, getpid()); + return execl(elf_path, elf_path, NULL); +} + +static int create_pipe(int pid, struct pipe_entry *e, struct pipe_info *pi, int pipes_fd) +{ + int pfd[2], tmp; + unsigned long time = 1000; + + pr_info("\t%d: Creating pipe %x\n", pid, e->pipeid); + + if (pipe(pfd) < 0) { + perror("Can't create pipe"); + return 1; + } + + if (e->bytes) { + pr_info("\t%d: Splicing data to %d\n", pid, pfd[1]); + + tmp = splice(pipes_fd, NULL, pfd[1], NULL, e->bytes, 0); + if (tmp != e->bytes) { + fprintf(stderr, "Wanted to restore %d bytes, but got %d\n", + e->bytes, tmp); + if (tmp < 0) + perror("Error splicing data"); + return 1; + } + } + + pi->read_fd = pfd[0]; + pi->write_fd = pfd[1]; + pi->real_pid = getpid(); + + pr_info("\t%d: Done, waiting for others on %d pid with r:%d w:%d\n", + pid, pi->real_pid, pfd[0], pfd[1]); + + while (1) { + if (pi->users == 1) /* only I left */ + break; + + pr_info("\t%d: Waiting for %x pipe to attach (%d users left)\n", + pid, e->pipeid, pi->users - 1); + if (time < 20000000) + time <<= 1; + usleep(time); + } + + pr_info("\t%d: All is ok - reopening pipe for %d\n", pid, e->fd); + if (e->flags & O_WRONLY) { + close(pfd[0]); + tmp = reopen_fd_as(e->fd, pfd[1]); + } else { + close(pfd[1]); + tmp = reopen_fd_as(e->fd, pfd[0]); + } + + if (tmp < 0) { + perror("Can't dup pipe fd"); + return 1; + } + + return 0; +} + +static int attach_pipe(int pid, struct pipe_entry *e, struct pipe_info *pi) +{ + char path[128]; + int tmp, fd; + + pr_info("\t%d: Wating for pipe %x to appear\n", pid, e->pipeid); + + while (pi->real_pid == 0) + usleep(1000); + + if (e->flags & O_WRONLY) + tmp = pi->write_fd; + else + tmp = pi->read_fd; + + sprintf(path, "/proc/%d/fd/%d", pi->real_pid, tmp); + pr_info("\t%d: Attaching pipe %s\n", pid, path); + + fd = open(path, e->flags); + if (fd < 0) { + perror("Can't attach pipe"); + return 1; + } + + 
pr_info("\t%d: Done, reopening for %d\n", pid, e->fd); + pi->users--; + tmp = reopen_fd_as(e->fd, fd); + if (tmp < 0) { + perror("Can't dup to attach pipe"); + return 1; + } + + return 0; + +} + +static int open_pipe(int pid, struct pipe_entry *e, int *pipes_fd) +{ + struct pipe_info *pi; + + pr_info("\t%d: Opening pipe %x on fd %d\n", pid, e->pipeid, e->fd); + if (e->fd == *pipes_fd) { + int tmp; + + tmp = dup(*pipes_fd); + if (tmp < 0) { + perror("Can't dup file"); + return 1; + } + + *pipes_fd = tmp; + } + + pi = search_pipe(e->pipeid); + if (pi == NULL) { + fprintf(stderr, "BUG: can't find my pipe %x\n", e->pipeid); + return 1; + } + + if (pi->pid == pid) + return create_pipe(pid, e, pi, *pipes_fd); + else + return attach_pipe(pid, e, pi); +} + +static int prepare_pipes(int pid) +{ + char path[64]; + int pipes_fd; + u32 type = 0; + + pr_info("%d: Opening pipes\n", pid); + + sprintf(path, "pipes-%d.img", pid); + pipes_fd = open(path, O_RDONLY); + if (pipes_fd < 0) { + perror("Can't open pipes img"); + return 1; + } + + read(pipes_fd, &type, sizeof(type)); + if (type != PIPES_MAGIC) { + perror("Bad pipes file"); + return 1; + } + + while (1) { + struct pipe_entry e; + int ret; + + ret = read(pipes_fd, &e, sizeof(e)); + if (ret == 0) { + close(pipes_fd); + return 0; + } + if (ret != sizeof(e)) { + perror("Bad pipes entry"); + return 1; + } + + if (open_pipe(pid, &e, &pipes_fd)) + return 1; + } +} + +static int restore_one_task(int pid) +{ + pr_info("%d: Restoring resources\n", pid); + + if (prepare_pipes(pid)) + return 1; + + if (prepare_fds(pid)) + return 1; + + if (prepare_shmem(pid)) + return 1; + + return execute_image(pid); +} + +static int do_child(void *arg) +{ + return restore_task_with_children(getpid(), arg); +} + +static inline int fork_with_pid(int pid, char *pstree_path) +{ + int ret = 0; + void *stack; + + stack = mmap(0, 4 * 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0); + if (stack == MAP_FAILED) { + pr_perror("mmap 
failed"); + return -1; + } + + stack += 4 * 4096; + ret = clone(do_child, stack, SIGCHLD | CLONE_CHILD_USEPID, pstree_path, NULL, NULL, &pid); + if (ret < 0) + pr_perror("clone failed\n"); + + return ret; +} + +static int restore_task_with_children(int my_pid, char *pstree_path) +{ + int *pids; + int fd, ret, i; + struct pstree_entry e; + + pr_info("%d: Starting restore\n", my_pid); + + fd = open(pstree_path, O_RDONLY); + if (fd < 0) { + perror("Can't reopen pstree image"); + exit(1); + } + + lseek(fd, sizeof(u32), SEEK_SET); + while (1) { + ret = read(fd, &e, sizeof(e)); + if (ret != sizeof(e)) { + fprintf(stderr, "%d: Read returned %d\n", my_pid, ret); + if (ret < 0) + perror("Can't read pstree"); + exit(1); + } + + if (e.pid != my_pid) { + lseek(fd, e.nr_children * sizeof(u32), SEEK_CUR); + continue; + } + + break; + } + + if (e.nr_children > 0) { + i = e.nr_children * sizeof(int); + pids = malloc(i); + ret = read(fd, pids, i); + if (ret != i) { + perror("Can't read children pids"); + exit(1); + } + + close(fd); + + pr_info("%d: Restoring %d children:\n", my_pid, e.nr_children); + for (i = 0; i < e.nr_children; i++) { + pr_info("\tFork %d from %d\n", pids[i], my_pid); + ret = fork_with_pid(pids[i], pstree_path); + if (ret < 0) + exit(1); + } + } else + close(fd); + + shmem_update_real_pid(my_pid, getpid()); + + return restore_one_task(my_pid); +} + +static int restore_root_task(char *pstree_path, int fd) +{ + struct pstree_entry e; + int ret; + + ret = read(fd, &e, sizeof(e)); + if (ret != sizeof(e)) { + perror("Can't read root pstree entry"); + return 1; + } + + close(fd); + + pr_info("Forking root with %d pid\n", e.pid); + ret = fork_with_pid(e.pid, pstree_path); + if (ret < 0) + return 1; + + wait(NULL); + return 0; +} + +static int restore_all_tasks(pid_t pid) +{ + char path[128]; + int pstree_fd; + u32 type = 0; + + sprintf(path, "pstree-%d.img", pid); + pstree_fd = open(path, O_RDONLY); + if (pstree_fd < 0) { + perror("Can't open pstree image"); + return 
1; + } + + read(pstree_fd, &type, sizeof(type)); + if (type != PSTREE_MAGIC) { + perror("Bad pstree magic"); + return 1; + } + + if (prepare_shared(pstree_fd)) + return 1; + + return restore_root_task(path, pstree_fd); +} + +int cr_restore_tasks(pid_t pid, bool leader_only, int leave_stopped) +{ + if (leader_only) + return restore_one_task(pid); + return restore_all_tasks(pid); +} diff --git a/cr-show.c b/cr-show.c new file mode 100644 index 000000000..dd3128f1d --- /dev/null +++ b/cr-show.c @@ -0,0 +1,389 @@ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <dirent.h> + +#include <fcntl.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/vfs.h> +#include <sys/ptrace.h> +#include <sys/user.h> +#include <sys/wait.h> + +#include "types.h" +#include "list.h" + +#include "compiler.h" +#include "crtools.h" +#include "syscall.h" +#include "util.h" + +#include "image.h" + +#ifndef CONFIG_X86_64 +# error No x86-32 support yet +#endif + +#define pr_regs4(s, n1, n2, n3, n4) \ + pr_info("%8s: %16lx " \ + "%8s: %16lx " \ + "%8s: %16lx " \ + "%8s: %16lx\n", \ + #n1, s.n1, \ + #n2, s.n2, \ + #n3, s.n3, \ + #n4, s.n4) + +#define pr_regs3(s, n1, n2, n3) \ + pr_info("%8s: %16lx " \ + "%8s: %16lx " \ + "%8s: %16lx\n", \ + #n1, s.n1, \ + #n2, s.n2, \ + #n3, s.n3) + +static char local_buf[PAGE_SIZE]; +static LIST_HEAD(pstree_list); + +/* FIXME: same as dump -- unify */ +static void free_pstree(void) +{ + struct pstree_item *item, *p; + + list_for_each_entry_safe(item, p, &pstree_list, list) { + xfree(item->children); + xfree(item); + } + + INIT_LIST_HEAD(&pstree_list); +} + +static void show_regs(struct cr_fdset *cr_fdset) +{ + struct user_regs_entry regs; + struct desc_struct tls; + int fd_core, i; + + fd_core = cr_fdset->desc[CR_FD_CORE].fd; + if (fd_core < 0) + goto err; + + pr_info("\n\t---[GP registers set]---\n"); + + lseek(fd_core, 
GET_FILE_OFF(struct core_entry, gpregs), SEEK_SET); + + read_ptr_safe(fd_core, &regs, err); + + pr_regs4(regs, cs, ip, ds, es); + pr_regs4(regs, ss, sp, fs, gs); + pr_regs4(regs, di, si, dx, cx); + pr_regs4(regs, ax, r8, r9, r10); + pr_regs4(regs, r11, r12, r13, r14); + pr_regs3(regs, r15, bp, bx); + pr_regs4(regs, orig_ax, flags, fs_base, gs_base); + + pr_info("\n\t---[TLS area]---\n"); + + lseek(fd_core, GET_FILE_OFF(struct core_entry, tls_array), SEEK_SET); + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) { + read_ptr_safe(fd_core, &tls, err); + pr_info("tls[%2i] = %x %x\n", i, tls.a, tls.b); + } + +err: + return; +} + +static void show_files(struct cr_fdset *cr_fdset) +{ + struct fdinfo_entry e; + int fd_files, ret; + + pr_info("\n"); + pr_info("CR_FD_FDINFO: %s\n", cr_fdset->desc[CR_FD_FDINFO].name); + pr_info("----------------------------------------\n"); + + fd_files = cr_fdset->desc[CR_FD_FDINFO].fd; + + lseek(fd_files, MAGIC_OFFSET, SEEK_SET); + + while (1) { + ret = read(fd_files, &e, sizeof(e)); + if (!ret) + goto err; + if (ret != sizeof(e)) { + pr_perror("Can't read fdinfo entry"); + goto err; + } + + if (e.len) { + ret = read(fd_files, local_buf, e.len); + if (ret != e.len) { + pr_perror("Can't read %d bytes\n", e.len); + goto err; + } + local_buf[e.len] = 0; + pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx --> %s\n", + e.type, e.len, e.flags, e.pos, e.addr, local_buf); + } else + pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx\n", + e.type, e.len, e.flags, e.pos, e.addr); + } + +err: + pr_info("----------------------------------------\n"); +} + +static void show_pipes(struct cr_fdset *cr_fdset) +{ + struct pipe_entry e; + int fd_pipes, ret; + + pr_info("\n"); + pr_info("CR_FD_PIPES: %s\n", cr_fdset->desc[CR_FD_PIPES].name); + pr_info("----------------------------------------\n"); + + fd_pipes = cr_fdset->desc[CR_FD_PIPES].fd; + + lseek(fd_pipes, MAGIC_OFFSET, SEEK_SET); + + while (1) { + ret = read(fd_pipes, &e, sizeof(e)); + 
if (!ret) + goto err; + if (ret != sizeof(e)) { + pr_perror("Can't read pipe entry\n"); + goto err; + } + pr_info("fd: %8lx pipeid: %8lx flags: %8lx bytes: %8lx\n", + e.fd, e.pipeid, e.flags, e.bytes); + if (e.bytes) + lseek(fd_pipes, e.bytes, SEEK_CUR); + } + +err: + pr_info("----------------------------------------\n"); +} + +static void show_core(struct cr_fdset *cr_fdset) +{ + struct vma_area vma_area = {}; + struct vma_entry ve; + int fd_core, ret; + u64 va; + + pr_info("\n"); + pr_info("CR_FD_CORE: %s\n", cr_fdset->desc[CR_FD_CORE].name); + pr_info("----------------------------------------\n"); + + fd_core = cr_fdset->desc[CR_FD_CORE].fd; + if (fd_core < 0) + goto out; + + show_regs(cr_fdset); + + lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET); + + /* + * Start with VMA, then pages. + */ + pr_info("\n\t---[VMA areas]---\n"); + while (1) { + ret = read(fd_core, &ve, sizeof(ve)); + if (!ret) + break; + if (ret != sizeof(ve)) { + pr_perror("Unable to read VMA\n"); + goto out; + } + + if (is_ending_vma(&ve)) { + pr_info("\n\t---[Pages]---\n"); + while (1) { + ret = read(fd_core, &va, sizeof(va)); + if (!ret) + goto out; + if (ret != sizeof(va)) { + pr_perror("Unable to read VA\n"); + goto out; + } + if (va == 0) + goto out; + pr_info("page va: %16lx\n", va); + lseek(fd_core, PAGE_SIZE, SEEK_CUR); + } + } + + /* Simply in a sake of fancy printing */ + vma_area.vma = ve; + pr_info_vma(&vma_area); + } + +out: + pr_info("----------------------------------------\n"); +} + +static void show_pstree_from_file(int fd, char *name) +{ + int ret; + + pr_info("\n"); + pr_info("CR_FD_PSTREE: %s\n", name); + pr_info("----------------------------------------\n"); + + while (1) { + struct pstree_entry e; + unsigned long i; + u32 child_pid; + + ret = read(fd, &e, sizeof(e)); + if (!ret) + break; + if (ret != sizeof(e)) { + pr_perror("Bad pstree entry"); + break; + } + + pr_info("Process %d number of children: %d\n", + e.pid, e.nr_children); + + for (i = 0; i < 
e.nr_children; i++) { + ret = read(fd, &child_pid, + sizeof(child_pid)); + pr_info(" %d", child_pid); + } + if (e.nr_children) + pr_info("\n"); + } + + pr_info("----------------------------------------\n"); +} + +static void show_pstree(struct list_head *head, char *name) +{ + struct pstree_item *item; + int i; + + pr_info("\n"); + pr_info("CR_FD_PSTREE: %s\n", name); + pr_info("----------------------------------------\n"); + + list_for_each_entry(item, head, list) { + pr_info("Process %d number of children: %d\n", + item->pid, item->nr_children); + for (i = 0; i < item->nr_children; i++) + pr_info(" %d", item->children[i]); + if (item->nr_children) + pr_info("\n"); + } + + pr_info("----------------------------------------\n"); +} + +static int collect_pstree(pid_t pid, struct cr_fdset *cr_fdset) +{ + int fd = cr_fdset->desc[CR_FD_PSTREE].fd; + struct pstree_item *item = NULL; + struct pstree_entry e; + int ret = -1; + + for (;;) { + size_t size; + + ret = read(fd, &e, sizeof(e)); + if (ret && ret != sizeof(e)) { + pr_perror("Wrong pstree entry\n"); + goto err; + } + + if (!ret) + break; + + item = xmalloc(sizeof(*item)); + if (!item) + goto err; + + size = sizeof(u32) * e.nr_children; + + item->pid = e.pid; + item->nr_children = e.nr_children; + item->children = xmalloc(size); + + if (!item->children) { + pr_error("No memory for children pids\n"); + goto err; + } + + ret = read(fd, item->children, size); + if (ret != size) { + pr_error("An error in reading children pids\n"); + xfree(item->children); + goto err; + } + + list_add_tail(&item->list, &pstree_list); + } + + item = NULL; + ret = 0; + +err: + xfree(item); + return ret; +} + +int cr_show(unsigned long pid, bool leader_only) +{ + struct cr_fdset *cr_fdset; + struct pstree_item *item; + int i, ret = -1; + + cr_fdset = alloc_cr_fdset(pid); + if (!cr_fdset) + goto out; + + ret = prep_cr_fdset_for_restore(cr_fdset, CR_FD_DESC_ALL); + if (ret) + goto out; + + ret = collect_pstree(pid, cr_fdset); + if (ret) + 
goto out; + + show_pstree(&pstree_list, cr_fdset->desc[CR_FD_PSTREE].name); + + close_cr_fdset(cr_fdset); + free_cr_fdset(&cr_fdset); + + list_for_each_entry(item, &pstree_list, list) { + + cr_fdset = alloc_cr_fdset(item->pid); + if (!cr_fdset) + goto out; + + ret = prep_cr_fdset_for_restore(cr_fdset, CR_FD_DESC_NOPSTREE); + if (ret) + goto out; + + show_core(cr_fdset); + show_pipes(cr_fdset); + show_files(cr_fdset); + + if (leader_only) + break; + } + +out: + free_pstree(); + close_cr_fdset(cr_fdset); + free_cr_fdset(&cr_fdset); + return ret; +} diff --git a/crtools.c b/crtools.c new file mode 100644 index 000000000..c76ca337e --- /dev/null +++ b/crtools.c @@ -0,0 +1,280 @@ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <unistd.h> +#include <errno.h> +#include <dirent.h> + +#include <fcntl.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/vfs.h> +#include <sys/ptrace.h> +#include <sys/user.h> +#include <sys/wait.h> +#include <sys/sendfile.h> + +#include "types.h" +#include "list.h" + +#include "compiler.h" +#include "crtools.h" +#include "util.h" + +struct page_entry zero_page_entry; + +static struct cr_fd_desc_tmpl template[CR_FD_MAX] = { + [CR_FD_FDINFO] = { + .fmt = "fdinfo-%li.img", + .magic = FDINFO_MAGIC, + }, + [CR_FD_PAGES] = { + .fmt = "pages-%li.img", + .magic = PAGES_MAGIC, + }, + [CR_FD_PAGES_SHMEM] = { + .fmt = "pages-shmem-%li.img", + .magic = PAGES_MAGIC, + }, + [CR_FD_CORE] = { + .fmt = "core-%li.img", + .magic = CORE_MAGIC, + }, + [CR_FD_PIPES] = { + .fmt = "pipes-%li.img", + .magic = PIPES_MAGIC, + }, + [CR_FD_PSTREE] = { + .fmt = "pstree-%li.img", + .magic = PSTREE_MAGIC, + }, + [CR_FD_SHMEM] = { + .fmt = "shmem-%li.img", + .magic = SHMEM_MAGIC, + }, +}; + +struct cr_fdset *alloc_cr_fdset(pid_t pid) +{ + struct cr_fdset *cr_fdset; + unsigned int i; + + cr_fdset = xzalloc(sizeof(*cr_fdset)); + if (!cr_fdset) + goto err; + + for (i = 0; i < CR_FD_MAX; i++) { + 
cr_fdset->desc[i].tmpl = &template[i]; + snprintf(cr_fdset->desc[i].name, + sizeof(cr_fdset->desc[i].name), + cr_fdset->desc[i].tmpl->fmt, + (long)pid); + cr_fdset->desc[i].fd = -1; + } + +err: + return cr_fdset; +} + +int prep_cr_fdset_for_dump(struct cr_fdset *cr_fdset, + unsigned long use_mask) +{ + unsigned int i; + u32 magic; + int ret = -1; + + if (!cr_fdset) + goto err; + + cr_fdset->use_mask = use_mask; + + for (i = 0; i < CR_FD_MAX; i++) { + if (!(use_mask & CR_FD_DESC_USE(i))) + continue; + + ret = unlink(cr_fdset->desc[i].name); + if (ret && errno != ENOENT) { + pr_perror("Unable to unlink %s (%s)\n", + cr_fdset->desc[i].name, + strerror(errno)); + goto err; + } else + ret = -1; + cr_fdset->desc[i].fd = open(cr_fdset->desc[i].name, + O_RDWR | O_CREAT | O_EXCL, + CR_FD_PERM); + if (cr_fdset->desc[i].fd < 0) { + pr_perror("Unable to open %s (%s)\n", + cr_fdset->desc[i].name, + strerror(errno)); + goto err; + } + + pr_debug("Opened %s with %d\n", + cr_fdset->desc[i].name, + cr_fdset->desc[i].fd); + + magic = cr_fdset->desc[i].tmpl->magic; + write_ptr_safe(cr_fdset->desc[i].fd, &magic, err); + + /* + * Make sure it's on disk since we might + * need to re-open files in parasite. 
+ */ + fsync(cr_fdset->desc[i].fd); + } + ret = 0; +err: + return ret; +} + +int prep_cr_fdset_for_restore(struct cr_fdset *cr_fdset, + unsigned long use_mask) +{ + unsigned int i; + int ret = -1; + u32 magic; + + if (!cr_fdset) + goto err; + + cr_fdset->use_mask = use_mask; + + for (i = 0; i < CR_FD_MAX; i++) { + if (!(use_mask & CR_FD_DESC_USE(i))) + continue; + + cr_fdset->desc[i].fd = open(cr_fdset->desc[i].name, + O_RDWR, CR_FD_PERM); + if (cr_fdset->desc[i].fd < 0) { + pr_perror("Unable to open %s (%s)\n", + cr_fdset->desc[i].name, + strerror(errno)); + goto err; + } + + pr_debug("Opened %s with %d\n", + cr_fdset->desc[i].name, + cr_fdset->desc[i].fd); + + read_ptr_safe(cr_fdset->desc[i].fd, &magic, err); + if (magic != cr_fdset->desc[i].tmpl->magic) { + pr_error("Magic doesn't match for %s\n", + cr_fdset->desc[i].name); + goto err; + } + + } + ret = 0; +err: + return ret; +} + +void close_cr_fdset(struct cr_fdset *cr_fdset) +{ + unsigned int i; + + if (!cr_fdset) + return; + + for (i = 0; i < CR_FD_MAX; i++) { + if (!(cr_fdset->use_mask & CR_FD_DESC_USE(i))) + continue; + + if (cr_fdset->desc[i].fd >= 0) { + pr_debug("Closed %s with %d\n", + cr_fdset->desc[i].name, + cr_fdset->desc[i].fd); + close(cr_fdset->desc[i].fd); + cr_fdset->desc[i].fd = -1; + } + } +} + +void free_cr_fdset(struct cr_fdset **cr_fdset) +{ + if (cr_fdset && *cr_fdset) { + free(*cr_fdset); + *cr_fdset = NULL; + } +} + +int main(int argc, char *argv[]) +{ + pid_t pid; + int ret = -1; + + BUILD_BUG_ON(PAGE_SIZE != PAGE_IMAGE_SIZE); + + if (argc < 3) + goto usage; + + memset(&zero_page_entry, 0, sizeof(zero_page_entry)); + + if (!strcmp(argv[1], "dump")) { + bool leader_only; + + switch (argv[2][1]) { + case 'p': + pid = atol(argv[3]); + leader_only = true; + break; + case 't': + pid = atol(argv[3]); + leader_only = false; + break; + default: + goto usage; + } + + ret = cr_dump_tasks(pid, leader_only, 1); + + } else if (!strcmp(argv[1], "restore")) { + bool leader_only; + + switch 
(argv[2][1]) { + case 'p': + pid = atol(argv[3]); + leader_only = true; + break; + case 't': + pid = atol(argv[3]); + leader_only = false; + break; + default: + goto usage; + } + + ret = cr_restore_tasks(pid, leader_only, 1); + + } else if (!strcmp(argv[1], "show")) { + bool leader_only = true; + + switch (argv[2][1]) { + case 'p': + leader_only = true; + pid = atol(argv[3]); + break; + case 't': + leader_only = false; + pid = atol(argv[3]); + break; + default: + goto usage; + } + + ret = cr_show(pid, leader_only); + + } else + goto usage; + + return ret; + +usage: + printk("\nUsage:\n"); + printk("\tcrtools (dump|show|restore) (-p|-t) pid\n\n"); + return -1; +} @@ -0,0 +1,213 @@ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <dirent.h> + +#include <fcntl.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/vfs.h> +#include <sys/ptrace.h> +#include <sys/user.h> +#include <sys/wait.h> + +#include <sys/sendfile.h> + +#include "types.h" +#include "list.h" + +#include "compiler.h" +#include "crtools.h" +#include "syscall.h" +#include "util.h" + +#include "image.h" +#include "elf.h" + +#define ELF_MAX_PHDR ((65536U / sizeof(Elf64_Phdr)) - 1) +#define ELF_MAX_PAGES (1 << 10) + +/* + * Convert the c/r core file into elf + * executable, the kernel will handle it. 
+ */ +int convert_to_elf(char *elf_path, int fd_core) +{ + Elf64_Ehdr elf_ehdr; + Elf64_Phdr elf_phdr; + + Elf64_Half e_phnum = 0; + + struct page_entry page_entry; /* used only for sizeof() */ + unsigned long nrpages = 0; + struct core_entry core; + struct vma_area area; + struct vma_entry vma; + u64 va; + + unsigned long phoff = 0; + + int fd_elf; + int ret = -1; + + fd_elf = open(elf_path, O_RDWR | O_CREAT | O_EXCL, 0700); + if (fd_elf < 0) { + pr_perror("Can't open %s\n", elf_path); + goto err; + } + + memset(&elf_ehdr, 0, sizeof(elf_ehdr)); + memset(&area, 0, sizeof(area)); + + memcpy(elf_ehdr.e_ident, ELFMAG, SELFMAG); + elf_ehdr.e_ident[EI_CLASS] = ELFCLASS64; + elf_ehdr.e_ident[EI_DATA] = ELFDATA2LSB; + elf_ehdr.e_ident[EI_VERSION] = EV_CURRENT; + + elf_ehdr.e_type = ET_CKPT; + elf_ehdr.e_machine = EM_X86_64; + elf_ehdr.e_version = EV_CURRENT; + elf_ehdr.e_phoff = sizeof(elf_ehdr); + elf_ehdr.e_ehsize = sizeof(elf_ehdr); + elf_ehdr.e_phentsize = sizeof(Elf64_Phdr); + + /* Get EP */ + lseek(fd_core, MAGIC_OFFSET, SEEK_SET); + read_ptr_safe(fd_core, &core, err_close); + + /* + * Count the numbers of segments. Each segment + * is the VMA record with appropriate permissions. + * Then we need one big segment which would hold + * all the pages dumped. 
+ */ + lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET); + while(1) { + read_ptr_safe(fd_core, &vma, err_close); + if (vma.start == 0 && vma.end == 0) + break; + e_phnum++; + } + + /* Count page entries; the zero-va terminator entry is counted as well */ + while (1) { + read_ptr_safe(fd_core, &va, err_close); + nrpages++; + if (va == 0) + break; + lseek(fd_core, PAGE_SIZE, SEEK_CUR); + } + + /* Figure out if we're overflowed */ + if (e_phnum > ELF_MAX_PHDR) { + pr_error("Too many VMA areas (%li of %li allowed)\n", + (long)e_phnum, (long)(ELF_MAX_PHDR)); + goto err_close; + } else if (nrpages > ELF_MAX_PAGES) { + pr_error("Too many pages to restore (%li of %li allowed)\n", + (long)nrpages, (long)(ELF_MAX_PAGES)); + goto err_close; + } + + /* + * We can write elf header now. + */ + lseek(fd_elf, 0, SEEK_SET); + /* two extra program headers: PT_CKPT_CORE and PT_CKPT_PAGES */ + elf_ehdr.e_phnum = e_phnum + 2; + elf_ehdr.e_entry = core.gpregs.ip; + write_ptr_safe(fd_elf, &elf_ehdr, err_close); + + /* Offset in file (after all headers) */ + phoff = elf_ehdr.e_phnum * sizeof(elf_phdr) + sizeof(elf_ehdr); + + /* VMAs to headers */ + e_phnum = 0; + lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET); + while(1) { + read_ptr_safe(fd_core, &vma, err_close); + if (vma.start == 0 && vma.end == 0) + break; + + memset(&elf_phdr, 0, sizeof(elf_phdr)); + + elf_phdr.p_type = PT_CKPT_VMA; + elf_phdr.p_offset = phoff; + elf_phdr.p_vaddr = vma.start; + elf_phdr.p_paddr = vma.start; + elf_phdr.p_filesz = sizeof(vma); + elf_phdr.p_memsz = vma.end - vma.start; + elf_phdr.p_align = 0x1000; + + if (vma.prot & PROT_READ) + elf_phdr.p_flags |= PF_R; + if (vma.prot & PROT_WRITE) + elf_phdr.p_flags |= PF_W; + if (vma.prot & PROT_EXEC) + elf_phdr.p_flags |= PF_X; + + write_ptr_safe(fd_elf, &elf_phdr, err_close); + + phoff += sizeof(vma); + } + + /* The binfmt header */ + memset(&elf_phdr, 0, sizeof(elf_phdr)); + + elf_phdr.p_type = PT_CKPT_CORE; + elf_phdr.p_flags = PF_R; + elf_phdr.p_offset = phoff; + elf_phdr.p_vaddr = 0; + elf_phdr.p_filesz = sizeof(core); + elf_phdr.p_memsz = sizeof(core); + elf_phdr.p_align = 0x1000; + + 
write_ptr_safe(fd_elf, &elf_phdr, err_close); + + phoff += sizeof(core); + + /* The pages and binfmt header */ + memset(&elf_phdr, 0, sizeof(elf_phdr)); + + elf_phdr.p_type = PT_CKPT_PAGES; + elf_phdr.p_flags = PF_R; + elf_phdr.p_offset = phoff; + elf_phdr.p_vaddr = 0; + elf_phdr.p_filesz = nrpages * (sizeof(page_entry)); + elf_phdr.p_memsz = nrpages * (sizeof(page_entry)); + elf_phdr.p_align = 0x1000; + + write_ptr_safe(fd_elf, &elf_phdr, err_close); + + /* Now write real contents for program segments */ + lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET); + while(1) { + read_ptr_safe(fd_core, &vma, err_close); + if (vma.start == 0 && vma.end == 0) + break; + area.vma = vma, pr_info_vma(&area); + write_ptr_safe(fd_elf, &vma, err_close); + } + + write_ptr_safe(fd_elf, &core, err_close); + + /* + * fd_core now sits just past the VMA terminator, so a NULL + * offset makes sendfile() copy from the current position. + */ + if (sendfile(fd_elf, fd_core, NULL, nrpages * (sizeof(page_entry))) != + nrpages * (sizeof(page_entry))) { + pr_perror("Can't send %li bytes to elf\n", + (long)(nrpages * (sizeof(page_entry)))); + goto err_close; /* fd_elf is open here; plain "err" would leak it */ + } + + ret = 0; + +err_close: + close(fd_elf); +err: + return ret; +} diff --git a/gen-offsets.sh b/gen-offsets.sh new file mode 100644 index 000000000..0948aed1c --- /dev/null +++ b/gen-offsets.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +name_ifndef=$1 +name_prefix_offset=$2 +name_blob=$3 +name_objname=$4 +name_bin=$5 + +awk_cmd="{ print \"#define $name_prefix_offset\" \$3 \" 0x\" \$1; }" + +echo "/* Autogenerated file, don't edit */" +echo "#ifndef $name_ifndef" +echo "#define $name_ifndef" +echo "" +nm $name_objname | grep ' [Tt] ' | awk "$awk_cmd" +echo "" +echo "static char $name_blob[] = {" +hexdump -v -e '"\t"' -e '8/1 "0x%02x, "' -e '"\n"' $name_bin +echo "};" +echo "" +echo "#endif /* $name_ifndef */" + diff --git a/include/bitops.h b/include/bitops.h new file mode 100644 index 000000000..c391bd9dd --- /dev/null +++ b/include/bitops.h @@ -0,0 +1,54 @@ +#ifndef CR_BITOPS_H_ +#define CR_BITOPS_H_ + +#ifdef CONFIG_X86_64 + +#define DIV_ROUND_UP(n,d) (((n) + (d) - 
1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, 8 * sizeof(long)) + +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) +#else +#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +static void set_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); +} + +static void change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); +} + +static int test_bit(int nr, volatile const unsigned long *addr) +{ + int oldbit; + + asm volatile("bt %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +static void clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); +} + + +#else /* CONFIG_X86_64 */ +# error x86-32 is not implemented yet +#endif /* CONFIG_X86_64 */ + +#endif /* CR_BITOPS_H_ */ diff --git a/include/compiler.h b/include/compiler.h new file mode 100644 index 000000000..872428276 --- /dev/null +++ b/include/compiler.h @@ -0,0 +1,57 @@ +#ifndef CR_COMPILER_H_ +#define CR_COMPILER_H_ + +/* + * Various definitions for success build, + * picked from various places, mostly from + * the linux kernel. + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) + +#define __section(S) __attribute__ ((__section__(#S))) + +#ifndef __always_inline +# define __always_inline inline __attribute__((always_inline)) +#endif + +#ifndef always_inline +# define always_inline __always_inline +#endif + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#define __round_mask(x, y) ((__typeof__(x))((y) - 1)) +#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; }) + +#define is_log2(v) (((v) & ((v) - 1)) == 0) + +#endif /* CR_COMPILER_H_ */ diff --git a/include/crtools.h b/include/crtools.h new file mode 100644 index 000000000..f8af18695 --- /dev/null +++ b/include/crtools.h @@ -0,0 +1,105 @@ +#ifndef CRTOOLS_H_ +#define CRTOOLS_H_ + +#include <sys/types.h> + +#include "types.h" +#include "list.h" + +#include "image.h" + +extern struct page_entry zero_page_entry; + +int cr_dump_tasks(pid_t pid, bool leader_only, int leave_stopped); +int cr_restore_tasks(pid_t pid, bool leader_only, int leave_stopped); +int cr_show(unsigned long pid, bool leader_only); +int convert_to_elf(char *elf_path, int fd_core); + +#define CR_FD_PERM 0600 + +enum { + CR_FD_FDINFO, + CR_FD_PAGES, + CR_FD_PAGES_SHMEM, + CR_FD_CORE, + CR_FD_PIPES, + CR_FD_PSTREE, + CR_FD_SHMEM, + + CR_FD_MAX +}; + +/* file descriptors template */ +struct cr_fd_desc_tmpl { + const char *fmt; /* format for the name */ + u32 magic; /* magic in the header */ +}; + +/* file descriptors */ +struct cr_fd_desc { + struct cr_fd_desc_tmpl *tmpl; /* template we refer to */ + char name[64]; /* the name, based on pid */ + int fd; /* descriptor for open/close */ +}; + +struct cr_fdset { + struct cr_fd_desc desc[CR_FD_MAX]; + u32 use_mask; /* + * if descriptor get used,set + * bit here + */ +}; + +#define CR_FD_DESC_USE(type) ((1 << (type))) +#define CR_FD_DESC_ALL ((1 << CR_FD_MAX) - 1) +#define CR_FD_DESC_NOPSTREE (CR_FD_DESC_ALL & ~(CR_FD_DESC_USE(CR_FD_PSTREE))) +#define CR_FD_DESC_NONE (0) + + +struct cr_fdset *alloc_cr_fdset(pid_t pid); +int prep_cr_fdset_for_dump(struct cr_fdset *cr_fdset, + unsigned long use_mask); +int prep_cr_fdset_for_restore(struct cr_fdset *cr_fdset, + unsigned long use_mask); +void close_cr_fdset(struct cr_fdset *cr_fdset); +void free_cr_fdset(struct cr_fdset **cr_fdset); + +struct vma_area { + struct list_head list; + struct vma_entry vma; + unsigned long shmid; + int vm_file_fd; +}; + +#define vma_area_has(vma_area, s) 
vma_entry_has(&(vma_area)->vma, s) /* argument parenthesized so any pointer expression works */ +#define vma_entry_len(vma) ((vma)->end - (vma)->start) + +struct pstree_item { + struct list_head list; + pid_t pid; /* leader pid */ + u32 nr_children; /* number of children */ + u32 *children; /* array of children */ +}; + +struct pstree_item_info { + struct list_head list; + + pid_t pid; /* leader pid */ + u32 nr_children; /* number of children */ + u32 *children; /* array of children */ + + bool launched; /* set if launched */ +}; + +static inline unsigned long vma_area_size(struct vma_area *vma) +{ + return vma->vma.end - vma->vma.start; +} + +static inline int in_vma_area(struct vma_area *vma, unsigned long addr) +{ + return addr >= (unsigned long)vma->vma.start && + addr < (unsigned long)vma->vma.end; +} + +#endif /* CRTOOLS_H_ */ diff --git a/include/elf.h b/include/elf.h new file mode 100644 index 000000000..96c992d63 --- /dev/null +++ b/include/elf.h @@ -0,0 +1,507 @@ +#ifndef CR_ELF_H +#define CR_ELF_H + +#include "types.h" + +/* Segment types */ +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_TLS 7 +#define PT_LOOS 0x60000000 +#define PT_HIOS 0x6fffffff +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7fffffff +#define PT_GNU_EH_FRAME 0x6474e550 + +#define PT_CKPT_OFFSET 0x01010101 + +#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1) +#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2) +#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3) + +/* ELF file types */ +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_CKPT 5 +#define ET_LOPROC 0xff00 +#define ET_HIPROC 0xffff + +/* ELF machine types */ +#define EM_NONE 0 +#define EM_M32 1 +#define EM_SPARC 2 +#define EM_386 3 +#define EM_68K 4 +#define EM_88K 5 +#define EM_486 6 /* Not used in Linux at least */ +#define EM_860 7 +#define EM_MIPS 8 /* R3k, bigendian(?) 
*/ +#define EM_MIPS_RS4_BE 10 /* R4k BE */ +#define EM_PARISC 15 +#define EM_SPARC32PLUS 18 +#define EM_PPC 20 +#define EM_PPC64 21 +#define EM_S390 22 +#define EM_SH 42 +#define EM_SPARCV9 43 /* v9 = SPARC64 */ +#define EM_H8_300H 47 +#define EM_H8S 48 +#define EM_IA_64 50 +#define EM_X86_64 62 +#define EM_CRIS 76 +#define EM_V850 87 +#define EM_ALPHA 0x9026 /* Interim Alpha that stuck around */ +#define EM_CYGNUS_V850 0x9080 /* Old v850 ID used by Cygnus */ +#define EM_S390_OLD 0xA390 /* Obsolete interim value for S/390 */ + +/* Dynamic type values */ +#define DT_NULL 0 +#define DT_NEEDED 1 +#define DT_PLTRELSZ 2 +#define DT_PLTGOT 3 +#define DT_HASH 4 +#define DT_STRTAB 5 +#define DT_SYMTAB 6 +#define DT_RELA 7 +#define DT_RELASZ 8 +#define DT_RELAENT 9 +#define DT_STRSZ 10 +#define DT_SYMENT 11 +#define DT_INIT 12 +#define DT_FINI 13 +#define DT_SONAME 14 +#define DT_RPATH 15 +#define DT_SYMBOLIC 16 +#define DT_REL 17 +#define DT_RELSZ 18 +#define DT_RELENT 19 +#define DT_PLTREL 20 +#define DT_DEBUG 21 +#define DT_TEXTREL 22 +#define DT_JMPREL 23 +#define DT_LOPROC 0x70000000 +#define DT_HIPROC 0x7fffffff + +/* Auxiliary table entries */ +#define AT_NULL 0 /* end of vector */ +#define AT_IGNORE 1 /* entry should be ignored */ +#define AT_EXECFD 2 /* file descriptor of program */ +#define AT_PHDR 3 /* program headers for program */ +#define AT_PHENT 4 /* size of program header entry */ +#define AT_PHNUM 5 /* number of program headers */ +#define AT_PAGESZ 6 /* system page size */ +#define AT_BASE 7 /* base address of interpreter */ +#define AT_FLAGS 8 /* flags */ +#define AT_ENTRY 9 /* entry point of program */ +#define AT_NOTELF 10 /* program is not ELF */ +#define AT_UID 11 /* real uid */ +#define AT_EUID 12 /* effective uid */ +#define AT_GID 13 /* real gid */ +#define AT_EGID 14 /* effective gid */ +#define AT_PLATFORM 15 /* string identifying CPU for optimizations */ +#define AT_HWCAP 16 /* arch dependent hints at CPU capabilities */ +#define AT_CLKTCK 
17 /* frequency at which times() increments */ +/* 18..22 = ? */ +#define AT_SECURE 23 /* secure mode boolean */ + +/* Program header permission flags */ +#define PF_X 0x1 +#define PF_W 0x2 +#define PF_R 0x4 + +/* Section header types */ +#define SHT_NULL 0 +#define SHT_PROGBITS 1 +#define SHT_SYMTAB 2 +#define SHT_STRTAB 3 +#define SHT_RELA 4 +#define SHT_HASH 5 +#define SHT_DYNAMIC 6 +#define SHT_NOTE 7 +#define SHT_NOBITS 8 +#define SHT_REL 9 +#define SHT_SHLIB 10 +#define SHT_DYNSYM 11 +#define SHT_NUM 12 +#define SHT_LOPROC 0x70000000 +#define SHT_HIPROC 0x7fffffff +#define SHT_LOUSER 0x80000000 +#define SHT_HIUSER 0xffffffff + +/* Section header flags */ +#define SHF_WRITE (1 << 0) /* Writable */ +#define SHF_ALLOC (1 << 1) /* Occupies memory during execution */ +#define SHF_EXECINSTR (1 << 2) /* Executable */ +#define SHF_MERGE (1 << 4) /* Might be merged */ +#define SHF_STRINGS (1 << 5) /* Contains nul-terminated strings */ +#define SHF_INFO_LINK (1 << 6) /* `sh_info' contains SHT index */ +#define SHF_LINK_ORDER (1 << 7) /* Preserve order after combining */ +#define SHF_OS_NONCONFORMING (1 << 8) /* Non-standard OS specific handling required */ +#define SHF_GROUP (1 << 9) /* Section is member of a group. */ +#define SHF_TLS (1 << 10) /* Section holds thread-local data. */ + +/* Special section numbers */ +#define SHN_UNDEF 0 +#define SHN_LORESERVE 0xff00 +#define SHN_LOPROC 0xff00 +#define SHN_HIPROC 0xff1f +#define SHN_ABS 0xfff1 +#define SHN_COMMON 0xfff2 +#define SHN_HIRESERVE 0xffff + +/* Section align flag */ +#define SHA_ANY 1 /* No alignment constraint */ + +/* Length of magic at the start of a file */ +#define EI_NIDENT 16 + +/* Magic number constants... 
*/ +#define EI_MAG0 0 /* e_ident[] indexes */ +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_PAD 8 + +#define ELFMAG0 0x7f /* EI_MAG */ +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" +#define SELFMAG 4 + +#define ELFCLASSNONE 0 /* EI_CLASS */ +#define ELFCLASS32 1 +#define ELFCLASS64 2 +#define ELFCLASSNUM 3 + +#define ELFDATANONE 0 /* e_ident[EI_DATA] */ +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_NONE 0 /* e_version, EI_VERSION */ +#define EV_CURRENT 1 +#define EV_NUM 2 + +#define ELFOSABI_NONE 0 +#define ELFOSABI_LINUX 3 + +/* Legal values for ST_BIND subfield of st_info (symbol binding). */ +#define STB_LOCAL 0 /* Local symbol */ +#define STB_GLOBAL 1 /* Global symbol */ +#define STB_WEAK 2 /* Weak symbol */ +#define STB_NUM 3 /* Number of defined types. */ +#define STB_LOOS 10 /* Start of OS-specific */ +#define STB_HIOS 12 /* End of OS-specific */ +#define STB_LOPROC 13 /* Start of processor-specific */ +#define STB_HIPROC 15 /* End of processor-specific */ + +/* Symbol types */ +#define STT_NOTYPE 0 /* Symbol type is unspecified */ +#define STT_OBJECT 1 /* Symbol is a data object */ +#define STT_FUNC 2 /* Symbol is a code object */ +#define STT_SECTION 3 /* Symbol associated with a section */ +#define STT_FILE 4 /* Symbol's name is file name */ +#define STT_COMMON 5 /* Symbol is a common data object */ +#define STT_TLS 6 /* Symbol is thread-local data object */ +#define STT_NUM 7 /* Number of defined types. 
*/ + +/* Symbol visibilities */ +#define STV_DEFAULT 0 /* Default symbol visibility rules */ +#define STV_INTERNAL 1 /* Processor specific hidden class */ +#define STV_HIDDEN 2 /* Sym unavailable in other modules */ +#define STV_PROTECTED 3 /* Not preemptible, not exported */ + +/* Both Elf32_Sym and Elf64_Sym use the same one-byte st_info field */ +#define ELF32_ST_BIND(i) ((i) >> 4) +#define ELF32_ST_MKBIND(i) ((i) << 4) /* just a helper */ +#define ELF32_ST_TYPE(i) ((i) & 0xf) +/* binding goes to the high nibble, type to the low nibble */ +#define ELF32_ST_INFO(b, i) (ELF32_ST_MKBIND(b) + ELF32_ST_TYPE(i)) + +#define ELF64_ST_BIND(i) ELF32_ST_BIND(i) +#define ELF64_ST_MKBIND(i) ELF32_ST_MKBIND(i) +#define ELF64_ST_TYPE(i) ELF32_ST_TYPE(i) +#define ELF64_ST_INFO(b, i) ELF32_ST_INFO(b, i) + +/* + * ELF standard typedefs (yet more proof that <stdint.h> was way overdue) + */ + +typedef u16 Elf32_Half; +typedef s16 Elf32_SHalf; +typedef u32 Elf32_Word; +typedef s32 Elf32_Sword; +typedef u64 Elf32_Xword; +typedef s64 Elf32_Sxword; + +typedef u32 Elf32_Off; +typedef u32 Elf32_Addr; +typedef u16 Elf32_Section; + +typedef u16 Elf64_Half; +typedef s16 Elf64_SHalf; +typedef u32 Elf64_Word; +typedef s32 Elf64_Sword; +typedef u64 Elf64_Xword; +typedef s64 Elf64_Sxword; + +typedef u64 Elf64_Off; +typedef u64 Elf64_Addr; +typedef u16 Elf64_Section; + +/* + * Dynamic header + */ + +typedef struct elf32_dyn { + Elf32_Sword d_tag; + union { + Elf32_Sword d_val; + Elf32_Addr d_ptr; + } d_un; +} Elf32_Dyn; + +typedef struct elf64_dyn { + Elf64_Sxword d_tag; + union { + Elf64_Xword d_val; + Elf64_Addr d_ptr; + } d_un; +} Elf64_Dyn; + +/* + * Relocations + */ + +#define ELF32_R_SYM(x) ((x) >> 8) +#define ELF32_R_TYPE(x) ((x) & 0xff) + +typedef struct elf32_rel { + Elf32_Addr r_offset; + Elf32_Word r_info; +} Elf32_Rel; + +typedef struct elf32_rela { + Elf32_Addr r_offset; + Elf32_Word r_info; + Elf32_Sword r_addend; +} Elf32_Rela; + +enum reloc32_type { + R_386_32 = 1, /* ordinary absolute relocation */ + R_386_PC32 = 2, /* PC-relative relocation */ 
+ R_386_GOT32 = 3, /* an offset into GOT */ + R_386_PLT32 = 4, /* a PC-relative offset into PLT */ + R_386_COPY = 5, /* ??? */ + R_386_GLOB_DAT = 6, /* ??? */ + R_386_JUMP_SLOT = 7, /* ??? */ + R_386_RELATIVE = 8, /* ??? */ + R_386_GOTOFF = 9, /* an offset from GOT base */ + R_386_GOTPC = 10, /* a PC-relative offset _to_ GOT */ + R_386_TLS_TPOFF = 14, /* Offset in static TLS block */ + R_386_TLS_IE = 15, /* Address of GOT entry for static TLS block offset */ + + /* These are GNU extensions, but useful */ + R_386_16 = 20, /* A 16-bit absolute relocation */ + R_386_PC16 = 21, /* A 16-bit PC-relative relocation */ + R_386_8 = 22, /* An 8-bit absolute relocation */ + R_386_PC8 = 23 /* An 8-bit PC-relative relocation */ +}; + +#define ELF64_R_SYM(x) ((x) >> 32) +#define ELF64_R_TYPE(x) ((x) & 0xffffffff) + +typedef struct elf64_rel { + Elf64_Addr r_offset; + Elf64_Xword r_info; +} Elf64_Rel; + +typedef struct elf64_rela { + Elf64_Addr r_offset; + Elf64_Xword r_info; + Elf64_Sxword r_addend; +} Elf64_Rela; + +enum reloc64_type { + R_X86_64_NONE = 0, /* No reloc */ + R_X86_64_64 = 1, /* Direct 64 bit */ + R_X86_64_PC32 = 2, /* PC relative 32 bit signed */ + R_X86_64_GOT32 = 3, /* 32 bit GOT entry */ + R_X86_64_PLT32 = 4, /* 32 bit PLT address */ + R_X86_64_COPY = 5, /* Copy symbol at runtime */ + R_X86_64_GLOB_DAT = 6, /* Create GOT entry */ + R_X86_64_JUMP_SLOT = 7, /* Create PLT entry */ + R_X86_64_RELATIVE = 8, /* Adjust by program base */ + R_X86_64_GOTPCREL = 9, /* 32 bit signed PC relative offset to GOT */ + R_X86_64_32 = 10, /* Direct 32 bit zero extended */ + R_X86_64_32S = 11, /* Direct 32 bit sign extended */ + R_X86_64_16 = 12, /* Direct 16 bit zero extended */ + R_X86_64_PC16 = 13, /* 16 bit sign extended pc relative */ + R_X86_64_8 = 14, /* Direct 8 bit sign extended */ + R_X86_64_PC8 = 15, /* 8 bit sign extended pc relative */ + R_X86_64_DTPMOD64 = 16, /* ID of module containing symbol */ + R_X86_64_DTPOFF64 = 17, /* Offset in module's TLS block */ + 
R_X86_64_TPOFF64 = 18, /* Offset in initial TLS block */ + R_X86_64_TLSGD = 19, /* 32 bit signed PC relative offset to two GOT entries for GD symbol */ + R_X86_64_TLSLD = 20, /* 32 bit signed PC relative offset to two GOT entries for LD symbol */ + R_X86_64_DTPOFF32 = 21, /* Offset in TLS block */ + R_X86_64_GOTTPOFF = 22, /* 32 bit signed PC relative offset to GOT entry for IE symbol */ + R_X86_64_TPOFF32 = 23, /* Offset in initial TLS block */ + R_X86_64_PC64 = 24, /* word64 S + A - P */ + R_X86_64_GOTOFF64 = 25, /* word64 S + A - GOT */ + R_X86_64_GOTPC32 = 26, /* word32 GOT + A - P */ + R_X86_64_GOT64 = 27, /* word64 G + A */ + R_X86_64_GOTPCREL64 = 28,/* word64 G + GOT - P + A */ + R_X86_64_GOTPC64 = 29, /* word64 GOT - P + A */ + R_X86_64_GOTPLT64 = 30, /* word64 G + A */ + R_X86_64_PLTOFF64 = 31, /* word64 L - GOT + A */ + R_X86_64_SIZE32 = 32, /* word32 Z + A */ + R_X86_64_SIZE64 = 33, /* word64 Z + A */ + R_X86_64_GOTPC32_TLSDESC = 34, /* word32 */ + R_X86_64_TLSDESC_CALL = 35, /* none */ + R_X86_64_TLSDESC = 36 /* word64 x 2 */ +}; + +/* + * Symbol + */ + +typedef struct elf32_sym { + Elf32_Word st_name; + Elf32_Addr st_value; + Elf32_Word st_size; + unsigned char st_info; + unsigned char st_other; + Elf32_Half st_shndx; +} Elf32_Sym; + +typedef struct elf64_sym { + Elf64_Word st_name; + unsigned char st_info; + unsigned char st_other; + Elf64_Half st_shndx; + Elf64_Addr st_value; + Elf64_Xword st_size; +} Elf64_Sym; + +/* + * Main file header + */ + +typedef struct elf32_hdr { + unsigned char e_ident[EI_NIDENT]; + Elf32_Half e_type; + Elf32_Half e_machine; + Elf32_Word e_version; + Elf32_Addr e_entry; + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf32_Word e_flags; + Elf32_Half e_ehsize; + Elf32_Half e_phentsize; + Elf32_Half e_phnum; + Elf32_Half e_shentsize; + Elf32_Half e_shnum; + Elf32_Half e_shstrndx; +} Elf32_Ehdr; + +typedef struct elf64_hdr { + unsigned char e_ident[EI_NIDENT]; + Elf64_Half e_type; + Elf64_Half e_machine; + Elf64_Word e_version; + 
Elf64_Addr e_entry; + Elf64_Off e_phoff; + Elf64_Off e_shoff; + Elf64_Word e_flags; + Elf64_Half e_ehsize; + Elf64_Half e_phentsize; + Elf64_Half e_phnum; + Elf64_Half e_shentsize; + Elf64_Half e_shnum; + Elf64_Half e_shstrndx; +} Elf64_Ehdr; + +/* + * Program header + */ + +typedef struct elf32_phdr { + Elf32_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf32_Word p_filesz; + Elf32_Word p_memsz; + Elf32_Word p_flags; + Elf32_Word p_align; +} Elf32_Phdr; + +typedef struct elf64_phdr { + Elf64_Word p_type; + Elf64_Word p_flags; + Elf64_Off p_offset; + Elf64_Addr p_vaddr; + Elf64_Addr p_paddr; + Elf64_Xword p_filesz; + Elf64_Xword p_memsz; + Elf64_Xword p_align; +} Elf64_Phdr; + +/* + * Section headers. + */ + +typedef struct elf32_shdr { + Elf32_Word sh_name; + Elf32_Word sh_type; + Elf32_Word sh_flags; + Elf32_Addr sh_addr; + Elf32_Off sh_offset; + Elf32_Word sh_size; + Elf32_Word sh_link; + Elf32_Word sh_info; + Elf32_Word sh_addralign; + Elf32_Word sh_entsize; +} Elf32_Shdr; + +typedef struct elf64_shdr { + Elf64_Word sh_name; + Elf64_Word sh_type; + Elf64_Xword sh_flags; + Elf64_Addr sh_addr; + Elf64_Off sh_offset; + Elf64_Xword sh_size; + Elf64_Word sh_link; + Elf64_Word sh_info; + Elf64_Xword sh_addralign; + Elf64_Xword sh_entsize; +} Elf64_Shdr; + +/* + * Note header + */ +typedef struct elf32_note { + Elf32_Word n_namesz; /* Name size */ + Elf32_Word n_descsz; /* Content size */ + Elf32_Word n_type; /* Content type */ +} Elf32_Nhdr; + +typedef struct elf64_note { + Elf64_Word n_namesz; /* Name size */ + Elf64_Word n_descsz; /* Content size */ + Elf64_Word n_type; /* Content type */ +} Elf64_Nhdr; + +#endif /* CR_ELF_H */ diff --git a/include/image.h b/include/image.h new file mode 100644 index 000000000..956a1f55d --- /dev/null +++ b/include/image.h @@ -0,0 +1,191 @@ +#ifndef CR_IMAGE_H +#define CR_IMAGE_H + +#include "types.h" +#include "compiler.h" + +#define FDINFO_MAGIC 0x01010101 +#define PAGES_MAGIC 0x20202020 
+#define CORE_MAGIC 0xa75b8d43 +#define SHMEM_MAGIC 0x03300330 +#define PIPEFS_MAGIC 0x50495045 +#define PSTREE_MAGIC 0x40044004 +#define PIPES_MAGIC 0x05055050 + +#define FDINFO_FD 1 +#define FDINFO_MAP 2 + +#define PAGE_IMAGE_SIZE 4096 +#define PAGE_RSS 1 + +struct fdinfo_entry { + u8 type; + u8 len; + u16 flags; + u32 pos; + u64 addr; + u8 name[0]; +} __packed; + +struct shmem_entry { + u64 start; + u64 end; + u64 shmid; +} __packed; + +struct pstree_entry { + u32 pid; + u32 nr_children; + u32 children[0]; +} __packed; + +struct pipe_entry { + u32 fd; + u32 pipeid; + u32 flags; + u32 bytes; + u8 data[0]; +} __packed; + +#define VMA_AREA_REGULAR (1 << 0) +#define VMA_AREA_STACK (1 << 1) +#define VMA_AREA_VSYSCALL (1 << 2) +#define VMA_AREA_VDSO (1 << 3) +#define VMA_FORCE_READ (1 << 4) +#define VMA_AREA_HEAP (1 << 5) +#define VMA_FILE_PRIVATE (1 << 6) +#define VMA_FILE_SHARED (1 << 7) +#define VMA_ANON_SHARED (1 << 8) +#define VMA_ANON_PRIVATE (1 << 9) +#define VMA_FORCE_WRITE (1 << 10) +#define VMA_DUMP_ALL (1 << 11) + +#define vma_entry_has(vma, s) (((vma)->status & (s)) == (s)) + +struct vma_entry { + u64 start; + u64 end; + u64 pgoff; + u32 prot; + u32 flags; + u32 status; + u32 pid; + s64 fd; + u64 ino; + u32 dev_maj; + u32 dev_min; +} __packed; + +struct page_entry { + u64 va; + u8 data[PAGE_IMAGE_SIZE]; +} __packed; + +#define HEADER_VERSION 1 +#define HEADER_ARCH_X86_64 1 + +struct image_header { + u16 version; + u16 arch; + u32 flags; +} __packed; + +/* + * PTRACE_GETREGS + * PTRACE_GETFPREGS + * PTRACE_GETFPXREGS dep CONFIG_X86_32 + * PTRACE_GET_THREAD_AREA dep CONFIG_X86_32 || CONFIG_IA32_EMULATION + * PTRACE_GETFDPIC dep CONFIG_BINFMT_ELF_FDPIC + * + * PTRACE_ARCH_PRCTL dep CONFIG_X86_64 + * ARCH_SET_GS/ARCH_GET_FS + * ARCH_SET_FS/ARCH_GET_GS + */ + +#ifdef CONFIG_X86_64 + +struct user_regs_entry { + u64 r15; + u64 r14; + u64 r13; + u64 r12; + u64 bp; + u64 bx; + u64 r11; + u64 r10; + u64 r9; + u64 r8; + u64 ax; + u64 cx; + u64 dx; + u64 si; + u64 di; 
+ u64 orig_ax; + u64 ip; + u64 cs; + u64 flags; + u64 sp; + u64 ss; + u64 fs_base; + u64 gs_base; + u64 ds; + u64 es; + u64 fs; + u64 gs; +} __packed; + +struct desc_struct { + union { + struct { + u32 a; + u32 b; + }; + struct { + u16 limit0; + u16 base0; + unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; + unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; + }; + }; +} __packed; + +struct user_fpregs_entry { + u16 cwd; + u16 swd; + u16 twd; /* Note this is not the same as + the 32bit/x87/FSAVE twd */ + u16 fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + u32 padding[24]; +} __packed; + +#define GDT_ENTRY_TLS_ENTRIES 3 + +struct core_entry { + struct image_header hdr; + struct user_regs_entry gpregs; + struct user_fpregs_entry fpregs; + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + u32 personality; +} __packed; + +#endif /* CONFIG_X86_64 */ + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) ((long) &((TYPE *)0)->MEMBER) +#endif + +/* + * There are always 4 magic bytes at the + * beginning of the every file. + */ +#define MAGIC_OFFSET (sizeof(u32)) +#define GET_FILE_OFF(s, m) (offsetof(s,m) + MAGIC_OFFSET) +#define GET_FILE_OFF_AFTER(s) (sizeof(s) + MAGIC_OFFSET) + +#endif /* CR_IMAGE_H */ diff --git a/include/list.h b/include/list.h new file mode 100644 index 000000000..8a6931643 --- /dev/null +++ b/include/list.h @@ -0,0 +1,286 @@ +#ifndef CR_LIST_H_ +#define CR_LIST_H_ + +/* + * Double linked lists. 
+ */ + +#include "compiler.h" + +#define POISON_POINTER_DELTA 0 +#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) + +struct list_head { + struct list_head *prev, *next; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +static inline void list_replace(struct list_head *old, + struct list_head *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} + +static inline void list_replace_init(struct list_head *old, + struct list_head *new) +{ + list_replace(old, new); + INIT_LIST_HEAD(old); +} + +static inline void list_del_init(struct list_head *entry) +{ + __list_del_entry(entry); + INIT_LIST_HEAD(entry); +} + +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del_entry(list); + list_add(list, head); +} + +static inline void list_move_tail(struct list_head *list, + struct list_head 
*head) +{ + __list_del_entry(list); + list_add_tail(list, head); +} + +static inline int list_is_last(const struct list_head *list, + const struct list_head *head) +{ + return list->next == head; +} + +static inline int list_is_first(const struct list_head *list, + const struct list_head *head) +{ + return list->prev == head; +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} +static inline void list_rotate_left(struct list_head *head) +{ + struct list_head *first; + + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } +} + +static inline int list_is_singular(const struct list_head *head) +{ + return !list_empty(head) && (head->next == head->prev); +} + +static inline void __list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + struct list_head *new_first = entry->next; + list->next = head->next; + list->next->prev = list; + list->prev = entry; + entry->next = list; + head->next = new_first; + new_first->prev = head; +} + +static inline void list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + if (list_empty(head)) + return; + if (list_is_singular(head) && + (head->next != entry && head != entry)) + return; + if (entry == head) + INIT_LIST_HEAD(list); + else + __list_cut_position(list, head, entry); +} + +static inline void __list_splice(const struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +static inline void list_splice(const struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head, 
head->next); +} + +static inline void list_splice_tail(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev, head); +} + +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head, head->next); + INIT_LIST_HEAD(list); + } +} + +static inline void list_splice_tail_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head->prev, head); + INIT_LIST_HEAD(list); + } +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +#define __list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#define list_for_each_prev_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; \ + pos != (head); \ + pos = n, n = pos->prev) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +#define list_prepare_entry(pos, head, member) \ + ((pos) ? 
: list_entry(head, typeof(*pos), member)) + +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +#define list_safe_reset_next(pos, n, member) \ + n = list_entry(pos->member.next, typeof(*pos), member) + +#endif /* CR_LIST_H_ */ diff --git a/include/parasite-syscall.h b/include/parasite-syscall.h new file mode 100644 index 000000000..85b2ad55c --- /dev/null +++ b/include/parasite-syscall.h @@ 
-0,0 +1,46 @@ +#ifndef PARASITE_SYSCALL_H_ +#define PARASITE_SYSCALL_H_ + +#include <sys/types.h> +#include <sys/mman.h> + +#include "compiler.h" +#include "types.h" +#include "list.h" +#include "crtools.h" + +#define BUILTIN_SYSCALL_SIZE 8 + +/* parasite control block */ +struct parasite_ctl { + pid_t pid; /* process where we live */ + struct vma_area *vma_area; /* our space */ + unsigned long parasite_ip; /* service routine start ip */ + unsigned long parasite_complete_ip; /* where we end execution */ + unsigned long addr_cmd; /* addr for command */ + unsigned long addr_args; /* address for arguments */ +}; + +int can_run_syscall(unsigned long ip, unsigned long start, unsigned long end); + +void *mmap_seized(pid_t pid, user_regs_struct_t *regs, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset); + +int munmap_seized(pid_t pid, user_regs_struct_t *regs, + void *addr, size_t length); +int kill_seized(pid_t pid, user_regs_struct_t *where); + + +int syscall_seized(pid_t pid, + user_regs_struct_t *where, + user_regs_struct_t *params, + user_regs_struct_t *result); + +int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_area_list, + struct cr_fdset *cr_fdset, int fd_type); + +struct parasite_ctl *parasite_infect_seized(pid_t pid, void *addr_hint, struct list_head *vma_area_list); +int parasite_cure_seized(struct parasite_ctl **p_ctrl, struct list_head *vma_area_list); + +#endif /* PARASITE_SYSCALL_H_ */ diff --git a/include/parasite.h b/include/parasite.h new file mode 100644 index 000000000..97d4dfc05 --- /dev/null +++ b/include/parasite.h @@ -0,0 +1,68 @@ +#ifndef CR_PARASITE_H_ +#define CR_PARASITE_H_ + +#include "compiler.h" +#include "syscall.h" +#include "image.h" + +#define __parasite_head __used __section(.parasite.head.text) +#define __parasite_text __used __section(.parasite.text) +#define __parasite_stack __used __section(.parasite.stack) + +#define PARASITE_STACK_SIZE 2048 +#define PARASITE_ARG_SIZE 256 
+#define PARASITE_BRK_SIZE 32768 + +#define PARASITE_MAX_SIZE (64 << 10) + +/* we need own error code for diagnostics */ +#define PARASITE_ERR_FAIL -1024 +#define PARASITE_ERR_OPEN -1025 +#define PARASITE_ERR_MMAP -1026 +#define PARASITE_ERR_MINCORE -1027 +#define PARASITE_ERR_MUNMAP -1028 +#define PARASITE_ERR_CLOSE -1029 +#define PARASITE_ERR_WRITE -1030 +#define PARASITE_ERR_MPROTECT -1031 +#define PARASITE_ERR_CORE_VMA -1032 +#define PARASITE_ERR_CORE_PAGE -1033 + +enum { + PARASITE_CMD_NONE, + PARASITE_CMD_KILLME, + PARASITE_CMD_PINGME, + PARASITE_CMD_DUMPPAGES, + PARASITE_CMD_RESTORECORE, + + PARASITE_CMD_MAX, +}; + +typedef struct { + unsigned long command; + unsigned long args_size; + void *args; +} parasite_args_t; + +typedef struct { + struct vma_entry vma_entry; + unsigned long nrpages_dumped; /* how many pages are dumped */ + unsigned long fd; + unsigned long open_mode; + unsigned long open_flags; + char open_path[64]; +} parasite_args_cmd_dumppages_t; + +/* + * Some useful offsets + */ + +#define PARASITE_ARGS_ADDR(start) \ + ((start) + parasite_blob_offset__parasite_args) +#define PARASITE_CMD_ADDR(start) \ + ((start) + parasite_blob_offset__parasite_cmd) +#define PARASITE_HEAD_ADDR(start) \ + ((start) + parasite_blob_offset__parasite_head_start) +#define PARASITE_COMPLETE_ADDR(start) \ + ((start) + parasite_blob_offset__parasite_service_complete) + +#endif /* CR_PARASITE_H_ */ diff --git a/include/rbtree.h b/include/rbtree.h new file mode 100644 index 000000000..af8e51cc0 --- /dev/null +++ b/include/rbtree.h @@ -0,0 +1,79 @@ +/* + * RBtree implementation adopted from the Linux + * kernel sources. 
+ */ + +#ifndef _LINUX_RBTREE_H +#define _LINUX_RBTREE_H + +#include <stddef.h> + +#define RB_RED 0 +#define RB_BLACK 1 +#define RB_COLOR_MASK 3 + +struct rb_node { + unsigned long rb_parent_color; + struct rb_node *rb_right; + struct rb_node *rb_left; +} __attribute__((aligned(sizeof(long)))); + +struct rb_root { + struct rb_node *rb_node; +}; + + +#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_COLOR_MASK)) +#define rb_color(r) ((r)->rb_parent_color & RB_BLACK) +#define rb_is_red(r) (!rb_color(r)) +#define rb_is_black(r) rb_color(r) +#define rb_set_red(r) do { (r)->rb_parent_color &= ~RB_BLACK; } while (0) +#define rb_set_black(r) do { (r)->rb_parent_color |= RB_BLACK; } while (0) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->rb_parent_color = (rb->rb_parent_color & RB_COLOR_MASK) |(unsigned long)p; +} + +static inline void rb_set_color(struct rb_node *rb, int color) +{ + rb->rb_parent_color = (rb->rb_parent_color & ~RB_BLACK) | color; +} + +#define RB_ROOT (struct rb_root) { NULL, } +#define rb_entry(ptr, type, member) \ + container_of(ptr, type, member) + +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_NODE(node) (rb_parent(node) == node) +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) + +static inline void rb_init_node(struct rb_node *rb) +{ + rb->rb_parent_color = 0; + rb->rb_right = NULL; + rb->rb_left = NULL; + RB_CLEAR_NODE(rb); +} + +void rb_insert_color(struct rb_node *, struct rb_root *); +void rb_erase(struct rb_node *, struct rb_root *); + +struct rb_node *rb_next(const struct rb_node *node); +struct rb_node *rb_prev(const struct rb_node *node); +struct rb_node *rb_first(const struct rb_root *node); +struct rb_node *rb_last(const struct rb_root *node); + +void rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root); + +static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) +{ + 
node->rb_parent_color = (unsigned long )parent; + node->rb_left = node->rb_right = NULL; + + *rb_link = node; +} + +#endif /* _LINUX_RBTREE_H */ diff --git a/include/syscall.h b/include/syscall.h new file mode 100644 index 000000000..6691171fd --- /dev/null +++ b/include/syscall.h @@ -0,0 +1,181 @@ +#ifndef CR_SYSCALL_H_ +#define CR_SYSCALL_H_ + +#include <sys/types.h> + +#include "compiler.h" + +#ifdef CONFIG_X86_64 + +static long syscall0(int nr) +{ + long ret; + asm volatile("syscall" + : "=a" (ret) + : "a" (nr) + : "rcx", "r11", "memory"); /* the syscall insn overwrites rcx and r11 */ + return ret; +} + +static long syscall1(int nr, unsigned long arg0) +{ + long ret; + asm volatile("syscall" + : "=a" (ret) + : "a" (nr), "D" (arg0) + : "rcx", "r11", "memory"); + return ret; +} + +static long syscall2(int nr, unsigned long arg0, unsigned long arg1) +{ + long ret; + asm volatile("syscall" + : "=a" (ret) + : "a" (nr), "D" (arg0), "S" (arg1) + : "rcx", "r11", "memory"); + return ret; +} + +static long syscall3(int nr, unsigned long arg0, unsigned long arg1, + unsigned long arg2) +{ + long ret; + asm volatile("syscall" + : "=a" (ret) + : "a" (nr), "D" (arg0), "S" (arg1), "d" (arg2) + : "rcx", "r11", "memory"); + return ret; +} + +static long syscall4(int nr, unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3) +{ + register unsigned long r10 asm("r10") = arg3; + long ret; + + /* r10 must be an asm operand, else the compiler may never load it */ + asm volatile("syscall" + : "=a" (ret) + : "a" (nr), "D" (arg0), "S" (arg1), "d" (arg2), "r" (r10) + : "rcx", "r11", "memory"); + return ret; +} + +static long syscall5(int nr, unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4) +{ + register unsigned long r10 asm("r10") = arg3; + register unsigned long r8 asm("r8") = arg4; + long ret; + + /* r10 and r8 are passed as asm operands so that the */ + /* compiler has to load them before the syscall insn */ + asm volatile("syscall" + : "=a" (ret) + : "a" (nr), "D" (arg0), "S" (arg1), "d" (arg2), "r" (r10), "r" (r8) + : "rcx", "r11", "memory"); + return ret; +} + +static long syscall6(int nr, unsigned long arg0, unsigned long arg1, + unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long
arg5) +{ + register unsigned long r10 asm("r10") = arg3; + register unsigned long r8 asm("r8") = arg4; + register unsigned long r9 asm("r9") = arg5; + long ret; + + /* + * r10/r8/r9 are asm operands so the compiler must load them; syscall trashes rcx/r11. + */ + asm volatile("syscall" + : "=a" (ret) + : "a" (nr), "D" (arg0), "S" (arg1), "d" (arg2), "r" (r10), "r" (r8), "r" (r9) + : "rcx", "r11", "memory"); + return ret; +} + +/* + * syscall codes + */ +#define __NR_read 0 +#define __NR_write 1 +#define __NR_open 2 +#define __NR_close 3 +#define __NR_lseek 8 +#define __NR_mmap 9 +#define __NR_mprotect 10 +#define __NR_munmap 11 +#define __NR_mincore 27 +#define __NR_dup 32 +#define __NR_dup2 33 +#define __NR_pause 34 +#define __NR_nanosleep 35 +#define __NR_getpid 39 +#define __NR_exit 60 + +static unsigned long sys_pause(void) +{ + return syscall0(__NR_pause); +} + +static unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, unsigned long offset) +{ + return syscall6(__NR_mmap, (unsigned long)addr, + len, prot, flags, fd, offset); +} + +static unsigned long sys_munmap(void *addr,unsigned long len) +{ + return syscall2(__NR_munmap, (unsigned long)addr, len); +} + +static long sys_open(const char *filename, unsigned long flags, unsigned long mode) +{ + return syscall3(__NR_open, (unsigned long)filename, flags, mode); +} + +static long sys_close(int fd) +{ + return syscall1(__NR_close, fd); +} + +static long sys_write(unsigned long fd, const void *buf, unsigned long count) +{ + return syscall3(__NR_write, fd, (unsigned long)buf, count); +} + +static long sys_mincore(unsigned long addr, unsigned long size, void *vec) +{ + return syscall3(__NR_mincore, addr, size, (unsigned long)vec); +} + +static long sys_lseek(unsigned long fd, unsigned long offset, unsigned long origin) +{ + return syscall3(__NR_lseek, fd, offset, origin); +} + +static long sys_mprotect(unsigned long start, unsigned long len, unsigned long prot) +{ + return syscall3(__NR_mprotect, start, len, prot); +} + +static long sys_nanosleep(struct
timespec *req, struct timespec *rem) +{ + return syscall2(__NR_nanosleep, (unsigned long)req, (unsigned long)rem); +} + +static long sys_read(unsigned long fd, void *buf, unsigned long count) +{ + return syscall3(__NR_read, fd, (unsigned long)buf, count); +} + +#else /* CONFIG_X86_64 */ +# error x86-32 bit mode not yet implemented +#endif /* CONFIG_X86_64 */ + +#endif /* CR_SYSCALL_H_ */ diff --git a/include/types.h b/include/types.h new file mode 100644 index 000000000..5e636df30 --- /dev/null +++ b/include/types.h @@ -0,0 +1,132 @@ +#ifndef CR_TYPES_H_ +#define CR_TYPES_H_ + +#include <stdint.h> +#include <stdbool.h> + +#include "bitops.h" + +/* some constants for ptrace */ +#define PTRACE_SEIZE 0x4206 +#define PTRACE_INTERRUPT 0x4207 +#define PTRACE_LISTEN 0x4208 + +#define PTRACE_SEIZE_DEVEL 0x80000000 + +#define PTRACE_EVENT_FORK 1 +#define PTRACE_EVENT_VFORK 2 +#define PTRACE_EVENT_CLONE 3 +#define PTRACE_EVENT_EXEC 4 +#define PTRACE_EVENT_VFORK_DONE 5 +#define PTRACE_EVENT_EXIT 6 +#define PTRACE_EVENT_STOP 7 + +#define PTRACE_O_TRACESYSGOOD 0x00000001 +#define PTRACE_O_TRACEFORK 0x00000002 +#define PTRACE_O_TRACEVFORK 0x00000004 +#define PTRACE_O_TRACECLONE 0x00000008 +#define PTRACE_O_TRACEEXEC 0x00000010 +#define PTRACE_O_TRACEVFORKDONE 0x00000020 +#define PTRACE_O_TRACEEXIT 0x00000040 + +/* fcntl */ +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif +#ifndef F_SETPIPE_SZ +# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#endif +#ifndef F_GETPIPE_SZ +# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) +#endif + +#define CLONE_CHILD_USEPID 0x02000000 +#define CLONE_VFORK 0x00004000 + +typedef uint64_t u64; +typedef int64_t s64; +typedef unsigned int u32; +typedef signed int s32; +typedef unsigned short u16; +typedef signed short s16; +typedef unsigned char u8; +typedef signed char s8; + +#define MAJOR(dev) ((dev)>>8) + +#ifdef CONFIG_X86_64 + +typedef struct { + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned 
long r12; + unsigned long bp; + unsigned long bx; + unsigned long r11; + unsigned long r10; + unsigned long r9; + unsigned long r8; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long si; + unsigned long di; + unsigned long orig_ax; + unsigned long ip; + unsigned long cs; + unsigned long flags; + unsigned long sp; + unsigned long ss; + unsigned long fs_base; + unsigned long gs_base; + unsigned long ds; + unsigned long es; + unsigned long fs; + unsigned long gs; +} user_regs_struct_t; + +typedef struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; /* Note this is not the same as + the 32bit/x87/FSAVE twd */ + unsigned short fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + u32 padding[24]; +} user_fpregs_struct_t; + +#else /* CONFIG_X86_64 */ + +typedef struct { + unsigned long bx; + unsigned long cx; + unsigned long dx; + unsigned long si; + unsigned long di; + unsigned long bp; + unsigned long ax; + unsigned long ds; + unsigned long es; + unsigned long fs; + unsigned long gs; + unsigned long orig_ax; + unsigned long ip; + unsigned long cs; + unsigned long flags; + unsigned long sp; + unsigned long ss; +} user_regs_struct_t; + +#endif /* CONFIG_X86_64 */ + +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif + +#endif /* CR_TYPES_H_ */ diff --git a/include/util.h b/include/util.h new file mode 100644 index 000000000..86d71e06f --- /dev/null +++ b/include/util.h @@ -0,0 +1,178 @@ +#ifndef UTIL_H_ +#define UTIL_H_ + +/* + * Some bits are stolen from perf and kvm tools + */ +#include <string.h> +#include <stdlib.h> +#include <signal.h> + +#include <sys/types.h> + +#include "compiler.h" +#include "types.h" + +extern void printk(const char *format, ...); + +#define pr_info(fmt, ...) printk(fmt, ##__VA_ARGS__) +#define pr_error(fmt, ...) 
printk("Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_panic(fmt, ...) printk("PANIC (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define pr_warning(fmt, ...) printk("Warning: " fmt, ##__VA_ARGS__) + +#define pr_error_jmp(label) \ + do { \ + printk("EJMP: %s:%d\n", __FILE__, __LINE__); \ + goto label; \ + } while (0) + +#define jerr(code, label) \ + do { \ + if ((code)) \ + pr_error_jmp(label); \ + } while (0) + +#define jerr_cond(code, cond, label) \ + do { \ + if ((code) cond) \ + pr_error_jmp(label); \ + } while (0) + +#define jerr_rc(code, rc, label) \ + do { \ + rc = (code); \ + if (rc) \ + pr_error_jmp(label); \ + } while (0) + +#if 0 +#define pr_debug(fmt, ...) \ + do { \ + printk("%s (%s:%d): " fmt, \ + __func__, __FILE__, __LINE__, \ + ##__VA_ARGS__); \ + } while (0) +#else +#define pr_debug(fmt, ...) +#endif + +#define die(fmt, ...) \ + do { \ + printk("die (%s:%d): " fmt, __FILE__, \ + __LINE__, ##__VA_ARGS__); \ + exit(1); \ + } while (0) + +#define pr_perror(fmt, ...) 
\ + do { \ + pr_error("%s: " fmt, strerror(errno), \ + ##__VA_ARGS__); \ + } while (0) + +#define stop_task(pid) kill(pid, SIGSTOP) +#define continue_task(pid) kill(pid, SIGCONT) + +#define write_ptr(fd, ptr) \ + write(fd, (ptr), sizeof(*(ptr))) + +#define write_ptr_safe(fd, ptr, err) \ + jerr(write_ptr(fd, ptr) != sizeof(*(ptr)), err) + +#define write_safe(fd, ptr, size, err) \ + jerr(write(fd, (ptr), (size)) != (size), err) + +#define write_safe_imm(fd, imm, err) \ + do { \ + typeof(imm) x__ = imm; \ + write_ptr_safe(fd, &x__, err); \ + } while (0) + +#define read_safe(fd, ptr, size, err) \ + jerr(read(fd, ptr, (size)) != (size), err) + +#define read_ptr_safe(fd, ptr, err) \ + jerr(read(fd, ptr, sizeof(*(ptr))) != sizeof(*(ptr)), err) + +#define read_safe_eof(fd, ptr, size, rc, err, eof) \ + do { \ + rc = read(fd, ptr, (size)); \ + if (!rc) \ + goto eof; \ + if (rc != (size)) \ + goto err; \ + } while (0) + +#define read_ptr_safe_eof(fd, ptr, rc, err, eof) \ + read_safe_eof(fd, ptr, sizeof(*(ptr)), rc, err, eof) + +int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); +int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); +int ptrace_show_area(pid_t pid, void *addr, long bytes); +int ptrace_show_area_r(pid_t pid, void *addr, long bytes); + +int seize_task(pid_t pid); +int unseize_task(pid_t pid); + +void printk_registers(user_regs_struct_t *regs); +void printk_siginfo(siginfo_t *siginfo); + +struct vma_area; +struct list_head; + +void printk_vma(struct vma_area *vma_area); + +/* A special marker */ +#define is_ending_vma(vma) ((vma)->start == 0 && (vma)->end == 0) + +#define pr_info_vma_list(head) \ + do { \ + struct vma_area *vma; \ + list_for_each_entry(vma, head, list) \ + pr_info_vma(vma); \ + } while (0) + +#define alloc_vma_area() \ + ({ \ + struct vma_area *p__ = xzalloc(sizeof(*p__)); \ + if (p__) { \ + p__->shmid = -1; \ + p__->vm_file_fd = -1; \ + p__->vma.fd = -1; \ + } \ + p__; \ + }) + +#define pr_info_vma(vma_area) 
printk_vma(vma_area) +#define pr_info_registers(regs) printk_registers(regs) +#define pr_info_siginfo(siginfo) printk_siginfo(siginfo) + +int reopen_fd_as(int new_fd, int old_fd); +int parse_maps(pid_t pid, struct list_head *vma_list); + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op( __VA_ARGS__ ); \ + if (!___p) \ + pr_error("%s: Can't allocate %li bytes\n", \ + __func__, (long)(size)); \ + ___p; \ + }) + +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) free(p) /* free(NULL) is a no-op; a bare "if" here would capture a following else */ + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -1; \ + void *new = xrealloc(*(pptr), size); \ + if (new) { \ + *(pptr) = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#endif /* UTIL_H_ */ diff --git a/kernel/binfmt-elf-for-cr-4 b/kernel/binfmt-elf-for-cr-4 new file mode 100644 index 000000000..b7218fbf6 --- /dev/null +++ b/kernel/binfmt-elf-for-cr-4 @@ -0,0 +1,636 @@ +elf: Add support for loading files + +This patch add ability to run checkpoint files by enhancing +Elf file format.
+ +Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> +--- + arch/x86/include/asm/elf.h | 3 + arch/x86/vdso/vma.c | 22 ++ + fs/binfmt_elf.c | 404 ++++++++++++++++++++++++++++++++++++++++++++- + include/linux/elf_ckpt.h | 135 +++++++++++++++ + 4 files changed, 562 insertions(+), 2 deletions(-) + +Index: linux-2.6.git/arch/x86/include/asm/elf.h +=================================================================== +--- linux-2.6.git.orig/arch/x86/include/asm/elf.h ++++ linux-2.6.git/arch/x86/include/asm/elf.h +@@ -314,7 +314,8 @@ struct linux_binprm; + #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 + extern int arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp); +- ++extern int arch_setup_additional_pages_at(struct linux_binprm *bprm, ++ void *addr, int uses_interp); + extern int syscall32_setup_pages(struct linux_binprm *, int exstack); + #define compat_arch_setup_additional_pages syscall32_setup_pages + +Index: linux-2.6.git/arch/x86/vdso/vma.c +=================================================================== +--- linux-2.6.git.orig/arch/x86/vdso/vma.c ++++ linux-2.6.git/arch/x86/vdso/vma.c +@@ -137,6 +137,28 @@ up_fail: + return ret; + } + ++int arch_setup_additional_pages_at(struct linux_binprm *bprm, void *addr, int uses_interp) ++{ ++ struct mm_struct *mm = current->mm; ++ int ret; ++ ++ if (!vdso_enabled) ++ return 0; ++ ++ down_write(&mm->mmap_sem); ++ current->mm->context.vdso = addr; ++ ret = install_special_mapping(mm, (unsigned long)addr, vdso_size, ++ VM_READ | VM_EXEC | ++ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | ++ VM_ALWAYSDUMP, ++ vdso_pages); ++ if (ret) ++ current->mm->context.vdso = NULL; ++ ++ up_write(&mm->mmap_sem); ++ return ret; ++} ++ + static __init int vdso_setup(char *s) + { + vdso_enabled = simple_strtoul(s, NULL, 0); +Index: linux-2.6.git/fs/binfmt_elf.c +=================================================================== +--- linux-2.6.git.orig/fs/binfmt_elf.c ++++ linux-2.6.git/fs/binfmt_elf.c +@@ -36,6 +36,11 @@ + 
#include <asm/param.h> + #include <asm/page.h> + ++#include <linux/elf_ckpt.h> ++#include <linux/flex_array.h> ++#include <asm/tlbflush.h> ++#include <asm/desc.h> ++ + static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); + static int load_elf_library(struct file *); + static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, +@@ -556,6 +561,395 @@ static unsigned long randomize_stack_top + #endif + } + ++#ifdef CONFIG_X86_64 ++ ++static int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs, ++ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr) ++{ ++ struct thread_struct *thread = &current->thread; ++ struct elf_phdr *elf_phdr_pages; ++ struct elf_phdr *elf_phdr_core; ++ struct flex_array *fa = NULL; ++ struct vma_entry *vma_entry_ptr; ++ int nr_vma_found, nr_vma_mapped; ++ struct vma_entry vma_entry; ++ struct file *file = NULL; ++ unsigned long elf_entry; ++ unsigned long map_addr; ++ ++ unsigned long start_code, end_code, start_data, end_data; ++ unsigned long start_brk, brk, start_stack; ++ unsigned long elf_bss, elf_brk; ++ unsigned long vdso; ++ ++ struct core_entry core_entry; ++ int i, ret = -ENOEXEC; ++ loff_t off; ++ ++ int cpu, seg; ++ ++ BUILD_BUG_ON(CKPT_GDT_ENTRY_TLS_ENTRIES != GDT_ENTRY_TLS_ENTRIES); ++ BUILD_BUG_ON(CKPT_PAGE_SIZE != PAGE_SIZE); ++ ++ elf_phdr_core = NULL; ++ elf_phdr_pages = NULL; ++ nr_vma_found = 0; ++ nr_vma_mapped = 0; ++ ++ elf_bss = 0; ++ elf_brk = 0; ++ ++ start_code = -1UL; ++ end_code = 0; ++ ++ start_data = -1UL; ++ end_data = 0; ++ ++ start_stack = -1UL; ++ start_brk = -1UL; ++ brk = -1UL; ++ ++ vdso = -1UL; ++ ++ fa = flex_array_alloc(sizeof(vma_entry), elf_ex->e_phnum, GFP_KERNEL); ++ if (!fa || flex_array_prealloc(fa, 0, elf_ex->e_phnum, GFP_KERNEL)) { ++ ret = -ENOMEM; ++ if (fa) { ++ flex_array_free(fa); ++ fa = NULL; ++ } ++ goto out; /* leave on either failure, including fa == NULL */ ++ } ++ ++ /* Flush all traces of the currently running executable */ ++ ret = flush_old_exec(bprm); ++ if (ret) ++ goto out; ++ ++ /*
No return point */ ++ current->flags &= ~PF_FORKNOEXEC; ++ current->mm->def_flags = 0; ++ ++ /* ++ * We don't care about parameters passed (such as argc, argv, env) ++ * when execute checkpoint file because we're to substitute ++ * all the things anyway -- so drop any previous memory mappings. ++ */ ++ do_munmap(current->mm, 0, TASK_SIZE); ++ ++ SET_PERSONALITY(*elf_ex); ++ ++ for (i = 0; i < elf_ex->e_phnum; i++) { ++ ++ switch (elf_phdr[i].p_type) { ++ case PT_CKPT_VMA: ++ ret = kernel_read(bprm->file, elf_phdr[i].p_offset, ++ (char *)&vma_entry, sizeof(vma_entry)); ++ if (ret != sizeof(vma_entry)) { ++ pr_err("elf-ckpt: Can't read vma_entry\n"); ++ ret = -EIO; ++ goto out; ++ } ++ if (flex_array_put(fa, nr_vma_found, &vma_entry, GFP_KERNEL)) /* stored densely: read back as 0..nr_vma_found-1 */ ++ BUG(); ++ ++ /* We need to know if there is executable stack */ ++ if (vma_entry.status & VMA_AREA_STACK) { ++ if (vma_entry.prot & PROT_EXEC) /* PROT_* bits live in ->prot, ->flags holds MAP_* */ ++ current->personality |= READ_IMPLIES_EXEC; ++ } ++ ++ nr_vma_found++; ++ continue; ++ case PT_CKPT_CORE: ++ elf_phdr_core = &elf_phdr[i]; ++ continue; ++ case PT_CKPT_PAGES: ++ elf_phdr_pages = &elf_phdr[i]; ++ continue; ++ default: ++ continue; ++ } ++ } ++ ++ /* Be sure it has the file structure we expect to see. */ ++ if (!elf_phdr_pages || !elf_phdr_core || !nr_vma_found) { ++ send_sig(SIGKILL, current, 0); ++ ret = -ENOEXEC; ++ goto out; ++ } ++ ++ /* ++ * VMA randomization still needs to be set (just in case if ++ * the program we restore will exec something else later).
*/ ++ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) ++ current->flags |= PF_RANDOMIZE; ++ ++ setup_new_exec(bprm); ++ ++ current->mm->free_area_cache = current->mm->mmap_base; ++ current->mm->cached_hole_size = 0; ++ ++ for (i = 0; i < nr_vma_found; i++) { ++ vma_entry_ptr = flex_array_get(fa, i); ++ ++ if (vma_entry_ptr->status & VMA_AREA_HEAP) { ++ start_brk = vma_entry_ptr->start; ++ } ++ ++ if (vma_entry_ptr->status & VMA_AREA_VDSO) { ++ vdso = vma_entry_ptr->start; ++ } ++ ++ if (!(vma_entry_ptr->status & VMA_AREA_REGULAR)) ++ continue; ++ ++ if (vma_entry_ptr->fd != -1) { ++ file = fget((unsigned int)vma_entry_ptr->fd); ++ if (!file) { ++ send_sig(SIGKILL, current, 0); ++ ret = -EBADF; ++ goto out_unmap; ++ } ++ ++ /* Keep ->fd as the descriptor number: do_close() */ ++ /* below is called with it after the do_mmap() call. */ ++ } else ++ file = NULL; ++ ++ down_write(&current->mm->mmap_sem); ++ map_addr = do_mmap(file, ++ vma_entry_ptr->start, ++ vma_entry_ptr->end - vma_entry_ptr->start, ++ vma_entry_ptr->prot, ++ vma_entry_ptr->flags | MAP_FIXED, ++ vma_entry_ptr->pgoff); ++ up_write(&current->mm->mmap_sem); ++ ++ if (file) { ++ fput(file); ++ do_close((unsigned int)vma_entry_ptr->fd); ++ } ++ ++ if (BAD_ADDR(map_addr)) { ++ send_sig(SIGKILL, current, 0); ++ ret = IS_ERR((void *)map_addr) ? PTR_ERR((void*)map_addr) : -EINVAL; ++ goto out_unmap; ++ } ++ ++ /* ++ * FIXME ++ * Some heuristics to guess previously loaded real ++ * elf file structure. Probably this things should ++ * be exported via /proc somewhere instead. ++ */ ++ ++ if (vma_entry_ptr->status & VMA_AREA_STACK) { ++ /* Note if stack is VM_GROWSUP -- it should be reversed */ ++ start_stack = vma_entry_ptr->start; ++ } ++ ++ if (vma_entry_ptr->prot & PROT_EXEC) { ++ if (start_code > vma_entry_ptr->start) ++ start_code = vma_entry_ptr->start; ++ if (end_code < vma_entry_ptr->end) ++ end_code = vma_entry_ptr->end; ++ } else { ++ /* ++ * Neither .bss nor .data was being file mapped.
++ * FIXME: .rodata are loaded by interp. ++ */ ++ if (!file) { ++ if (vma_entry_ptr->prot & (PROT_WRITE)) { ++ if (start_data > vma_entry_ptr->start) ++ start_data = vma_entry_ptr->start; ++ if (end_data < vma_entry_ptr->end) ++ end_data = vma_entry_ptr->end; ++ } ++ } ++ } ++ ++ nr_vma_mapped++; ++ } ++ ++#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES ++ if (vdso == -1UL) { ++ pr_err("elf-ckpt: Can't find VDSO address\n"); ++ ret = -ENOEXEC; ++ goto out_unmap; ++ } ++#endif ++ ++ /* Restore core data */ ++ ret = kernel_read(bprm->file, elf_phdr_core->p_offset, ++ (char *)&core_entry, sizeof(core_entry)); ++ if (ret != sizeof(core_entry)) { ++ pr_err("elf-ckpt: Can't read core_entry\n"); ++ ret = -EIO; ++ goto out_unmap; ++ } ++ ++ elf_entry = core_entry.gpregs.ip; ++ bprm->p = start_stack; ++ ++ current->mm->start_code = start_code; ++ current->mm->end_code = end_code; ++ current->mm->start_data = start_data; ++ current->mm->end_data = end_data; ++ current->mm->start_stack = start_stack; ++ current->mm->start_brk = start_brk; ++ current->mm->brk = brk; ++ ++#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES ++ ret = arch_setup_additional_pages_at(bprm, (void *)vdso, 0); ++ if (ret) { ++ pr_err("elf-ckpt: Can't setup additional pages at %lx with %d\n", ++ vdso, ret); ++ goto out_unmap; ++ } ++#endif ++ ++ /* ++ * Restore pages ++ */ ++ off = elf_phdr_pages->p_offset; ++ while (1) { ++ struct vm_area_struct *vma; ++ struct page *page; ++ void *page_data; ++ __u64 va; ++ ++ ret = kernel_read(bprm->file, off, (char *)&va, sizeof(va)); ++ if (ret != sizeof(va)) { ++ pr_err("elf-ckpt: Can't read page virtual address: " ++ "ret = %d off = %lx\n", ret, (unsigned long)off); ++ ret = -EIO; ++ goto out_unmap; ++ } ++ ++ /* End of pages reached */ ++ if (!va) ++ break; ++ ++ vma = find_vma(current->mm, (unsigned long)va); ++ if (!vma) { ++ pr_err("elf-ckpt: No VMA for page: %16lx\n", (unsigned long)va); ++ ret = -ESRCH; ++ goto out_unmap; ++ } ++ ++ ret = get_user_pages(current, current->mm, 
(unsigned long)va, ++ 1, 1, 1, &page, NULL); ++ if (ret != 1) { ++ pr_err("elf-ckpt: Can't get user page: %16lx\n", (unsigned long)va); ++ ret = -EFAULT; ++ goto out_unmap; ++ } ++ ++ page_data = kmap(page); ++ ret = kernel_read(bprm->file, off + sizeof(va), page_data, PAGE_SIZE); ++ kunmap(page); ++ put_page(page); ++ ++ if (ret != PAGE_SIZE) { ++ pr_err("elf-ckpt: Can't read data on page: %16lx\n", (unsigned long)va); ++ ret = -EFAULT; ++ goto out_unmap; ++ } ++ ++ off += sizeof(va) + PAGE_SIZE; ++ } ++ ++ set_binfmt(&elf_format); ++ ++ /* ++ * Registers setup. Since we might be modifying ++ * MSRs we must be sure the task won't be preempted ++ * until the modification is complete. Every GPR kept ++ * in core_entry.gpregs is copied back, r9 included. ++ */ ++ cpu = get_cpu(); ++ ++ regs->ip = core_entry.gpregs.ip; ++ regs->sp = core_entry.gpregs.sp; ++ regs->cs = core_entry.gpregs.cs; ++ regs->ss = core_entry.gpregs.ss; ++ regs->flags = core_entry.gpregs.flags; ++ regs->r15 = core_entry.gpregs.r15; ++ regs->r14 = core_entry.gpregs.r14; ++ regs->r13 = core_entry.gpregs.r13; ++ regs->r12 = core_entry.gpregs.r12; ++ regs->bp = core_entry.gpregs.bp; ++ regs->bx = core_entry.gpregs.bx; ++ regs->r11 = core_entry.gpregs.r11; ++ regs->r10 = core_entry.gpregs.r10; ++ regs->r9 = core_entry.gpregs.r9; ++ regs->r8 = core_entry.gpregs.r8; ++ regs->ax = core_entry.gpregs.ax; ++ regs->cx = core_entry.gpregs.cx; ++ regs->dx = core_entry.gpregs.dx; ++ regs->si = core_entry.gpregs.si; ++ regs->di = core_entry.gpregs.di; ++ regs->orig_ax = core_entry.gpregs.orig_ax; ++ ++ thread->usersp = core_entry.gpregs.sp; ++ thread->ds = core_entry.gpregs.ds; ++ thread->es = core_entry.gpregs.es; ++ thread->fs = core_entry.gpregs.fs; ++ thread->gs = core_entry.gpregs.gs; ++ ++ thread->fsindex = thread->fs; ++ thread->gsindex = thread->gs; ++ ++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) { ++ thread->tls_array[i].a = core_entry.tls_array[i].a; ++ thread->tls_array[i].b = core_entry.tls_array[i].b; ++ } ++ ++ load_TLS(thread, cpu); ++ ++ seg = thread->fsindex; ++ loadsegment(fs, seg); ++
savesegment(fs, seg); ++ ++ if (seg != thread->fsindex) { ++ pr_err("Fixup on FS loading exception: %i %i\n", ++ thread->fsindex, seg); ++ } ++ ++ if (core_entry.gpregs.fs_base) ++ wrmsrl(MSR_FS_BASE, core_entry.gpregs.fs_base); ++ ++ if (core_entry.gpregs.gs_base) ++ wrmsrl(MSR_GS_BASE, core_entry.gpregs.gs_base); ++ ++ put_cpu(); ++ ++ ret = 0; ++out: ++ if (fa) ++ flex_array_free(fa); ++ return ret; ++ ++out_unmap: /* munmap the first nr_vma_mapped entries, then leave via out */ ++ for (i = 0; i < nr_vma_mapped; i++) { ++ vma_entry_ptr = flex_array_get(fa, i); ++ down_write(&current->mm->mmap_sem); ++ do_munmap(current->mm, vma_entry_ptr->start, ++ vma_entry_ptr->end - vma_entry_ptr->start); ++ up_write(&current->mm->mmap_sem); ++ } ++ goto out; ++} ++#else ++static int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs, ++ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr) ++{ ++ return -ENOEXEC; ++} ++#endif ++ + static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) + { + struct file *interpreter = NULL; /* to shut gcc up */ +@@ -592,7 +986,9 @@ static int load_elf_binary(struct linux_ + if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + goto out; + +- if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) ++ if (loc->elf_ex.e_type != ET_EXEC && ++ loc->elf_ex.e_type != ET_DYN && ++ loc->elf_ex.e_type != ET_CKPT) + goto out; + if (!elf_check_arch(&loc->elf_ex)) + goto out; +@@ -619,6 +1015,12 @@ static int load_elf_binary(struct linux_ + goto out_free_ph; + } + ++ if (loc->elf_ex.e_type == ET_CKPT) { ++ retval = load_elf_ckpt(bprm, regs, &loc->elf_ex, ++ (struct elf_phdr *)elf_phdata); ++ goto out_free_ph; ++ } ++ + elf_ppnt = elf_phdata; + elf_bss = 0; + elf_brk = 0; +Index: linux-2.6.git/include/linux/elf_ckpt.h +=================================================================== +--- /dev/null ++++ linux-2.6.git/include/linux/elf_ckpt.h +@@ -0,0 +1,135 @@ ++#ifndef _LINUX_ELF_CHECKPOINT_H ++#define _LINUX_ELF_CHECKPOINT_H ++ ++#include <linux/types.h> ++#include <linux/elf-em.h> ++
++#ifdef __KERNEL__ ++ ++#include <asm/elf.h> ++ ++/* ++ * Elf extension includes new Elf file type ++ * and program header types as well. ++ */ ++#define ET_CKPT 5 ++ ++#define PT_CKPT_OFFSET 0x01010101 ++ ++#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1) ++#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2) ++#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3) ++ ++#define CKPT_PAGE_SIZE 4096 ++#define CKPT_GDT_ENTRY_TLS_ENTRIES 3 ++ ++#define HEADER_VERSION 1 ++#define HEADER_ARCH_X86_64 1 ++ ++#define VMA_AREA_REGULAR (1 << 0) ++#define VMA_AREA_STACK (1 << 1) ++#define VMA_AREA_VSYSCALL (1 << 2) ++#define VMA_AREA_VDSO (1 << 3) ++#define VMA_FORCE_READ (1 << 4) ++#define VMA_AREA_HEAP (1 << 5) ++#define VMA_FILE_PRIVATE (1 << 6) ++#define VMA_FILE_SHARED (1 << 7) ++#define VMA_ANON_SHARED (1 << 8) ++#define VMA_ANON_PRIVATE (1 << 9) ++#define VMA_FORCE_WRITE (1 << 10) ++ ++struct vma_entry { ++ __u64 start; ++ __u64 end; ++ __u64 pgoff; ++ __u32 prot; ++ __u32 flags; ++ __u32 status; ++ __u32 pid; ++ __s64 fd; ++ __u64 ino; ++ __u32 dev_maj; ++ __u32 dev_min; ++} __packed; ++ ++struct page_entry { ++ __u64 va; ++ __u8 data[CKPT_PAGE_SIZE]; ++} __packed; ++ ++struct image_header { ++ __u16 version; ++ __u16 arch; ++ __u32 flags; ++} __packed; ++ ++struct user_regs_entry { ++ __u64 r15; ++ __u64 r14; ++ __u64 r13; ++ __u64 r12; ++ __u64 bp; ++ __u64 bx; ++ __u64 r11; ++ __u64 r10; ++ __u64 r9; ++ __u64 r8; ++ __u64 ax; ++ __u64 cx; ++ __u64 dx; ++ __u64 si; ++ __u64 di; ++ __u64 orig_ax; ++ __u64 ip; ++ __u64 cs; ++ __u64 flags; ++ __u64 sp; ++ __u64 ss; ++ __u64 fs_base; ++ __u64 gs_base; ++ __u64 ds; ++ __u64 es; ++ __u64 fs; ++ __u64 gs; ++} __packed; ++ ++struct desc_struct_entry { ++ union { ++ struct { ++ __u32 a; ++ __u32 b; ++ }; ++ struct { ++ __u16 limit0; ++ __u16 base0; ++ unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; ++ unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; ++ }; ++ }; ++} __packed; ++ ++struct user_fpregs_entry { ++ __u16 
cwd; ++ __u16 swd; ++ __u16 twd; ++ __u16 fop; ++ __u64 rip; ++ __u64 rdp; ++ __u32 mxcsr; ++ __u32 mxcsr_mask; ++ __u32 st_space[32]; ++ __u32 xmm_space[64]; ++ __u32 padding[24]; ++} __packed; ++ ++struct core_entry { ++ struct image_header header; ++ struct user_regs_entry gpregs; ++ struct user_fpregs_entry fpregs; ++ struct desc_struct tls_array[CKPT_GDT_ENTRY_TLS_ENTRIES]; ++ __u32 personality; ++} __packed; ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _LINUX_ELF_CHECKPOINT_H */ diff --git a/kernel/cr-clone-with-pid-support b/kernel/cr-clone-with-pid-support new file mode 100644 index 000000000..cc4caf407 --- /dev/null +++ b/kernel/cr-clone-with-pid-support @@ -0,0 +1,172 @@ +Allow processes to be created with specified pid + +We will need it to restore processes so they would not +even notice that they were being checkpointed. + +Signed-off-by: Pavel Emelyanov <xemul@openvz.org> +--- + include/linux/pid.h | 2 - + include/linux/sched.h | 1 + kernel/fork.c | 10 ++++++- + kernel/pid.c | 70 ++++++++++++++++++++++++++++++++++++-------------- + 4 files changed, 62 insertions(+), 21 deletions(-) + +Index: linux-2.6.git/include/linux/pid.h +=================================================================== +--- linux-2.6.git.orig/include/linux/pid.h ++++ linux-2.6.git/include/linux/pid.h +@@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr); + extern struct pid *find_ge_pid(int nr, struct pid_namespace *); + int next_pidmap(struct pid_namespace *pid_ns, unsigned int last); + +-extern struct pid *alloc_pid(struct pid_namespace *ns); ++extern struct pid *alloc_pid(struct pid_namespace *ns, int pid); + extern void free_pid(struct pid *pid); + + /* +Index: linux-2.6.git/include/linux/sched.h +=================================================================== +--- linux-2.6.git.orig/include/linux/sched.h ++++ linux-2.6.git/include/linux/sched.h +@@ -23,6 +23,7 @@ + #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ + /* 0x02000000 was previously 
the unused CLONE_STOPPED (Start in stopped state) + and is now available for re-use. */ ++#define CLONE_CHILD_USEPID 0x02000000 /* use the given pid */ + #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ + #define CLONE_NEWIPC 0x08000000 /* New ipcs */ + #define CLONE_NEWUSER 0x10000000 /* New user namespace */ +Index: linux-2.6.git/kernel/fork.c +=================================================================== +--- linux-2.6.git.orig/kernel/fork.c ++++ linux-2.6.git/kernel/fork.c +@@ -1239,8 +1239,16 @@ static struct task_struct *copy_process( + goto bad_fork_cleanup_io; + + if (pid != &init_struct_pid) { ++ int want_pid = 0; ++ ++ if (clone_flags & CLONE_CHILD_USEPID) { ++ retval = get_user(want_pid, child_tidptr); ++ if (retval) ++ goto bad_fork_cleanup_io; ++ } ++ + retval = -ENOMEM; +- pid = alloc_pid(p->nsproxy->pid_ns); ++ pid = alloc_pid(p->nsproxy->pid_ns, want_pid); + if (!pid) + goto bad_fork_cleanup_io; + } +Index: linux-2.6.git/kernel/pid.c +=================================================================== +--- linux-2.6.git.orig/kernel/pid.c ++++ linux-2.6.git/kernel/pid.c +@@ -159,11 +159,55 @@ static void set_last_pid(struct pid_name + } while ((prev != last_write) && (pid_before(base, last_write, pid))); + } + +-static int alloc_pidmap(struct pid_namespace *pid_ns) ++static int alloc_pidmap_page(struct pidmap *map) ++{ ++ if (unlikely(!map->page)) { ++ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); ++ /* ++ * Free the page if someone raced with us ++ * installing it: ++ */ ++ spin_lock_irq(&pidmap_lock); ++ if (!map->page) { ++ map->page = page; ++ page = NULL; ++ } ++ spin_unlock_irq(&pidmap_lock); ++ kfree(page); ++ if (unlikely(!map->page)) ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int set_pidmap(struct pid_namespace *pid_ns, int pid) ++{ ++ int offset; ++ struct pidmap *map; ++ ++ offset = pid & BITS_PER_PAGE_MASK; ++ map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; ++ ++ if (alloc_pidmap_page(map) < 0) ++ return -ENOMEM; ++ ++ 
if (!test_and_set_bit(offset, map->page)) { ++ atomic_dec(&map->nr_free); ++ return pid; ++ } ++ ++ return -EBUSY; ++} ++ ++static int alloc_pidmap(struct pid_namespace *pid_ns, int desired_pid) + { + int i, offset, max_scan, pid, last = pid_ns->last_pid; + struct pidmap *map; + ++ if (desired_pid) ++ return set_pidmap(pid_ns, desired_pid); ++ + pid = last + 1; + if (pid >= pid_max) + pid = RESERVED_PIDS; +@@ -176,22 +220,9 @@ static int alloc_pidmap(struct pid_names + */ + max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; + for (i = 0; i <= max_scan; ++i) { +- if (unlikely(!map->page)) { +- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); +- /* +- * Free the page if someone raced with us +- * installing it: +- */ +- spin_lock_irq(&pidmap_lock); +- if (!map->page) { +- map->page = page; +- page = NULL; +- } +- spin_unlock_irq(&pidmap_lock); +- kfree(page); +- if (unlikely(!map->page)) +- break; +- } ++ if (alloc_pidmap_page(map) < 0) ++ break; ++ + if (likely(atomic_read(&map->nr_free))) { + do { + if (!test_and_set_bit(offset, map->page)) { +@@ -277,7 +308,7 @@ void free_pid(struct pid *pid) + call_rcu(&pid->rcu, delayed_put_pid); + } + +-struct pid *alloc_pid(struct pid_namespace *ns) ++struct pid *alloc_pid(struct pid_namespace *ns, int this_ns_pid) + { + struct pid *pid; + enum pid_type type; +@@ -291,13 +322,14 @@ struct pid *alloc_pid(struct pid_namespa + + tmp = ns; + for (i = ns->level; i >= 0; i--) { +- nr = alloc_pidmap(tmp); ++ nr = alloc_pidmap(tmp, this_ns_pid); + if (nr < 0) + goto out_free; + + pid->numbers[i].nr = nr; + pid->numbers[i].ns = tmp; + tmp = tmp->parent; ++ this_ns_pid = 0; + } + + get_pid_ns(ns); diff --git a/kernel/cr-proc-add-children b/kernel/cr-proc-add-children new file mode 100644 index 000000000..d307a6024 --- /dev/null +++ b/kernel/cr-proc-add-children @@ -0,0 +1,46 @@ +proc: Introduce the Children: line in /proc/<pid>/status + +From: Pavel Emelyanov <xemul@parallels.com> + +Although we can get the pids of some task's 
issue, it is just +more convenient to have them this way. + +Signed-off-by: Pavel Emelyanov <xemul@parallels.com> +Acked-by: Serge Hallyn <serge.hallyn@canonical.com> +Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> +--- + fs/proc/array.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +Index: linux-2.6.git/fs/proc/array.c +=================================================================== +--- linux-2.6.git.orig/fs/proc/array.c ++++ linux-2.6.git/fs/proc/array.c +@@ -158,6 +158,18 @@ static inline const char *get_task_state + return *p; + } + ++static void task_children(struct seq_file *m, struct task_struct *p, struct pid_namespace *ns) ++{ ++ struct task_struct *c; ++ ++ seq_printf(m, "Children:"); ++ read_lock(&tasklist_lock); ++ list_for_each_entry(c, &p->children, sibling) ++ seq_printf(m, " %d", pid_nr_ns(task_pid(c), ns)); ++ read_unlock(&tasklist_lock); ++ seq_putc(m, '\n'); ++} ++ + static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *p) + { +@@ -192,6 +204,8 @@ static inline void task_state(struct seq + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); + ++ task_children(m, p, ns); ++ + task_lock(p); + if (p->files) + fdt = files_fdtable(p->files); diff --git a/kernel/cr-proc-map-files-21 b/kernel/cr-proc-map-files-21 new file mode 100644 index 000000000..ccf8fbf94 --- /dev/null +++ b/kernel/cr-proc-map-files-21 @@ -0,0 +1,522 @@ +fs, proc: Introduce the /proc/<pid>/map_files/ directory v14 + +From: Pavel Emelyanov <xemul@parallels.com> + +This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks, +one for each mapping with a file; the name of a symlink is "vma->vm_start-vma->vm_end", +the target is the file. Opening a symlink results in a file that points exactly +to the same inode as the vma's one. 
+ +For example, the ls -l output of some arbitrary /proc/<pid>/map_files/ directory: + + | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so + | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1 + | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0 + | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so + | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so + +This *helps* the checkpointing process in three ways: + +1. When dumping a task's mappings we know the exact file that is mapped by a particular + region. We do this by opening the /proc/$pid/map_files/$address symlink the way we do + with file descriptors. + +2. This also helps in determining which anonymous shared mappings are shared with + each other by comparing their inodes. + +3. When restoring a set of processes, in case two of them have a shared mapping, we map + the memory by the 1st one and then open its /proc/$pid/map_files/$address file and + map it by the 2nd task. + +Using /proc/$pid/maps for this is quite inconvenient since it requires repeated +re-reading and reparsing of this text file, which slows down the restore procedure +significantly. Also, as pointed out in (3), it is way easier to use a top level +shared mapping in children as /proc/$pid/map_files/$address when needed. 
+ +v2: (spotted by Tejun Heo) + - /proc/<pid>/mfd changed to /proc/<pid>/map_files + - find_vma helper is used instead of linear search + - routines are re-grouped + - d_revalidate is set now + +v3: + - d_revalidate reworked, now it should drop no-longer-valid dentries (Tejun Heo) + - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov) + - because of filldir (which eventually might need to lock mmap_sem) + the proc_map_files_readdir() was reworked to call proc_fill_cache() + with unlocked mmap_sem + +v4: (feedback by Tejun Heo and Vasiliy Kulikov) + - instead of saving data in proc_inode we rather make a dentry name + to keep both vm_start and vm_end accordingly + - d_revalidate now honors task credentials + +v5: (feedback by Kirill A. Shutemov) + - don't forget to release mmap_sem on error path + +v6: + - sizeof is used in map_files_info, which shrinks the member a bit on + x86-32 (by Kirill A. Shutemov) + - map_name_to_addr returns -EINVAL instead of -1 + which is more appropriate (by Tejun Heo) + +v7: + - add [get/set]attr handlers for + proc_map_files_inode_operations (by Vasiliy Kulikov) + +v8: + - Kirill A. Shutemov spotted a stray semicolon + which ruined the ptrace_check call, fixed. + +v9: (feedback by Andrew Morton) + - find_exact_vma moved into include/linux/mm.h as an inline helper + - proc_map_files_setattr uses either kmalloc or vmalloc depending + on how many objects are to be allocated + - no more map_name_to_addr but dname_to_vma_addr introduced instead + and it uses sscanf; because in one case find_exact_vma() is used + only to confirm the existence of a vma area, a boolean flag is used + - fancy justification dropped + - still the proc_map_files_get/setattr are left untouched + until additional fd/ patches are applied first. 
+ +v10: (feedback by Andrew Morton) + - flex_arrays are used instead of kmalloc/vmalloc calls + - map_files_d_revalidate uses ptrace_may_access for + security reasons (by Vasiliy Kulikov) + +v11: + - should use fput and drop !ret test from a loop code + (feedback by Andrew Morton) + - no need for 'used' variable, use existing + nr_files with file->pos predicate + - if preallocation fails no need to go further, + simply release mmap semaphore and jump out + +v12: + - rework map_files_d_revalidate to make sure + the task gets released on return (by Vasiliy Kulikov) + +v13: + - proc_map_files_inode_operations are set to be the same + as proc_fd_inode_operations, i.e. to include .permission + pointing to proc_fd_permission + +v14: (by Vasiliy Kulikov) + - for security reasons map_files/ entries are accessible only to + readers granted CAP_SYS_ADMIN credentials + +Signed-off-by: Pavel Emelyanov <xemul@parallels.com> +Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> +Reviewed-by: Vasiliy Kulikov <segoon@openwall.com> +CC: Tejun Heo <tj@kernel.org> +CC: Vasiliy Kulikov <segoon@openwall.com> +CC: "Kirill A. 
Shutemov" <kirill@shutemov.name> +CC: Alexey Dobriyan <adobriyan@gmail.com> +CC: Al Viro <viro@ZenIV.linux.org.uk> +CC: Andrew Morton <akpm@linux-foundation.org> +CC: Pavel Machek <pavel@ucw.cz> +--- + fs/proc/base.c | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + include/linux/mm.h | 12 + + 2 files changed, 357 insertions(+) + +Index: linux-2.6.git/fs/proc/base.c +=================================================================== +--- linux-2.6.git.orig/fs/proc/base.c ++++ linux-2.6.git/fs/proc/base.c +@@ -83,6 +83,7 @@ + #include <linux/pid_namespace.h> + #include <linux/fs_struct.h> + #include <linux/slab.h> ++#include <linux/flex_array.h> + #ifdef CONFIG_HARDWALL + #include <asm/hardwall.h> + #endif +@@ -133,6 +134,8 @@ struct pid_entry { + NULL, &proc_single_file_operations, \ + { .proc_show = show } ) + ++static int proc_fd_permission(struct inode *inode, int mask); ++ + /* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. +@@ -2201,6 +2204,347 @@ static const struct file_operations proc + }; + + /* ++ * dname_to_vma_addr - maps a dentry name into two unsigned longs ++ * which represent vma start and end addresses. 
++ */ ++static int dname_to_vma_addr(struct dentry *dentry, ++ unsigned long *start, unsigned long *end) ++{ ++ if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) ++{ ++ unsigned long vm_start, vm_end; ++ bool exact_vma_exists = false; ++ struct mm_struct *mm = NULL; ++ struct task_struct *task; ++ const struct cred *cred; ++ struct inode *inode; ++ int status = 0; ++ ++ if (nd && nd->flags & LOOKUP_RCU) ++ return -ECHILD; ++ ++ if (!capable(CAP_SYS_ADMIN)) { ++ status = -EACCES; ++ goto out_notask; ++ } ++ ++ inode = dentry->d_inode; ++ task = get_proc_task(inode); ++ if (!task) ++ goto out_notask; ++ ++ if (!ptrace_may_access(task, PTRACE_MODE_READ)) ++ goto out; ++ ++ mm = get_task_mm(task); ++ if (!mm) ++ goto out; ++ ++ if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { ++ down_read(&mm->mmap_sem); ++ exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); ++ up_read(&mm->mmap_sem); ++ } ++ ++ mmput(mm); ++ ++ if (exact_vma_exists) { ++ if (task_dumpable(task)) { ++ rcu_read_lock(); ++ cred = __task_cred(task); ++ inode->i_uid = cred->euid; ++ inode->i_gid = cred->egid; ++ rcu_read_unlock(); ++ } else { ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ } ++ security_task_to_inode(task, inode); ++ status = 1; ++ } ++ ++out: ++ put_task_struct(task); ++ ++out_notask: ++ if (status <= 0) ++ d_drop(dentry); ++ ++ return status; ++} ++ ++static const struct dentry_operations tid_map_files_dentry_operations = { ++ .d_revalidate = map_files_d_revalidate, ++ .d_delete = pid_delete_dentry, ++}; ++ ++static int proc_map_files_get_link(struct dentry *dentry, struct path *path) ++{ ++ unsigned long vm_start, vm_end; ++ struct vm_area_struct *vma; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ int rc; ++ ++ rc = -ENOENT; ++ task = get_proc_task(dentry->d_inode); ++ if (!task) ++ goto out; ++ ++ mm = get_task_mm(task); ++ 
put_task_struct(task); ++ if (!mm) ++ goto out; ++ ++ rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); ++ if (rc) ++ goto out_mmput; ++ ++ down_read(&mm->mmap_sem); ++ vma = find_exact_vma(mm, vm_start, vm_end); ++ if (vma && vma->vm_file) { ++ *path = vma->vm_file->f_path; ++ path_get(path); ++ rc = 0; ++ } ++ up_read(&mm->mmap_sem); ++ ++out_mmput: ++ mmput(mm); ++out: ++ return rc; ++} ++ ++struct map_files_info { ++ struct file *file; ++ unsigned long len; ++ unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ ++}; ++ ++static struct dentry * ++proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, ++ struct task_struct *task, const void *ptr) ++{ ++ const struct file *file = ptr; ++ struct proc_inode *ei; ++ struct inode *inode; ++ ++ if (!file) ++ return ERR_PTR(-ENOENT); ++ ++ inode = proc_pid_make_inode(dir->i_sb, task); ++ if (!inode) ++ return ERR_PTR(-ENOENT); ++ ++ ei = PROC_I(inode); ++ ei->op.proc_get_link = proc_map_files_get_link; ++ ++ inode->i_op = &proc_pid_link_inode_operations; ++ inode->i_size = 64; ++ inode->i_mode = S_IFLNK; ++ ++ if (file->f_mode & FMODE_READ) ++ inode->i_mode |= S_IRUSR; ++ if (file->f_mode & FMODE_WRITE) ++ inode->i_mode |= S_IWUSR; ++ ++ d_set_d_op(dentry, &tid_map_files_dentry_operations); ++ d_add(dentry, inode); ++ ++ return NULL; ++} ++ ++static struct dentry *proc_map_files_lookup(struct inode *dir, ++ struct dentry *dentry, struct nameidata *nd) ++{ ++ unsigned long vm_start, vm_end; ++ struct vm_area_struct *vma; ++ struct task_struct *task; ++ struct dentry *result; ++ struct mm_struct *mm; ++ ++ result = ERR_PTR(-EACCES); ++ if (!capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ result = ERR_PTR(-ENOENT); ++ task = get_proc_task(dir); ++ if (!task) ++ goto out; ++ ++ result = ERR_PTR(-EACCES); ++ if (lock_trace(task)) ++ goto out_put_task; ++ ++ result = ERR_PTR(-ENOENT); ++ if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) ++ goto out_unlock; ++ ++ mm = get_task_mm(task); ++ if (!mm) ++ goto 
out_unlock; ++ ++ down_read(&mm->mmap_sem); ++ vma = find_exact_vma(mm, vm_start, vm_end); ++ if (!vma) ++ goto out_no_vma; ++ ++ result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); ++ ++out_no_vma: ++ up_read(&mm->mmap_sem); ++ mmput(mm); ++out_unlock: ++ unlock_trace(task); ++out_put_task: ++ put_task_struct(task); ++out: ++ return result; ++} ++ ++static const struct inode_operations proc_map_files_inode_operations = { ++ .lookup = proc_map_files_lookup, ++ .permission = proc_fd_permission, ++ .setattr = proc_setattr, ++}; ++ ++static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) ++{ ++ struct dentry *dentry = filp->f_path.dentry; ++ struct inode *inode = dentry->d_inode; ++ struct vm_area_struct *vma; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ ino_t ino; ++ int ret; ++ ++ ret = -EACCES; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ ret = -ENOENT; ++ task = get_proc_task(inode); ++ if (!task) ++ goto out; ++ ++ ret = -EACCES; ++ if (lock_trace(task)) ++ goto out_put_task; ++ ++ ret = 0; ++ switch (filp->f_pos) { ++ case 0: ++ ino = inode->i_ino; ++ if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) ++ goto out_unlock; ++ filp->f_pos++; ++ case 1: ++ ino = parent_ino(dentry); ++ if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) ++ goto out_unlock; ++ filp->f_pos++; ++ default: ++ { ++ unsigned long nr_files, pos, i; ++ struct flex_array *fa = NULL; ++ struct map_files_info info; ++ struct map_files_info *p; ++ ++ mm = get_task_mm(task); ++ if (!mm) ++ goto out_unlock; ++ down_read(&mm->mmap_sem); ++ ++ nr_files = 0; ++ ++ /* ++ * We need two passes here: ++ * ++ * 1) Collect vmas of mapped files with mmap_sem taken ++ * 2) Release mmap_sem and instantiate entries ++ * ++ * otherwise we get lockdep complained, since filldir() ++ * routine might require mmap_sem taken in might_fault(). 
++ */ ++ ++ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { ++ if (vma->vm_file && ++pos > filp->f_pos) ++ nr_files++; ++ } ++ ++ if (nr_files) { ++ fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL); ++ if (!fa || flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) { ++ ret = -ENOMEM; ++ if (fa) ++ flex_array_free(fa); ++ up_read(&mm->mmap_sem); ++ mmput(mm); ++ goto out_unlock; ++ } ++ for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { ++ if (!vma->vm_file) ++ continue; ++ if (++pos <= filp->f_pos) ++ continue; ++ ++ get_file(vma->vm_file); ++ info.file = vma->vm_file; ++ info.len = snprintf(info.name, sizeof(info.name), ++ "%lx-%lx", vma->vm_start, ++ vma->vm_end); ++ if (flex_array_put(fa, i++, &info, GFP_KERNEL)) ++ BUG(); ++ } ++ } ++ up_read(&mm->mmap_sem); ++ ++ for (i = 0; i < nr_files; i++) { ++ p = flex_array_get(fa, i); ++ ret = proc_fill_cache(filp, dirent, filldir, ++ p->name, p->len, ++ proc_map_files_instantiate, ++ task, p->file); ++ if (ret) ++ break; ++ filp->f_pos++; ++ fput(p->file); ++ } ++ for (; i < nr_files; i++) { ++ /* ++ * In case of error don't forget ++ * to put rest of file refs. ++ */ ++ p = flex_array_get(fa, i); ++ fput(p->file); ++ } ++ if (fa) ++ flex_array_free(fa); ++ mmput(mm); ++ } ++ } ++ ++out_unlock: ++ unlock_trace(task); ++out_put_task: ++ put_task_struct(task); ++out: ++ return ret; ++} ++ ++static const struct file_operations proc_map_files_operations = { ++ .read = generic_read_dir, ++ .readdir = proc_map_files_readdir, ++ .llseek = default_llseek, ++}; ++ ++/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). 
+ */ +@@ -2815,6 +3159,7 @@ static const struct inode_operations pro + static const struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), ++ DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), + #ifdef CONFIG_NET +Index: linux-2.6.git/include/linux/mm.h +=================================================================== +--- linux-2.6.git.orig/include/linux/mm.h ++++ linux-2.6.git/include/linux/mm.h +@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st + return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + } + ++/* Look up the first VMA which exactly match the interval vm_start ... vm_end */ ++static inline struct vm_area_struct * ++find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) ++{ ++ struct vm_area_struct *vma = find_vma(mm, vm_start); ++ ++ if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) ++ vma = NULL; ++ ++ return vma; ++} ++ + #ifdef CONFIG_MMU + pgprot_t vm_get_page_prot(unsigned long vm_flags); + #else diff --git a/kernel/cr-statfs-callback-for-pipefs b/kernel/cr-statfs-callback-for-pipefs new file mode 100644 index 000000000..6fae692af --- /dev/null +++ b/kernel/cr-statfs-callback-for-pipefs @@ -0,0 +1,27 @@ +vfs: Add ->statfs callback for pipefs + +From: Pavel Emelyanov <xemul@parallels.com> + +This is done to make it possible to distinguish pipes +from fifos when opening one via /proc/<pid>/fd/ link. 
+ +Signed-off-by: Pavel Emelyanov <xemul@parallels.com> +Reviewed-by: Tejun Heo <tj@kernel.org> +Acked-by: Serge Hallyn <serge.hallyn@canonical.com> +Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> +--- + fs/pipe.c | 1 + + 1 file changed, 1 insertion(+) + +Index: linux-2.6.git/fs/pipe.c +=================================================================== +--- linux-2.6.git.orig/fs/pipe.c ++++ linux-2.6.git/fs/pipe.c +@@ -1254,6 +1254,7 @@ out: + + static const struct super_operations pipefs_ops = { + .destroy_inode = free_inode_nonrcu, ++ .statfs = simple_statfs, + }; + + /* diff --git a/kernel/fs-add-do-close b/kernel/fs-add-do-close new file mode 100644 index 000000000..d19ea6ae0 --- /dev/null +++ b/kernel/fs-add-do-close @@ -0,0 +1,86 @@ +fs: Add do_close helper + +To be able to close file descriptors right from inside +kernel space do_close() helper is added. We need it at +checkpoint restore time. + +Signed-off-by: Pavel Emelyanov <xemul@parallels.com> +Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> +--- + fs/open.c | 32 ++++++++++++++++++++------------ + include/linux/fs.h | 1 + + 2 files changed, 21 insertions(+), 12 deletions(-) + +Index: linux-2.6.git/fs/open.c +=================================================================== +--- linux-2.6.git.orig/fs/open.c ++++ linux-2.6.git/fs/open.c +@@ -1056,17 +1056,11 @@ int filp_close(struct file *filp, fl_own + + EXPORT_SYMBOL(filp_close); + +-/* +- * Careful here! We test whether the file pointer is NULL before +- * releasing the fd. This ensures that one clone task can't release +- * an fd while another clone is opening it. 
+- */ +-SYSCALL_DEFINE1(close, unsigned int, fd) ++int do_close(unsigned int fd) + { + struct file * filp; + struct files_struct *files = current->files; + struct fdtable *fdt; +- int retval; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); +@@ -1079,7 +1073,25 @@ SYSCALL_DEFINE1(close, unsigned int, fd) + FD_CLR(fd, fdt->close_on_exec); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); +- retval = filp_close(filp, files); ++ ++ return filp_close(filp, files); ++ ++out_unlock: ++ spin_unlock(&files->file_lock); ++ return -EBADF; ++} ++EXPORT_SYMBOL_GPL(do_close); ++ ++/* ++ * Careful here! We test whether the file pointer is NULL before ++ * releasing the fd. This ensures that one clone task can't release ++ * an fd while another clone is opening it. ++ */ ++SYSCALL_DEFINE1(close, unsigned int, fd) ++{ ++ int retval; ++ ++ retval = do_close(fd); + + /* can't restart close syscall because file table entry was cleared */ + if (unlikely(retval == -ERESTARTSYS || +@@ -1089,10 +1101,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) + retval = -EINTR; + + return retval; +- +-out_unlock: +- spin_unlock(&files->file_lock); +- return -EBADF; + } + EXPORT_SYMBOL(sys_close); + +Index: linux-2.6.git/include/linux/fs.h +=================================================================== +--- linux-2.6.git.orig/include/linux/fs.h ++++ linux-2.6.git/include/linux/fs.h +@@ -2027,6 +2027,7 @@ extern struct file *file_open_root(struc + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, + const struct cred *); + extern int filp_close(struct file *, fl_owner_t id); ++extern int do_close(unsigned int fd); + extern char * getname(const char __user *); + + /* fs/ioctl.c */ diff --git a/kernel/fs-proc-add-tls b/kernel/fs-proc-add-tls new file mode 100644 index 000000000..eb0d9f620 --- /dev/null +++ b/kernel/fs-proc-add-tls @@ -0,0 +1,45 @@ +fs, proc: Add /proc/$pid/tls entry + +To be able to restart checkpointed tasks we need +to know TLS 
status at dumping time. Export this +information by /proc/$pid/tls entry. + +Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> +--- + fs/proc/base.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +Index: linux-2.6.git/fs/proc/base.c +=================================================================== +--- linux-2.6.git.orig/fs/proc/base.c ++++ linux-2.6.git/fs/proc/base.c +@@ -3150,6 +3150,21 @@ static int proc_pid_personality(struct s + return err; + } + ++static int proc_pid_tls(struct seq_file *m, struct pid_namespace *ns, ++ struct pid *pid, struct task_struct *task) ++{ ++ int err = lock_trace(task); ++ if (!err) { ++ int i; ++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) ++ seq_printf(m, "%x %x\n", ++ task->thread.tls_array[i].a, ++ task->thread.tls_array[i].b); ++ unlock_trace(task); ++ } ++ return err; ++} ++ + /* + * Thread groups + */ +@@ -3169,6 +3184,7 @@ static const struct pid_entry tgid_base_ + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUGO, proc_pid_personality), ++ ONE("tls", S_IRUGO, proc_pid_tls), + INF("limits", S_IRUGO, proc_pid_limits), + #ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), diff --git a/kernel/fs-proc-switch-to-dentry b/kernel/fs-proc-switch-to-dentry new file mode 100644 index 000000000..4f29d286e --- /dev/null +++ b/kernel/fs-proc-switch-to-dentry @@ -0,0 +1,108 @@ +fs, proc: Make proc_get_link to use dentry instead of inode + +This patch prepares the ground for the next "map_files" +patch which needs a name of a link file to analyse. + +So instead of squashing this change into one big +patch the separate one is done. + +Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> +CC: Pavel Emelyanov <xemul@parallels.com> +CC: Tejun Heo <tj@kernel.org> +CC: Vasiliy Kulikov <segoon@openwall.com> +CC: "Kirill A. 
Shutemov" <kirill@shutemov.name> +CC: Alexey Dobriyan <adobriyan@gmail.com> +CC: Al Viro <viro@ZenIV.linux.org.uk> +CC: Andrew Morton <akpm@linux-foundation.org> +--- + fs/proc/base.c | 20 ++++++++++---------- + include/linux/proc_fs.h | 2 +- + 2 files changed, 11 insertions(+), 11 deletions(-) + +Index: linux-2.6.git/fs/proc/base.c +=================================================================== +--- linux-2.6.git.orig/fs/proc/base.c ++++ linux-2.6.git/fs/proc/base.c +@@ -165,9 +165,9 @@ static int get_task_root(struct task_str + return result; + } + +-static int proc_cwd_link(struct inode *inode, struct path *path) ++static int proc_cwd_link(struct dentry *dentry, struct path *path) + { +- struct task_struct *task = get_proc_task(inode); ++ struct task_struct *task = get_proc_task(dentry->d_inode); + int result = -ENOENT; + + if (task) { +@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *i + return result; + } + +-static int proc_root_link(struct inode *inode, struct path *path) ++static int proc_root_link(struct dentry *dentry, struct path *path) + { +- struct task_struct *task = get_proc_task(inode); ++ struct task_struct *task = get_proc_task(dentry->d_inode); + int result = -ENOENT; + + if (task) { +@@ -1580,13 +1580,13 @@ static const struct file_operations proc + .release = single_release, + }; + +-static int proc_exe_link(struct inode *inode, struct path *exe_path) ++static int proc_exe_link(struct dentry *dentry, struct path *exe_path) + { + struct task_struct *task; + struct mm_struct *mm; + struct file *exe_file; + +- task = get_proc_task(inode); ++ task = get_proc_task(dentry->d_inode); + if (!task) + return -ENOENT; + mm = get_task_mm(task); +@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct + if (!proc_fd_access_allowed(inode)) + goto out; + +- error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); ++ error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); + out: + return ERR_PTR(error); + } +@@ -1655,7 +1655,7 @@ 
static int proc_pid_readlink(struct dent + if (!proc_fd_access_allowed(inode)) + goto out; + +- error = PROC_I(inode)->op.proc_get_link(inode, &path); ++ error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + +@@ -1959,9 +1959,9 @@ out_task: + return rc; + } + +-static int proc_fd_link(struct inode *inode, struct path *path) ++static int proc_fd_link(struct dentry *dentry, struct path *path) + { +- return proc_fd_info(inode, path, NULL); ++ return proc_fd_info(dentry->d_inode, path, NULL); + } + + static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) +Index: linux-2.6.git/include/linux/proc_fs.h +=================================================================== +--- linux-2.6.git.orig/include/linux/proc_fs.h ++++ linux-2.6.git/include/linux/proc_fs.h +@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u + extern const struct proc_ns_operations ipcns_operations; + + union proc_op { +- int (*proc_get_link)(struct inode *, struct path *); ++ int (*proc_get_link)(struct dentry *, struct path *); + int (*proc_read)(struct task_struct *task, char *page); + int (*proc_show)(struct seq_file *m, + struct pid_namespace *ns, struct pid *pid, diff --git a/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch b/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch new file mode 100644 index 000000000..70d259330 --- /dev/null +++ b/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch @@ -0,0 +1,28 @@ +From: Vasiliy Kulikov <segooon@gmail.com> + +In the patch "proc: fix races against execve() of /proc/PID/fd**" +proc_pid_fd_link_getattr() leaked task_struct if ptrace check fails. 
+ +Signed-off-by: Vasiliy Kulikov <segoon@openwall.com> +Reported-by: Cyrill Gorcunov <gorcunov@gmail.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + fs/proc/base.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff -puN fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix fs/proc/base.c +--- a/fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix ++++ a/fs/proc/base.c +@@ -1681,9 +1681,9 @@ static int proc_pid_fd_link_getattr(stru + + generic_fillattr(inode, stat); + unlock_trace(task); +- put_task_struct(task); + rc = 0; + out_task: ++ put_task_struct(task); + return rc; + } + +_ diff --git a/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch b/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch new file mode 100644 index 000000000..8c2a4a18a --- /dev/null +++ b/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch @@ -0,0 +1,255 @@ +From: Vasiliy Kulikov <segoon@openwall.com> + +fd* files are restricted to the task's owner, and other users may not get +direct access to them. But one may open any of these files and run any +setuid program, keeping opened file descriptors. As there are permission +checks on open(), but not on readdir() and read(), operations on the kept +file descriptors will not be checked. It makes it possible to violate +the procfs permission model. + +Reading fdinfo/* may disclose current fds' position and flags, reading +directory contents of fdinfo/ and fd/ may disclose the number of opened +files by the target task. This information is not sensitive per se, but it +can reveal some private information (like length of a password stored in a +file) under certain conditions. + +Used existing (un)lock_trace functions to check for ptrace_may_access(), +but instead of using EPERM return code from it use EACCES to be consistent +with existing proc_pid_follow_link()/proc_pid_readlink() return code.
If +they differ, an attacker can guess what fds exist by analyzing stat() return +code. Patched handlers: stat() for fd/*, stat() and read() for fdinfo/*, +readdir() and lookup() for fd/ and fdinfo/. + +Signed-off-by: Vasiliy Kulikov <segoon@openwall.com> +Cc: Cyrill Gorcunov <gorcunov@gmail.com> +Cc: <stable@kernel.org> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + fs/proc/base.c | 146 +++++++++++++++++++++++++++++++++-------------- + 1 file changed, 103 insertions(+), 43 deletions(-) + +diff -puN fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd fs/proc/base.c +--- a/fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd ++++ a/fs/proc/base.c +@@ -1652,12 +1652,46 @@ out: + return error; + } + ++static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct inode *inode = dentry->d_inode; ++ struct task_struct *task = get_proc_task(inode); ++ int rc; ++ ++ if (task == NULL) ++ return -ESRCH; ++ ++ rc = -EACCES; ++ if (lock_trace(task)) ++ goto out_task; ++ ++ generic_fillattr(inode, stat); ++ unlock_trace(task); ++ put_task_struct(task); ++ rc = 0; ++out_task: ++ return rc; ++} ++ + static const struct inode_operations proc_pid_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, + }; + ++static const struct inode_operations proc_fdinfo_link_inode_operations = { ++ .setattr = proc_setattr, ++ .getattr = proc_pid_fd_link_getattr, ++}; ++ ++static const struct inode_operations proc_fd_link_inode_operations = { ++ .readlink = proc_pid_readlink, ++ .follow_link = proc_pid_follow_link, ++ .setattr = proc_setattr, ++ .getattr = proc_pid_fd_link_getattr, ++}; ++ + + /* building an inode */ + +@@ -1889,49 +1923,61 @@ out: + + static int proc_fd_info(struct inode *inode, struct path *path, char *info) + { +- struct task_struct *task = get_proc_task(inode); +- struct files_struct *files = NULL; ++ struct task_struct
*task; ++ struct files_struct *files; + struct file *file; + int fd = proc_fd(inode); ++ int rc; + +- if (task) { +- files = get_files_struct(task); +- put_task_struct(task); +- } +- if (files) { +- /* +- * We are not taking a ref to the file structure, so we must +- * hold ->file_lock. +- */ +- spin_lock(&files->file_lock); +- file = fcheck_files(files, fd); +- if (file) { +- unsigned int f_flags; +- struct fdtable *fdt; +- +- fdt = files_fdtable(files); +- f_flags = file->f_flags & ~O_CLOEXEC; +- if (FD_ISSET(fd, fdt->close_on_exec)) +- f_flags |= O_CLOEXEC; +- +- if (path) { +- *path = file->f_path; +- path_get(&file->f_path); +- } +- if (info) +- snprintf(info, PROC_FDINFO_MAX, +- "pos:\t%lli\n" +- "flags:\t0%o\n", +- (long long) file->f_pos, +- f_flags); +- spin_unlock(&files->file_lock); +- put_files_struct(files); +- return 0; ++ task = get_proc_task(inode); ++ if (!task) ++ return -ENOENT; ++ ++ rc = -EACCES; ++ if (lock_trace(task)) ++ goto out_task; ++ ++ rc = -ENOENT; ++ files = get_files_struct(task); ++ if (files == NULL) ++ goto out_unlock; ++ ++ /* ++ * We are not taking a ref to the file structure, so we must ++ * hold ->file_lock. 
++ */ ++ spin_lock(&files->file_lock); ++ file = fcheck_files(files, fd); ++ if (file) { ++ unsigned int f_flags; ++ struct fdtable *fdt; ++ ++ fdt = files_fdtable(files); ++ f_flags = file->f_flags & ~O_CLOEXEC; ++ if (FD_ISSET(fd, fdt->close_on_exec)) ++ f_flags |= O_CLOEXEC; ++ ++ if (path) { ++ *path = file->f_path; ++ path_get(&file->f_path); + } +- spin_unlock(&files->file_lock); +- put_files_struct(files); +- } +- return -ENOENT; ++ if (info) ++ snprintf(info, PROC_FDINFO_MAX, ++ "pos:\t%lli\n" ++ "flags:\t0%o\n", ++ (long long) file->f_pos, ++ f_flags); ++ rc = 0; ++ } else ++ rc = -ENOENT; ++ spin_unlock(&files->file_lock); ++ put_files_struct(files); ++ ++out_unlock: ++ unlock_trace(task); ++out_task: ++ put_task_struct(task); ++ return rc; + } + + static int proc_fd_link(struct inode *inode, struct path *path) +@@ -2026,7 +2072,7 @@ static struct dentry *proc_fd_instantiat + spin_unlock(&files->file_lock); + put_files_struct(files); + +- inode->i_op = &proc_pid_link_inode_operations; ++ inode->i_op = &proc_fd_link_inode_operations; + inode->i_size = 64; + ei->op.proc_get_link = proc_fd_link; + d_set_d_op(dentry, &tid_fd_dentry_operations); +@@ -2058,7 +2104,12 @@ static struct dentry *proc_lookupfd_comm + if (fd == ~0U) + goto out; + ++ result = ERR_PTR(-EACCES); ++ if (lock_trace(task)) ++ goto out; ++ + result = instantiate(dir, dentry, task, &fd); ++ unlock_trace(task); + out: + put_task_struct(task); + out_no_task: +@@ -2078,23 +2129,28 @@ static int proc_readfd_common(struct fil + retval = -ENOENT; + if (!p) + goto out_no_task; ++ ++ retval = -EACCES; ++ if (lock_trace(p)) ++ goto out; ++ + retval = 0; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) +- goto out; ++ goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) +- goto out; ++ goto out_unlock; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) 
+- goto out; ++ goto out_unlock; + rcu_read_lock(); + for (fd = filp->f_pos-2; + fd < files_fdtable(files)->max_fds; +@@ -2118,6 +2174,9 @@ static int proc_readfd_common(struct fil + rcu_read_unlock(); + put_files_struct(files); + } ++ ++out_unlock: ++ unlock_trace(p); + out: + put_task_struct(p); + out_no_task: +@@ -2195,6 +2254,7 @@ static struct dentry *proc_fdinfo_instan + ei->fd = fd; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; ++ inode->i_op = &proc_fdinfo_link_inode_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ +_ diff --git a/kernel/proc-force-dcache-drop-on-unauthorized-access.patch b/kernel/proc-force-dcache-drop-on-unauthorized-access.patch new file mode 100644 index 000000000..bfe6bf1a8 --- /dev/null +++ b/kernel/proc-force-dcache-drop-on-unauthorized-access.patch @@ -0,0 +1,118 @@ +From: Vasiliy Kulikov <segoon@openwall.com> + +The patch "proc: fix races against execve() of /proc/PID/fd**" is still a +partial fix for a setxid problem. link(2) is yet another way to +identify whether a specific fd is opened by a privileged process. By +calling link(2) against /proc/PID/fd/* an attacker may identify whether +the fd number is valid for PID by analysing link(2) return code. + +Both getattr() and link() can be used by the attacker iff the dentry is +present in the dcache. In this case ->lookup() is not called and the only +way to check ptrace permissions is either operation handler or +->revalidate(). The easiest solution to prevent any unauthorized access +to /proc/PID/fd*/ files is to force the dentry drop on each unauthorized +access attempt. + +If an attacker keeps an open fd of /proc/PID/fd/ and dcache contains a +specific dentry for some /proc/PID/fd/XXX, any future attempt to use the +dentry by the attacker would lead to the dentry drop as a result of a +failed ptrace check in ->revalidate().
Then the attacker cannot spawn a +dentry for the specific fd number because of ptrace check in ->lookup(). + +The dentry drop can be still observed by an attacker by analysing +information from /proc/slabinfo, which is addressed in the successive +patch. + +Signed-off-by: Vasiliy Kulikov <segoon@openwall.com> +Cc: Cyrill Gorcunov <gorcunov@gmail.com> +Cc: Al Viro <viro@zeniv.linux.org.uk> +Cc: Christoph Lameter <cl@linux-foundation.org> +Cc: Pekka Enberg <penberg@kernel.org> +Cc: Matt Mackall <mpm@selenic.com> +Cc: Alexey Dobriyan <adobriyan@gmail.com> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + fs/proc/base.c | 42 ++++++------------------------------------ + 1 file changed, 6 insertions(+), 36 deletions(-) + +Index: linux-2.6.git/fs/proc/base.c +=================================================================== +--- linux-2.6.git.orig/fs/proc/base.c ++++ linux-2.6.git/fs/proc/base.c +@@ -1665,46 +1665,12 @@ out: + return error; + } + +-static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, +- struct kstat *stat) +-{ +- struct inode *inode = dentry->d_inode; +- struct task_struct *task = get_proc_task(inode); +- int rc; +- +- if (task == NULL) +- return -ESRCH; +- +- rc = -EACCES; +- if (lock_trace(task)) +- goto out_task; +- +- generic_fillattr(inode, stat); +- unlock_trace(task); +- rc = 0; +-out_task: +- put_task_struct(task); +- return rc; +-} +- + static const struct inode_operations proc_pid_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, + }; + +-static const struct inode_operations proc_fdinfo_link_inode_operations = { +- .setattr = proc_setattr, +- .getattr = proc_pid_fd_link_getattr, +-}; +- +-static const struct inode_operations proc_fd_link_inode_operations = { +- .readlink = proc_pid_readlink, +- .follow_link = proc_pid_follow_link, +- .setattr = proc_setattr, +- .getattr = proc_pid_fd_link_getattr, +-}; +- + + /* building an 
inode */ + +@@ -2013,6 +1979,11 @@ static int tid_fd_revalidate(struct dent + task = get_proc_task(inode); + fd = proc_fd(inode); + ++ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { ++ put_task_struct(task); ++ task = NULL; ++ } ++ + if (task) { + files = get_files_struct(task); + if (files) { +@@ -2085,7 +2056,7 @@ static struct dentry *proc_fd_instantiat + spin_unlock(&files->file_lock); + put_files_struct(files); + +- inode->i_op = &proc_fd_link_inode_operations; ++ inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + ei->op.proc_get_link = proc_fd_link; + d_set_d_op(dentry, &tid_fd_dentry_operations); +@@ -2267,7 +2238,6 @@ static struct dentry *proc_fdinfo_instan + ei->fd = fd; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; +- inode->i_op = &proc_fdinfo_link_inode_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ diff --git a/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch b/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch new file mode 100644 index 000000000..b65897617 --- /dev/null +++ b/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch @@ -0,0 +1,26 @@ +From: Pavel Emelyanov <xemul@openvz.org> + +On reading sysctl dirs we should return -EISDIR instead of -EINVAL. 
+ +Signed-off-by: Pavel Emelyanov <xemul@openvz.org> +Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> +Cc: Alexey Dobriyan <adobriyan@gmail.com> +Cc: Al Viro <viro@ZenIV.linux.org.uk> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +--- + + fs/proc/proc_sysctl.c | 1 + + 1 file changed, 1 insertion(+) + +diff -puN fs/proc/proc_sysctl.c~procfs-report-eisdir-when-reading-sysctl-dirs-in-proc fs/proc/proc_sysctl.c +--- a/fs/proc/proc_sysctl.c~procfs-report-eisdir-when-reading-sysctl-dirs-in-proc ++++ a/fs/proc/proc_sysctl.c +@@ -360,6 +360,7 @@ static const struct file_operations proc + }; + + static const struct file_operations proc_sys_dir_file_operations = { ++ .read = generic_read_dir, + .readdir = proc_sys_readdir, + .llseek = generic_file_llseek, + }; +_ diff --git a/kernel/readme b/kernel/readme new file mode 100644 index 000000000..cfc32d32a --- /dev/null +++ b/kernel/readme @@ -0,0 +1,5 @@ +The kernel patch series. See the "series" file for the +order of application. Not all patches address C/R directly, +but some of them are needed due to dependencies. + +Tested on Linux 3.1-rc3. diff --git a/kernel/series b/kernel/series new file mode 100644 index 000000000..a41e8c2dd --- /dev/null +++ b/kernel/series @@ -0,0 +1,12 @@ +cr-proc-add-children +procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch +proc-fix-races-against-execve-of-proc-pid-fd.patch +proc-fix-races-against-execve-of-proc-pid-fd-fix.patch +proc-force-dcache-drop-on-unauthorized-access.patch +cr-statfs-callback-for-pipefs +cr-clone-with-pid-support +fs-proc-switch-to-dentry +cr-proc-map-files-21 +fs-proc-add-tls +fs-add-do-close +binfmt-elf-for-cr-4 diff --git a/parasite-elf.lds.S b/parasite-elf.lds.S new file mode 100644 index 000000000..83e0b40dd --- /dev/null +++ b/parasite-elf.lds.S @@ -0,0 +1,19 @@ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) + +SECTIONS +{ + . = 0; + .text : { + *(.parasite.head.text) + *(.text) + .
= ALIGN(8); + } + .data : { + *(.data) + *(.rodata) + *(.bss) + *(.parasite.stack) + . = ALIGN(8); + } +} diff --git a/parasite-syscall.c b/parasite-syscall.c new file mode 100644 index 000000000..3752a404d --- /dev/null +++ b/parasite-syscall.c @@ -0,0 +1,514 @@ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> + +#include <sys/ptrace.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/user.h> +#include <sys/wait.h> + +#include "compiler.h" +#include "syscall.h" +#include "types.h" +#include "util.h" + +#include "parasite-syscall.h" +#include "parasite-blob.h" +#include "parasite.h" + +#ifdef CONFIG_X86_64 +/* x86-64 "syscall" opcode (0f 05) padded with int3 (cc) bytes */ +static const char code_syscall[] = {0x0f, 0x05, 0xcc, 0xcc, + 0xcc, 0xcc, 0xcc, 0xcc}; + +#define code_syscall_size (round_up(sizeof(code_syscall), sizeof(long))) +#define parasite_size (round_up(sizeof(parasite_blob), sizeof(long))) + +static int syscall_fits_vma_area(struct vma_area *vma_area) +{ + return can_run_syscall((unsigned long)vma_area->vma.start, + (unsigned long)vma_area->vma.start, + (unsigned long)vma_area->vma.end); +} + +/* True when @ip lies in [@start, @end) with room left for the syscall stub */ +int can_run_syscall(unsigned long ip, unsigned long start, unsigned long end) +{ + return ip >= start && ip < (end - code_syscall_size); +} + +/* + * Execute mmap() inside the seized task @pid via syscall_seized(); + * the arguments are passed in the x86-64 syscall registers. + */ +void *mmap_seized(pid_t pid, user_regs_struct_t *regs, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + user_regs_struct_t params = *regs; + void *mmaped = NULL; + int ret; + + params.ax = (unsigned long)__NR_mmap; /* mmap */ + params.di = (unsigned long)addr; /* @addr */ + params.si = (unsigned long)length; /* @length */ + params.dx = (unsigned long)prot; /* @prot */ + params.r10 = (unsigned long)flags; /* @flags */ + params.r8 = (unsigned long)fd; /* @fd */ + params.r9 = (unsigned long)offset; /* @offset */ + + ret = syscall_seized(pid, regs, &params, &params); + if (ret) + goto err; + mmaped = (void *)params.ax; +
+ /* error code from the kernel space */ + if ((long)mmaped < 0) + mmaped = NULL; +err: + return mmaped; +} + +/* Execute munmap() inside the seized task @pid; returns 0 on success */ +int munmap_seized(pid_t pid, user_regs_struct_t *regs, + void *addr, size_t length) +{ + user_regs_struct_t params = *regs; + int ret; + + params.ax = (unsigned long)__NR_munmap; /* munmap */ + params.di = (unsigned long)addr; /* @addr */ + params.si = (unsigned long)length; /* @length */ + + ret = syscall_seized(pid, regs, &params, &params); + if (!ret) + ret = (int)params.ax; + + return ret; +} + +/* Execute exit(-1) inside the seized task @pid */ +int kill_seized(pid_t pid, user_regs_struct_t *where) +{ + user_regs_struct_t params = *where; + int ret; + + params.ax = (unsigned long)__NR_exit; /* exit */ + params.di = (unsigned long)-1; /* @error-code */ + + ret = syscall_seized(pid, where, &params, &params); + + return ret; +} + +/* + * Run one syscall inside the seized task @pid: the code at @where->ip + * is temporarily replaced with the syscall stub, the registers are + * loaded from @params and the resulting registers are stored in @result. + */ +int syscall_seized(pid_t pid, + user_regs_struct_t *where, + user_regs_struct_t *params, + user_regs_struct_t *result) +{ + user_regs_struct_t regs_orig, regs; + unsigned long start_ip; + char saved[sizeof(code_syscall)]; + siginfo_t siginfo; + int status; + int ret = -1; + + BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); + + start_ip = (unsigned long)where->ip; + + jerr(ptrace_peek_area(pid, (void *)saved, (void *)start_ip, code_syscall_size), err); + jerr(ptrace_poke_area(pid, (void *)code_syscall, (void *)start_ip, code_syscall_size), err); + +again: + /* + * The syscall stub is already written at start_ip here, so a + * failure must put the saved bytes back. + */ + jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_restore); + regs_orig = regs; + + regs.ip = start_ip; + regs.ax = params->ax; + regs.di = params->di; + regs.si = params->si; + regs.dx = params->dx; + regs.r10 = params->r10; + regs.r8 = params->r8; + regs.r9 = params->r9; + regs.orig_ax = -1; /* avoid end-of-syscall processing */ + + jerr(ptrace(PTRACE_SETREGS, pid, NULL, &regs), err_restore); + + /* + * Most ideas are taken from Tejun Heo's parasite thread + * https://code.google.com/p/ptrace-parasite/ + */ + + /* + * Run the parasite code, at the completion it'll trigger + * int3 and inform us
that all is done. + */ + + jerr(ptrace(PTRACE_CONT, pid, NULL, NULL), err_restore_full); + jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full); + jerr(!WIFSTOPPED(status), err_restore_full); + jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo),err_restore_full); + + jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_restore_full); + + if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != SI_KERNEL) { +retry_signal: + /* pr_debug("** delivering signal %d si_code=%d\n", + siginfo.si_signo, siginfo.si_code); */ + /* FIXME: jerr(siginfo.si_code > 0, err_restore_full); */ + jerr(ptrace(PTRACE_SETREGS, pid, NULL, (void *)&regs_orig), err_restore_full); + jerr(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), err_restore_full); + jerr(ptrace(PTRACE_CONT, pid, NULL, (void *)(unsigned long)siginfo.si_signo), err_restore_full); + + jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full); + jerr(!WIFSTOPPED(status), err_restore_full); + jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo), err_restore_full); + + if (siginfo.si_code >> 8 != PTRACE_EVENT_STOP) + goto retry_signal; + + goto again; + } + + /* + * Our code is done. ret stays -1 until the final registers are + * fetched, so any failure below is reported to the caller. + */ + jerr(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), err_restore_full); + jerr(ptrace(PTRACE_CONT, pid, NULL, NULL), err_restore_full); + + jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full); + jerr(!WIFSTOPPED(status), err_restore_full); + jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo), err_restore_full); + + jerr((siginfo.si_code >> 8 != PTRACE_EVENT_STOP), err_restore_full); + + jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_restore_full); + + ret = 0; + *result = regs; + +err_restore_full: + if (ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig)) + pr_panic("Can't restore registers (pid: %d)\n", pid); + +err_restore: + if (ptrace_poke_area(pid, (void *)saved, (void *)start_ip, code_syscall_size)) + pr_panic("Crap...
Can't restore data (pid: %d)\n", pid); +err: + return ret; +} + +/* + * Find the VMA that contains @ip, is executable and has room for + * the syscall stub; NULL if there is none. + */ +static struct vma_area *get_vma_by_ip(struct list_head *vma_area_list, unsigned long ip) +{ + struct vma_area *vma_area; + + list_for_each_entry(vma_area, vma_area_list, list) { + if (in_vma_area(vma_area, ip)) { + if (vma_area->vma.prot & PROT_EXEC) { + if (syscall_fits_vma_area(vma_area)) + return vma_area; + } + } + } + + return NULL; +} + +int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_area_list, + struct cr_fdset *cr_fdset, int fd_type) +{ + parasite_args_cmd_dumppages_t parasite_dumppages = { }; + parasite_args_t parasite_arg = { }; + + user_regs_struct_t regs, regs_orig; + unsigned long nrpages_dumped = 0; + struct vma_area *vma_area; + siginfo_t siginfo; + int status, path_len, ret = -1; + + pr_info("\n"); + pr_info("Dumping pages (type: %d pid: %d)\n", fd_type, ctl->pid); + pr_info("----------------------------------------\n"); + + path_len = strlen(cr_fdset->desc[fd_type].name) + 1; + + if (path_len > sizeof(parasite_dumppages.open_path)) { + /* sizeof yields size_t, hence %zu */ + pr_panic("Dumping pages path is too long (%d while %zu allowed)\n", + path_len, sizeof(parasite_dumppages.open_path)); + goto err; + } + + jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs_orig), err); + + parasite_arg.command = PARASITE_CMD_DUMPPAGES; + parasite_arg.args_size = sizeof(parasite_dumppages); + parasite_arg.args = &parasite_dumppages; + + strncpy(parasite_dumppages.open_path, cr_fdset->desc[fd_type].name, + sizeof(parasite_dumppages.open_path)); + parasite_dumppages.open_flags = O_WRONLY; + parasite_dumppages.open_mode = CR_FD_PERM; + parasite_dumppages.fd = -1UL; + + /* + * Pass the command first, it's immutable. + */ + jerr(ptrace_poke_area((long)ctl->pid, (void *)&parasite_arg.command, + (void *)ctl->addr_cmd, sizeof(parasite_arg.command)), + err_restore); + + list_for_each_entry(vma_area, vma_area_list, list) { + + /* + * The special areas are not dumped.
+ */ + if (!(vma_area->vma.status & VMA_AREA_REGULAR)) + continue; + + /* No dumps for file-shared mappings */ + if (vma_area->vma.status & VMA_FILE_SHARED) + continue; + + pr_info_vma(vma_area); + +again: + jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs), err_restore); + regs.ip = ctl->parasite_ip; + jerr(ptrace(PTRACE_SETREGS, ctl->pid, NULL, &regs), err_restore); + + parasite_dumppages.vma_entry = vma_area->vma; + + if (ptrace_poke_area((long)ctl->pid, (void *)parasite_arg.args, + (void *)ctl->addr_args, parasite_arg.args_size)) { + pr_error("Can't setup parasite arguments (pid: %d)\n", ctl->pid); + goto err_restore; + } + + jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, NULL), err_restore); + jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore); + jerr(!WIFSTOPPED(status), err_restore); + jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore); + + if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != SI_KERNEL) { +retry_signal: + /* pr_debug("** delivering signal %d si_code=%d\n", + siginfo.si_signo, siginfo.si_code); */ + /* FIXME: jerr(siginfo.si_code > 0, err_restore_full); */ + jerr(ptrace(PTRACE_SETREGS, (long)ctl->pid, NULL, (void *)&regs_orig), err_restore); + jerr(ptrace(PTRACE_INTERRUPT, (long)ctl->pid, NULL, NULL), err_restore); + jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, (void *)(unsigned long)siginfo.si_signo), err_restore); + + jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore); + jerr(!WIFSTOPPED(status), err_restore); + jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore); + + if (siginfo.si_code >> 8 != PTRACE_EVENT_STOP) + goto retry_signal; + + goto again; + } + + /* + * It's a bit tricky: the file gets opened inside the + * parasite but is closed via an explicit syscall. It would + * be better to add some 'status' and close it inside the + * parasite on the last call.
+ */ + if (parasite_dumppages.fd == -1UL) { + if (ptrace_peek_area((long)ctl->pid, + (void *)&parasite_dumppages.fd, + (void *)(ctl->addr_args + + offsetof(parasite_args_cmd_dumppages_t, fd)), + sizeof(parasite_dumppages.fd))) { + pr_error("Can't get file descriptor back (pid: %d)\n", ctl->pid); + goto err_restore; + } + } + + /* + * Get some statistics. The size must be that of the field + * being read, not of the fd field. + */ + if (ptrace_peek_area((long)ctl->pid, + (void *)&parasite_dumppages.nrpages_dumped, + (void *)(ctl->addr_args + + offsetof(parasite_args_cmd_dumppages_t, nrpages_dumped)), + sizeof(parasite_dumppages.nrpages_dumped))) { + pr_error("Can't get statistics (pid: %d)\n", ctl->pid); + goto err_restore; + } + pr_info(" (dumped: %16li pages)\n", parasite_dumppages.nrpages_dumped); + nrpages_dumped += parasite_dumppages.nrpages_dumped; + } + + /* + * Our code is done. + */ + jerr(ptrace(PTRACE_INTERRUPT, (long)ctl->pid, NULL, NULL), err_restore); + jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, NULL), err_restore); + + jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore); + jerr(!WIFSTOPPED(status), err_restore); + jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore); + + jerr((siginfo.si_code >> 8 != PTRACE_EVENT_STOP), err_restore); + + jerr(ptrace(PTRACE_GETREGS, (long)ctl->pid, NULL, &regs), err_restore); + + ret = 0; + + /* Finally close the descriptor the parasite has opened */ + if (parasite_dumppages.fd != -1UL) { + regs = regs_orig; + regs.ax = __NR_close; /* close */ + regs.di = parasite_dumppages.fd; /* @fd */ + ret = syscall_seized(ctl->pid, &regs_orig, &regs, &regs); + } + + /* + * We don't know the position in file since it's updated + * outside of our process.
+ */ + lseek(cr_fdset->desc[CR_FD_PAGES].fd, 0, SEEK_END); + + /* Ending page */ + write_ptr_safe(cr_fdset->desc[CR_FD_PAGES].fd, &zero_page_entry, err_restore); + + pr_info("\n"); + pr_info("Summary: %16li pages dumped\n", nrpages_dumped); + +err_restore: + if (ptrace(PTRACE_SETREGS, (long)ctl->pid, NULL, &regs_orig)) + pr_panic("Can't restore registers (pid: %d)\n", ctl->pid); + +err: + pr_info("----------------------------------------\n"); + + return ret; +} + +/* + * Unmap the parasite area from the task and free the control block. + * A NULL @p_ctl or *@p_ctl is a no-op that returns 0. + */ +int parasite_cure_seized(struct parasite_ctl **p_ctl, + struct list_head *vma_area_list) +{ + user_regs_struct_t regs, regs_orig; + struct parasite_ctl *ctl; + struct vma_area *vma_area; + int ret = -1; + + if (!p_ctl || !*p_ctl) + return 0; + + ctl = *p_ctl; + + jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs), err); + + regs_orig = regs; + + vma_area = get_vma_by_ip(vma_area_list, regs.ip); + if (!vma_area) { + pr_error("No suitable VMA found to run cure (pid: %d)\n", ctl->pid); + goto err; + } + + regs.ip = vma_area->vma.start; + + ret = munmap_seized(ctl->pid, &regs, + (void *)ctl->vma_area->vma.start, + (size_t)vma_entry_len(&ctl->vma_area->vma)); + if (ret) + pr_error("munmap_seized failed (pid: %d)\n", ctl->pid); + + if (ptrace(PTRACE_SETREGS, ctl->pid, NULL, &regs_orig)) { + ret = -1; + pr_panic("PTRACE_SETREGS failed (pid: %d)\n", ctl->pid); + } + + free(*p_ctl), *p_ctl = NULL; +err: + return ret; +} + +/* + * Map an anonymous executable area inside the task @pid, copy the + * parasite blob into it and return a control block describing it, + * or NULL on failure. + */ +struct parasite_ctl *parasite_infect_seized(pid_t pid, void *addr_hint, struct list_head *vma_area_list) +{ + user_regs_struct_t regs, regs_orig; + struct parasite_ctl *ctl = NULL; + struct vma_area *vma_area; + void *mmaped; + + ctl = xzalloc(sizeof(*ctl) + sizeof(*vma_area)); + if (!ctl) { + pr_error("Parasite control block allocation failed (pid: %d)\n", pid); + goto err; + } + + /* Setup control block */ + ctl->pid = pid; + /* + * The vma_area storage sits right behind the control block in the + * same allocation, i.e. at &ctl[1]; indexing by sizeof(*ctl) would + * point far outside of it. + */ + ctl->vma_area = (struct vma_area *)(char *)&ctl[1]; + + if (ptrace(PTRACE_GETREGS, pid, NULL, &regs)) + pr_error_jmp(err_free); + + vma_area = get_vma_by_ip(vma_area_list,
regs.ip); + if (!vma_area) { + pr_error("No suitable VMA found to run parasite " + "bootstrap code (pid: %d)\n", pid); + goto err_free; + } + + regs_orig = regs; + + /* + * Prepare for in-process syscall. + */ + ctl->vma_area->vma.prot = PROT_READ | PROT_WRITE | PROT_EXEC; + ctl->vma_area->vma.flags = MAP_PRIVATE | MAP_ANONYMOUS; + + regs.ip = vma_area->vma.start; + + mmaped = mmap_seized(pid, &regs, addr_hint, (size_t)parasite_size, + (int)ctl->vma_area->vma.prot, + (int)ctl->vma_area->vma.flags, + (int)-1, (off_t)0); + + if (!mmaped || (long)mmaped < 0) { + pr_error("Can't allocate memory for parasite blob (pid: %d)\n", pid); + goto err_restore_regs; + } + + ctl->parasite_ip = PARASITE_HEAD_ADDR((unsigned long)mmaped); + ctl->parasite_complete_ip = PARASITE_COMPLETE_ADDR((unsigned long)mmaped); + ctl->addr_cmd = PARASITE_CMD_ADDR((unsigned long)mmaped); + ctl->addr_args = PARASITE_ARGS_ADDR((unsigned long)mmaped); + + ctl->vma_area->vma.start= (u64)mmaped; + ctl->vma_area->vma.end = (u64)(mmaped + parasite_size); + + if (ptrace_poke_area(pid, parasite_blob, mmaped, parasite_size)) { + pr_error("Can't inject parasite blob (pid: %d)\n", pid); + goto err_munmap_restore; + } + + jerr(ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig), err_munmap_restore); + + return ctl; + +err_munmap_restore: + /* Undo the mapping created above before giving up */ + regs = regs_orig, regs.ip = vma_area->vma.start; + if (munmap_seized(pid, &regs, mmaped, parasite_size)) + pr_panic("munmap_seized failed (pid: %d)\n", pid); +err_restore_regs: + if (ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig)) + pr_panic("PTRACE_SETREGS failed (pid: %d)\n", pid); +err_free: + if (ctl) + free(ctl); +err: + return NULL; +} + +#else /* CONFIG_X86_64 */ +# error x86-32 is not yet implemented +#endif /* CONFIG_X86_64 */ diff --git a/parasite.c b/parasite.c new file mode 100644 index 000000000..fca9cfd62 --- /dev/null +++ b/parasite.c @@ -0,0 +1,339 @@ +#include <stdio.h> +#include <stdlib.h> + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include
<fcntl.h> +#include <unistd.h> + +#include "compiler.h" +#include "types.h" +#include "syscall.h" +#include "parasite.h" +#include "image.h" +#include "crtools.h" + +#ifdef CONFIG_X86_64 + +static void *brk_start, *brk_end, *brk_tail; + +static struct page_entry page; +static struct vma_entry vma; + +void *memcpy(void *dest, const void *src, size_t n) +{ + long d0, d1, d2; + asm volatile( + "rep ; movsq\n\t" + "movq %4,%%rcx\n\t" + "rep ; movsb\n\t" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) + : "memory"); + + return dest; +} + +static void brk_init(void *brk) +{ + brk_start = brk_tail = brk; + brk_end = brk_start + PARASITE_BRK_SIZE; +} + +static void *brk_alloc(unsigned long bytes) +{ + void *addr = NULL; + if (brk_end > (brk_tail + bytes)) { + addr = brk_tail; + brk_tail+= bytes; + } + return addr; +} + +/* Give the last @bytes back, never moving the tail below the arena start */ +static void brk_free(unsigned long bytes) +{ + if (brk_start <= (brk_tail - bytes)) + brk_tail -= bytes; +} + +static unsigned long builtin_strlen(char *str) +{ + unsigned long len = 0; + while (*str++) + len++; + return len; +} + +static const unsigned char hex[] = "0123456789abcdef"; +static char *long2hex(unsigned long v) +{ + static char buf[32]; + char *p = buf; + int i; + + for (i = sizeof(long) - 1; i >= 0; i--) { + *p++ = hex[ ((((unsigned char *)&v)[i]) & 0xf0) >> 4 ]; + *p++ = hex[ ((((unsigned char *)&v)[i]) & 0x0f) >> 0 ]; + } + *p = 0; + + return buf; +} + +static void sys_write_msg(const char *msg) +{ + int size = 0; + while (msg[size]) + size++; + sys_write(1, msg, size); +} + +/* + * Map the VMAs and copy the pages recorded in @corefile; both lists + * are terminated by an all-zero entry. Returns 0 or a PARASITE_ERR_* code. + */ +static int restore_core(char *corefile) +{ + int ret = PARASITE_ERR_FAIL; + int fd_core; + + fd_core = (int)sys_open(corefile, O_RDONLY, 0600); + if (fd_core < 0) { + ret = PARASITE_ERR_OPEN; + goto err_open; + } + + /* Skip the header */ + sys_lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET); + + /* First VMA areas */ + while (1) { + unsigned long addr; + + /* a short read (EOF included) means the image is truncated */ + ret = sys_read(fd_core, &vma, sizeof(vma)); + if (ret
!= sizeof(vma)) { + ret = PARASITE_ERR_CORE_VMA; + goto err; + } + + if (vma.start == 0 && vma.end == 0) + break; + + /* Make sure it's mapped into proper place */ + addr = sys_mmap((void *)vma.start, + vma.end - vma.start, + vma.prot, + vma.flags | MAP_FIXED, + vma.fd, + vma.pgoff); + if (addr != vma.start) { + ret = PARASITE_ERR_MMAP; + goto err; + } + } + + /* Now pages */ + while (1) { + unsigned long count; + + ret = sys_read(fd_core, &page.va, sizeof(page.va)); + if (ret != sizeof(page.va)) { + ret = PARASITE_ERR_CORE_PAGE; + goto err; + } + + if (page.va == 0) + break; + + ret = sys_read(fd_core, page.data, sizeof(page.data)); + if (ret != sizeof(page.data)) { + ret = PARASITE_ERR_CORE_PAGE; + goto err; + } + + memcpy((void *)page.va, page.data, sizeof(page.data)); + } + + ret = 0; + +err: + sys_close(fd_core); + +err_open: + return ret; +} + +static int dump_pages(parasite_args_cmd_dumppages_t *args) +{ + int ret = PARASITE_ERR_FAIL; + unsigned long nrpages, pfn, length; + unsigned long prot_old, prot_new; + unsigned char *map_brk = NULL; + unsigned char *map; + bool dump_all = false; + + args->nrpages_dumped = 0; + prot_old = prot_new = 0; + + if (args->fd == -1UL) { + args->fd = sys_open(args->open_path, args->open_flags, args->open_mode); + if (args->fd < 0) { + sys_write_msg("sys_open failed\n"); + ret = PARASITE_ERR_OPEN; + goto err; + } + } + + /* Start from the end of file */ + sys_lseek(args->fd, 0, SEEK_END); + + length = args->vma_entry.end - args->vma_entry.start; + nrpages = length / PAGE_SIZE; + + /* + * brk should allow us to handle up to 128M of memory, + * otherwise call for mmap.
+ */ + map = brk_alloc(nrpages); + if (map) { + map_brk = map; + } else { + map = (void *)sys_mmap(NULL, nrpages, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if ((long)map < 0) { + sys_write_msg("sys_mmap failed\n"); + ret = PARASITE_ERR_MMAP; + goto err; + } + } + + dump_all = !!(args->vma_entry.status & VMA_DUMP_ALL); + + /* + * Try to change page protection if needed so we would + * be able to dump contents. + */ + if (!(args->vma_entry.prot & PROT_READ)) { + prot_old = (unsigned long)args->vma_entry.prot; + prot_new = prot_old | PROT_READ; + if (sys_mprotect((unsigned long)args->vma_entry.start, + (unsigned long)vma_entry_len(&args->vma_entry), + prot_new)) { + sys_write_msg("sys_mprotect failed\n"); + ret = PARASITE_ERR_MPROTECT; + goto err_free; + } + } + + /* + * Dumping the whole VMA range is not a common operation + * so stick for mincore as a basis. + */ + + if (sys_mincore((unsigned long)args->vma_entry.start, length, map)) { + sys_write_msg("sys_mincore failed\n"); + ret = PARASITE_ERR_MINCORE; + goto err_free; + } + + ret = 0; + for (pfn = 0; pfn < nrpages; pfn++) { + unsigned long vaddr, written; + + if ((map[pfn] & PAGE_RSS) || dump_all) { + /* + * That's the optimized write of + * page_entry structure, see image.h + */ + vaddr = (unsigned long)args->vma_entry.start + pfn * PAGE_SIZE; + written = 0; + + written += sys_write(args->fd, &vaddr, sizeof(vaddr)); + written += sys_write(args->fd, (void *)vaddr, PAGE_SIZE); + if (written != sizeof(vaddr) + PAGE_SIZE) { + ret = PARASITE_ERR_WRITE; + sys_write_msg("sys_write on page failed\n"); + goto err_free; + } + + args->nrpages_dumped++; + } + } + + /* + * Don't left pages readable if they were not. + */ + if (prot_old != prot_new) { + if (sys_mprotect((unsigned long)args->vma_entry.start, + (unsigned long)vma_entry_len(&args->vma_entry), + prot_old)) { + sys_write_msg("PANIC: Ouch! 
sys_mprotect failed on resore\n"); + ret = PARASITE_ERR_MPROTECT; + goto err_free; + } + } + +err_free: + if (map_brk) + brk_free(nrpages); + else + sys_munmap(map, nrpages); +err: + return ret; +} + +static int __used parasite_service(unsigned long cmd, void *args, void *brk) +{ + brk_init(brk); + + switch (cmd) { + case PARASITE_CMD_KILLME: + sys_close(0); + break; + case PARASITE_CMD_PINGME: + break; + case PARASITE_CMD_DUMPPAGES: + return dump_pages((parasite_args_cmd_dumppages_t *)args); + break; + case PARASITE_CMD_RESTORECORE: + return restore_core((char *)args); + break; + default: + sys_write_msg("Unknown command to parasite\n"); + break; + } + + return 0; +} + +static void __parasite_head __used parasite_head(void) +{ + /* + * The linker will handle the stack allocation. + */ + asm volatile("parasite_head_start: \n\t" + "leaq parasite_stack(%rip), %rsp \n\t" + "pushq $0 \n\t" + "movq %rsp, %rbp \n\t" + "movl parasite_cmd(%rip), %edi \n\t" + "leaq parasite_args(%rip), %rsi \n\t" + "leaq parasite_brk(%rip), %rdx \n\t" + "call parasite_service \n\t" + "parasite_service_complete: \n\t" + "int $0x03 \n\t" + ".align 8 \n\t" + "parasite_cmd: \n\t" + ".long 0 \n\t" + "parasite_args: \n\t" + ".long 0 \n\t" + ".skip "__stringify(PARASITE_ARG_SIZE)",0 \n\t" + ".skip "__stringify(PARASITE_STACK_SIZE)", 0 \n\t" + "parasite_stack: \n\t" + ".long 0 \n\t" + "parasite_brk: \n\t" + ".skip "__stringify(PARASITE_BRK_SIZE)", 0 \n\t" + ".long 0 \n\t"); +} + +#else /* CONFIG_X86_64 */ +# error x86-32 bit mode not yet implemented +#endif /* CONFIG_X86_64 */ diff --git a/parasite.lds.S b/parasite.lds.S new file mode 100644 index 000000000..0f3aa327c --- /dev/null +++ b/parasite.lds.S @@ -0,0 +1,19 @@ +OUTPUT_FORMAT("binary") +OUTPUT_ARCH(i386:x86-64) + +SECTIONS +{ + . = 0; + .text : { + *(.parasite.head.text) + *(.text) + . = ALIGN(8); + } + .data : { + *(.data) + *(.rodata) + *(.bss) + *(.parasite.stack) + . 
= ALIGN(8); + } +} diff --git a/rbtree.c b/rbtree.c new file mode 100644 index 000000000..bfaf22674 --- /dev/null +++ b/rbtree.c @@ -0,0 +1,322 @@ +/* + * RBtree implementation adopted from the Linux + * kernel sources. + */ + +#include "rbtree.h" + +static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *right = node->rb_right; + struct rb_node *parent = rb_parent(node); + + if ((node->rb_right = right->rb_left)) + rb_set_parent(right->rb_left, node); + right->rb_left = node; + + rb_set_parent(right, parent); + + if (parent) { + if (node == parent->rb_left) + parent->rb_left = right; + else + parent->rb_right = right; + } else + root->rb_node = right; + rb_set_parent(node, right); +} + +static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *left = node->rb_left; + struct rb_node *parent = rb_parent(node); + + if ((node->rb_left = left->rb_right)) + rb_set_parent(left->rb_right, node); + left->rb_right = node; + + rb_set_parent(left, parent); + + if (parent) { + if (node == parent->rb_right) + parent->rb_right = left; + else + parent->rb_left = left; + } else + root->rb_node = left; + rb_set_parent(node, left); +} + +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *parent, *gparent; + + while ((parent = rb_parent(node)) && rb_is_red(parent)) { + gparent = rb_parent(parent); + + if (parent == gparent->rb_left) { + { + register struct rb_node *uncle = gparent->rb_right; + if (uncle && rb_is_red(uncle)) { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_right == node) { + register struct rb_node *tmp; + __rb_rotate_left(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_right(gparent, root); + } else { + { + register struct rb_node *uncle = gparent->rb_left; + if (uncle && rb_is_red(uncle)) { + 
rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_left == node) { + register struct rb_node *tmp; + __rb_rotate_right(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_left(gparent, root); + } + } + + rb_set_black(root->rb_node); +} + +static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, + struct rb_root *root) +{ + struct rb_node *other; + + while ((!node || rb_is_black(node)) && node != root->rb_node) { + if (parent->rb_left == node) { + other = parent->rb_right; + if (rb_is_red(other)) { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_left(parent, root); + other = parent->rb_right; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } else { + if (!other->rb_right || rb_is_black(other->rb_right)) { + rb_set_black(other->rb_left); + rb_set_red(other); + __rb_rotate_right(other, root); + other = parent->rb_right; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_right); + __rb_rotate_left(parent, root); + node = root->rb_node; + break; + } + } else { + other = parent->rb_left; + if (rb_is_red(other)) { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_right(parent, root); + other = parent->rb_left; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } else { + if (!other->rb_left || rb_is_black(other->rb_left)) { + rb_set_black(other->rb_right); + rb_set_red(other); + __rb_rotate_left(other, root); + other = parent->rb_left; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_left); + __rb_rotate_right(parent, root); + node 
= root->rb_node; + break; + } + } + } + if (node) + rb_set_black(node); +} + +void rb_erase(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *child, *parent; + int color; + + if (!node->rb_left) + child = node->rb_right; + else if (!node->rb_right) + child = node->rb_left; + else { + struct rb_node *old = node, *left; + + node = node->rb_right; + while ((left = node->rb_left) != NULL) + node = left; + + if (rb_parent(old)) { + if (rb_parent(old)->rb_left == old) + rb_parent(old)->rb_left = node; + else + rb_parent(old)->rb_right = node; + } else + root->rb_node = node; + + child = node->rb_right; + parent = rb_parent(node); + color = rb_color(node); + + if (parent == old) { + parent = node; + } else { + if (child) + rb_set_parent(child, parent); + parent->rb_left = child; + + node->rb_right = old->rb_right; + rb_set_parent(old->rb_right, node); + } + + node->rb_parent_color = old->rb_parent_color; + node->rb_left = old->rb_left; + rb_set_parent(old->rb_left, node); + + goto color; + } + + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + if (parent) { + if (parent->rb_left == node) + parent->rb_left = child; + else + parent->rb_right = child; + } else + root->rb_node = child; + + color: + if (color == RB_BLACK) + __rb_erase_color(child, parent, root); +} + +struct rb_node *rb_first(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_left) + n = n->rb_left; + return n; +} + +struct rb_node *rb_last(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_right) + n = n->rb_right; + return n; +} + +struct rb_node *rb_next(const struct rb_node *node) +{ + struct rb_node *parent; + + if (rb_parent(node) == node) + return NULL; + + if (node->rb_right) { + node = node->rb_right; + while (node->rb_left) + node=node->rb_left; + return (struct rb_node *)node; + } + + while ((parent = 
rb_parent(node)) && node == parent->rb_right) + node = parent; + + return parent; +} + +struct rb_node *rb_prev(const struct rb_node *node) +{ + struct rb_node *parent; + + if (rb_parent(node) == node) + return NULL; + + if (node->rb_left) { + node = node->rb_left; + while (node->rb_right) + node=node->rb_right; + return (struct rb_node *)node; + } + + while ((parent = rb_parent(node)) && node == parent->rb_left) + node = parent; + + return parent; +} + +void rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); + + if (parent) { + if (victim == parent->rb_left) + parent->rb_left = new; + else + parent->rb_right = new; + } else { + root->rb_node = new; + } + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + + *new = *victim; +} diff --git a/testee-static.c b/testee-static.c new file mode 100644 index 000000000..39b764d9b --- /dev/null +++ b/testee-static.c @@ -0,0 +1,112 @@ +/* + * A simple testee program + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <string.h> + +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> + +#include <sched.h> + +int main(int argc, char *argv[]) +{ +// int pipefd[2]; + int fd_shared, fd_private; + const char data_mark[] = "This is a data_mark marker"; + void *mmap_shared, *mmap_private, *mmap_anon, *map_unreadable; + void *mmap_anon_shared; + const char sep[] = "----------"; + unsigned long buf; + int i; + + (void)data_mark; + + printf("%s pid %d\n", argv[0], getpid()); + +// if (pipe(pipefd)) { +// perror("Can't create pipe"); +// goto err; +// } + + fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd_shared < 0) { + perror("Can't open fd_shared file"); + goto err; + } + + fd_private = open("testee-private.img", O_RDWR | O_CREAT | O_TRUNC, 
0600); + if (fd_private < 0) { + perror("Can't open fd_private file"); + goto err; + } + + if (lseek(fd_shared, 1024, SEEK_SET) == -1 || + lseek(fd_private, 1024, SEEK_SET) == -1) { + perror("Can't llsek"); + goto err; + } + + write(fd_shared, "", 1); + write(fd_private, "", 1); + + mmap_shared = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0); + mmap_private = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0); + mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + map_unreadable = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + mmap_anon_shared= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + + if (mmap_shared == MAP_FAILED || + mmap_private == MAP_FAILED || + mmap_anon_shared == MAP_FAILED || + mmap_anon == MAP_FAILED || + map_unreadable == MAP_FAILED) { + + perror("mmap failed"); + goto err; + } + + strcpy((char *)mmap_shared, sep); + strcpy((char *)mmap_private, sep); + strcpy((char *)mmap_anon, sep); + strcpy((char *)map_unreadable, sep); + strcpy((char *)mmap_anon_shared,sep); + + for (i = 64; i < 128; i++) { + ((char *)mmap_shared)[i] = 0 + i; + ((char *)mmap_private)[i] = 64 + i; + ((char *)mmap_anon)[i] = 128 + i; + ((char *)map_unreadable)[i] = 190 + i; + ((char *)mmap_anon_shared)[i] = 0 + i; + } + + if (mprotect(map_unreadable, 1024, PROT_NONE)) { + perror("mprotect failed"); + goto err; + } + + asm volatile("" ::: "memory"); + + fsync(fd_shared); + fsync(fd_private); + + sync(); + asm volatile("" ::: "memory"); + + while (1) { + printf("ping: %d\n", getpid()); +// write(pipefd[1], &buf, sizeof(buf)); + sleep(6); + } + +err: + /* resources are released by kernel */ + return 0; +} diff --git a/testee-threads.c b/testee-threads.c new file mode 100644 index 000000000..cacc1eb9a --- /dev/null +++ b/testee-threads.c @@ -0,0 +1,74 @@ +/* + * A simple testee program with threads + */ + +#include 
<stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <string.h> + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <pthread.h> + + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static int counter; + +static void *f1(void *arg) +{ + void *map_unreadable = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + (void)map_unreadable; + + while (1) { + pthread_mutex_lock(&mtx); + + counter++; + /* printf("Counter value: %d\n", counter); */ + + pthread_mutex_unlock(&mtx); + sleep(2); + } + + return NULL; +} + +static void *f2(void *arg) +{ + void *map_unreadable = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + (void)map_unreadable; + + while (1) { + pthread_mutex_lock(&mtx); + + counter++; + /* printf("Counter value: %d\n", counter); */ + + pthread_mutex_unlock(&mtx); + sleep(3); + } + + return NULL; +} + +int main(int argc, char *argv[]) +{ + pthread_t th1, th2; + int rc1, rc2; + + printf("%s pid %d\n", argv[0], getpid()); + + rc1 = pthread_create(&th1, NULL, &f1, NULL); + rc2 = pthread_create(&th2, NULL, &f2, NULL); + + if (rc1 | rc2) + exit(1); + + pthread_join(th1, NULL); + pthread_join(th2, NULL); + + exit(0); +} diff --git a/testee-unlinked.c b/testee-unlinked.c new file mode 100644 index 000000000..7287f52c8 --- /dev/null +++ b/testee-unlinked.c @@ -0,0 +1,92 @@ +/* + * A simple testee program + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <string.h> + +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> + +#include <sched.h> + +int main(int argc, char *argv[]) +{ + int fd_shared, fd_private; + const char data_mark[] = "This is a data_mark marker"; + void *mmap_shared, *mmap_private, *mmap_anon, *map_unreadable; + const char sep[] = "----------"; + pid_t pid, child; + int i; + + printf("%s pid %d\n", 
argv[0], getpid()); + + fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd_shared < 0) { + perror("Can't open fd_shared file"); + goto err; + } + + fd_private = open("testee-private.img", O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd_private < 0) { + perror("Can't open fd_private file"); + goto err; + } + + if (lseek(fd_shared, 1024, SEEK_SET) == -1 || + lseek(fd_private, 1024, SEEK_SET) == -1) { + perror("Can't llsek"); + goto err; + } + + write(fd_shared, "", 1); + write(fd_private, "", 1); + + mmap_shared = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0); + mmap_private = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0); + + if (mmap_shared == MAP_FAILED || + mmap_private == MAP_FAILED) { + + perror("mmap failed"); + goto err; + } + + strcpy((char *)mmap_shared, sep); + strcpy((char *)mmap_private, sep); + + for (i = 64; i < 128; i++) { + ((char *)mmap_shared)[i] = 0 + i; + ((char *)mmap_private)[i] = 64 + i; + } + + fsync(fd_shared); + fsync(fd_private); + + close(fd_shared); + fsync(fd_private); + + unlink("testee-shared.img"); + unlink("testee-private.img"); + + for (i = 64; i < 128; i++) { + ((char *)mmap_shared)[i] = 0 + i; + ((char *)mmap_private)[i] = 64 + i; + } + + msync(mmap_shared, 1024, MS_SYNC); + msync(mmap_private, 1024, MS_SYNC); + + while (1) + sleep(1); + +err: + /* resources are released by kernel */ + return 0; +} diff --git a/testee.c b/testee.c new file mode 100644 index 000000000..b65fdbbbe --- /dev/null +++ b/testee.c @@ -0,0 +1,231 @@ +/* + * A simple testee program + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <string.h> + +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> + +#include <sched.h> + +static int do_child(void *arg) +{ + printf("do_child pid: %d\n", getpid()); + + void *stack, *mmap_anon; + + stack = mmap(0, 4 * 4096, 
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0); + if (stack == MAP_FAILED) + return -1; + + mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mmap_anon == MAP_FAILED) + return -1; + + while (1) + sleep(6); + + return 0; +} + +static int run_clone(void) +{ + pid_t pid = 0; + int ret = 0; + void *stack, *mmap_anon; + + stack = mmap(0, 4 * 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0); + if (stack == MAP_FAILED) + return -1; + + mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mmap_anon == MAP_FAILED) + return -1; + + stack += 4 * 4096; + + ret = clone(do_child, stack, CLONE_FS, NULL, NULL, NULL, &pid); + if (ret < 0) + perror("Failed clone"); + + printf("run_clone: %d stack: %p mmap_anon: %p ret %d\n", + pid, stack, mmap_anon, ret); + + if (stack == MAP_FAILED) + return -1; + + mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mmap_anon == MAP_FAILED) + return -1; + + stack += 4 * 4096; + + ret = clone(do_child, stack, CLONE_FS | CLONE_FILES | CLONE_VM, NULL, NULL, NULL, &pid); + if (ret < 0) + perror("Failed clone"); + + printf("run_clone: %d stack: %p mmap_anon: %p ret %d\n", + pid, stack, mmap_anon, ret); + + return ret; +} + +int main(int argc, char *argv[]) +{ +// int pipefd[2]; + int fd_shared, fd_private; + const char data_mark[] = "This is a data_mark marker"; + void *mmap_shared, *mmap_private, *mmap_anon, *map_unreadable; + const char sep[] = "----------"; + pid_t pid, child; + char suided_path[128]; + int i; + + (void)data_mark; + + printf("%s pid %d\n", argv[0], getpid()); + +// if (pipe(pipefd)) { +// perror("Can't create pipe"); +// goto err; +// } + + fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd_shared < 0) { + perror("Can't open fd_shared file"); + goto err; + } + + fd_private = open("testee-private.img", O_RDWR | O_CREAT | 
O_TRUNC, 0600); + if (fd_private < 0) { + perror("Can't open fd_private file"); + goto err; + } + + if (lseek(fd_shared, 1024, SEEK_SET) == -1 || + lseek(fd_private, 1024, SEEK_SET) == -1) { + perror("Can't llsek"); + goto err; + } + + write(fd_shared, "", 1); + write(fd_private, "", 1); + + mmap_shared = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0); + mmap_private = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0); + mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + map_unreadable = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (mmap_shared == MAP_FAILED || + mmap_private == MAP_FAILED || + mmap_anon == MAP_FAILED || + map_unreadable == MAP_FAILED) { + + perror("mmap failed"); + goto err; + } + + snprintf(suided_path, sizeof(suided_path), + "/proc/%d/map_files/%lx-%lx", + getpid(), (long)mmap_shared, + (long)mmap_shared + 0x1000); + + strcpy((char *)mmap_shared, sep); + strcpy((char *)mmap_private, sep); + strcpy((char *)mmap_anon, sep); + strcpy((char *)map_unreadable, sep); + + for (i = 64; i < 128; i++) { + ((char *)mmap_shared)[i] = 0 + i; + ((char *)mmap_private)[i] = 64 + i; + ((char *)mmap_anon)[i] = 128 + i; + ((char *)map_unreadable)[i] = 190 + i; + } + + if (mprotect(map_unreadable, 1024, PROT_NONE)) { + perror("mprotect failed"); + goto err; + } + + asm volatile("" ::: "memory"); + + fsync(fd_shared); + fsync(fd_private); + + close(fd_shared); + + if (argc > 1) { + + printf("my-uid: %d\n", getuid()); + setuid(atoi(argv[1])); + printf("my-uid: %d\n", getuid()); + } + + fd_shared = open(suided_path, O_RDWR, 0600); + printf("fd_shared for O_RDWR: %d\n", fd_shared); + if (fd_shared >= 0) { + write(fd_shared, "aaaa", sizeof("aaaa")); + close(fd_shared); + } + + fd_shared = open(suided_path, O_TRUNC, 0600); + printf("fd_shared for O_TRUNC: %d\n", fd_shared); + if (fd_shared >= 0) { + printf("tunc: %d\n", 
ftruncate(fd_shared, 512)); + close(fd_shared); + } + + fd_shared = open(suided_path, O_RDONLY, 0600); + printf("fd_shared for O_RDONLY: %d\n", fd_shared); + if (fd_shared >= 0) + close(fd_shared); + + sync(); + asm volatile("" ::: "memory"); + + pid = fork(); + if (pid == -1) + goto err; + + if (pid == 0) { + long buf; + child = fork(); + if (child == -1) + goto err; + if (child == 0) { + printf("first child pid: %d\n", getpid()); +// while (read(pipefd[0], &buf, sizeof(buf)) > 0) +// sleep(3); + while (1) { + printf("ping: %d\n", getpid()); + sleep(8); + } + } else { + printf("first parent pid: %d\n", getpid()); +// run_clone(); + while (1) { + printf("ping: %d\n", getpid()); + sleep(9); + } + } + } else { + long buf = 0xdeadbeef; + while (1) { + printf("ping: %d\n", getpid()); +// write(pipefd[1], &buf, sizeof(buf)); + sleep(10); + } + } + +err: + /* resources are released by kernel */ + return 0; +} @@ -0,0 +1,412 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <stdbool.h> +#include <limits.h> + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/ptrace.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/wait.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <limits.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <dirent.h> + +#include <fcntl.h> + +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/vfs.h> +#include <sys/ptrace.h> +#include <sys/user.h> +#include <sys/wait.h> + +#include "compiler.h" +#include "types.h" +#include "list.h" +#include "util.h" + +#include "crtools.h" + +static char big_buffer[PATH_MAX]; + +void printk(const char *format, ...) 
+{ + va_list params; + + va_start(params, format); + vfprintf(stdout, format, params); + va_end(params); +} + +int ptrace_show_area_r(pid_t pid, void *addr, long bytes) +{ + unsigned long w, i; + if (bytes & (sizeof(long) - 1)) + return -1; + for (w = 0; w < bytes / sizeof(long); w++) { + unsigned long *a = addr; + unsigned long v; + v = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL); + if (v == -1U && errno) + goto err; + else { + unsigned char *c = (unsigned char *)&v; + for (i = sizeof(v)/sizeof(*c); i > 0; i--) + printk("%02x ", c[i - 1]); + printk(" "); + } + } + printk("\n"); + return 0; +err: + return -2; +} + +int ptrace_show_area(pid_t pid, void *addr, long bytes) +{ + unsigned long w, i; + if (bytes & (sizeof(long) - 1)) + return -1; + printk("%016lx: ", (unsigned long)addr); + for (w = 0; w < bytes / sizeof(long); w++) { + unsigned long *a = addr; + unsigned long v; + v = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL); + if (v == -1U && errno) + goto err; + else { + unsigned char *c = (unsigned char *)&v; + for (i = 0; i < sizeof(v)/sizeof(*c); i++) + printk("%02x ", c[i]); + printk(" "); + } + } + printk("\n"); + return 0; +err: + return -2; +} + +int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes) +{ + unsigned long w; + if (bytes & (sizeof(long) - 1)) + return -1; + for (w = 0; w < bytes / sizeof(long); w++) { + unsigned long *d = dst, *a = addr; + d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL); + if (d[w] == -1U && errno) + goto err; + } + return 0; +err: + return -2; +} + +int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes) +{ + unsigned long w; + if (bytes & (sizeof(long) - 1)) + return -1; + for (w = 0; w < bytes / sizeof(long); w++) { + unsigned long *s = src, *a = addr; + if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w])) + goto err; + } + return 0; +err: + return -2; +} + +void printk_registers(user_regs_struct_t *regs) +{ + printk("ip : %16lx cs : %16lx ds : %16lx\n" + "es : %16lx fs : %16lx gs : %16lx\n" + "sp : %16lx ss : 
%16lx flags : %16lx\n" + "ax : %16lx cx : %16lx dx : %16lx\n" + "si : %16lx di : %16lx bp : %16lx\n" + "bx : %16lx r8 : %16lx r9 : %16lx\n" + "r10 : %16lx r11 : %16lx r12 : %16lx\n" + "r13 : %16lx r14 : %16lx r15 : %16lx\n" + "orig_ax: %16lx fs_base: %16lx gs_base: %16lx\n\n", + regs->ip, regs->cs, regs->ds, + regs->es, regs->fs, regs->gs, + regs->sp, regs->ss, regs->flags, + regs->ax, regs->cx, regs->dx, + regs->si, regs->di, regs->bp, + regs->bx, regs->r8, regs->r9, + regs->r10, regs->r11, regs->r12, + regs->r13, regs->r14, regs->r15, + regs->orig_ax, regs->fs_base, regs->gs_base); +} + +void printk_siginfo(siginfo_t *siginfo) +{ + printk("si_signo %d si_errno %d si_code %d\n", + siginfo->si_signo, siginfo->si_errno, siginfo->si_code); +} + +void printk_vma(struct vma_area *vma_area) +{ + if (!vma_area) + return; + + printk("s: %16lx e: %16lx l: %4liK p: %4x f: %4x fd: %4d pid: %4d dev:%02x:%02x:%04lx vf: %s st: %s spc: %s\n", + vma_area->vma.start, vma_area->vma.end, + (vma_area->vma.end - vma_area->vma.start) >> 10, + vma_area->vma.prot, + vma_area->vma.flags, + vma_area->vma.fd, + vma_area->vma.pid, + vma_area->vma.dev_maj, + vma_area->vma.dev_min, + vma_area->vma.ino, + vma_area->vm_file_fd < 0 ? "n" : "y", + !vma_area->vma.status ? "--" : + ((vma_area->vma.status & VMA_FILE_PRIVATE) ? "FP" : + ((vma_area->vma.status & VMA_FILE_SHARED) ? "FS" : + ((vma_area->vma.status & VMA_ANON_SHARED) ? "AS" : + ((vma_area->vma.status & VMA_ANON_PRIVATE) ? "AP" : "--")))), + !vma_area->vma.status ? "--" : + ((vma_area->vma.status & VMA_AREA_STACK) ? "stack" : + ((vma_area->vma.status & VMA_AREA_VSYSCALL) ? "vsyscall" : + ((vma_area->vma.status & VMA_AREA_VDSO) ? 
"vdso" : "n")))); +} + +int unseize_task(pid_t pid) +{ + return ptrace(PTRACE_DETACH, pid, NULL, NULL); +} + +int seize_task(pid_t pid) +{ + siginfo_t si; + int status; + int ret = 0; + + jerr_rc(ptrace(PTRACE_SEIZE, pid, NULL, + (void *)(unsigned long)PTRACE_SEIZE_DEVEL), ret, err); + jerr_rc(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), ret, err); + + ret = -10; + if (wait4(pid, &status, __WALL, NULL) != pid) + goto err; + + ret = -20; + if (!WIFSTOPPED(status)) + goto err; + + jerr_rc(ptrace(PTRACE_GETSIGINFO, pid, NULL, &si), ret, err_cont); + + ret = -30; + if ((si.si_code >> 8) != PTRACE_EVENT_STOP) + goto err_cont; + + jerr_rc(ptrace(PTRACE_SETOPTIONS, pid, NULL, + (void *)(unsigned long)PTRACE_O_TRACEEXIT), ret, err_cont); + +err: + return ret; + +err_cont: + continue_task(pid); + goto err; +} + +int reopen_fd_as(int new_fd, int old_fd) +{ + if (old_fd != new_fd) { + int tmp = dup2(old_fd, new_fd); + if (tmp < 0) + return tmp; + close(old_fd); + } + + return new_fd; +} + +int parse_maps(pid_t pid, struct list_head *vma_area_list) +{ + struct vma_area *vma_area = NULL; + u64 start, end, pgoff; + char map_files_path[64]; + char maps_path[64]; + unsigned long ino; + char r,w,x,s; + int dev_maj, dev_min; + int ret = -1; + + DIR *map_files_dir = NULL; + FILE *maps = NULL; + + snprintf(maps_path, sizeof(maps_path), "/proc/%d/maps", pid); + maps = fopen(maps_path, "r"); + if (!maps) { + pr_perror("Can't open: %s\n", maps_path); + goto err; + } + + snprintf(map_files_path, sizeof(map_files_path), + "/proc/%d/map_files", pid); + + /* + * It might be a problem in kernel, either + * I'm debugging it on old kernel ;) + */ + map_files_dir = opendir(map_files_path); + if (!map_files_dir) + pr_warning("Crap, can't open %s, old kernel?\n", + map_files_path); + + while (fgets(big_buffer, sizeof(big_buffer), maps)) { + char vma_file_path[16+16+2]; + struct stat st_buf; + + ret = sscanf(big_buffer, "%lx-%lx %c%c%c%c %lx %02x:%02x %lu", + &start, &end, &r, &w, &x, &s, &pgoff, 
&dev_maj, + &dev_min, &ino); + if (ret != 10) { + pr_error("Can't parse: %s", big_buffer); + return -1; + } + + vma_area = alloc_vma_area(); + if (!vma_area) + return -1; + + /* Figure out if it's file mapping */ + snprintf(vma_file_path, sizeof(vma_file_path), "%lx-%lx", start, end); + + if (map_files_dir) { + /* + * Note that we "open" it in dumper process space + * so later we might refer to it via /proc/self/fd/vm_file_fd + * if needed. + */ + vma_area->vm_file_fd = openat(dirfd(map_files_dir), + vma_file_path, O_RDONLY); + if (vma_area->vm_file_fd < 0) { + if (errno != ENOENT) { + pr_perror("Failed opening %s/%s\n", + map_files_path, + vma_file_path); + goto err; + } + } + } + + vma_area->vma.pid = pid; + vma_area->vma.start = start; + vma_area->vma.end = end; + vma_area->vma.pgoff = pgoff; + + vma_area->vma.ino = ino; + vma_area->vma.dev_maj = dev_maj; + vma_area->vma.dev_min = dev_min; + + vma_area->vma.prot = PROT_NONE; + + if (r == 'r') + vma_area->vma.prot |= PROT_READ; + if (w == 'w') + vma_area->vma.prot |= PROT_WRITE; + if (x == 'x') + vma_area->vma.prot |= PROT_EXEC; + + if (s == 's') + vma_area->vma.flags = MAP_SHARED; + else if (s == 'p') + vma_area->vma.flags = MAP_PRIVATE; + + vma_area->vma.status = 0; + + if (strstr(big_buffer, "[stack]")) + vma_area->vma.status |= VMA_AREA_REGULAR | VMA_AREA_STACK; + else if (strstr(big_buffer, "[vsyscall]")) + vma_area->vma.status |= VMA_AREA_VSYSCALL; + else if (strstr(big_buffer, "[vdso]")) + vma_area->vma.status |= VMA_AREA_VDSO; + else if (strstr(big_buffer, "[heap]")) + vma_area->vma.status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + else + vma_area->vma.status = VMA_AREA_REGULAR; + + /* + * Some mapping hints for restore, we save this on + * disk and restore might need to analyze it. 
+ */ + if (vma_area->vm_file_fd >= 0) { + + if (fstat(vma_area->vm_file_fd, &st_buf) < 0) { + pr_perror("Failed fstat on %s%s\n", + map_files_path, + vma_file_path); + goto err; + } + if (!S_ISREG(st_buf.st_mode)) { + pr_error("Can't handle non-regular " + "mapping on %s%s\n", + map_files_path, + vma_file_path); + goto err; + } + + /* + * /dev/zero stands for anon-shared mapping + * otherwise it's some file mapping. + */ + if (MAJOR(st_buf.st_dev) == 0) { + if (!(vma_area->vma.flags & MAP_SHARED)) + goto err_bogus_mapping; + vma_area->vma.status |= VMA_ANON_SHARED; + vma_area->shmid = st_buf.st_ino; + } else { + if (vma_area->vma.flags & MAP_PRIVATE) + vma_area->vma.status |= VMA_FILE_PRIVATE; + else + vma_area->vma.status |= VMA_FILE_SHARED; + } + } else { + /* + * No file but mapping -- anonymous one. + */ + if (vma_area->vma.flags & MAP_SHARED) + goto err_bogus_mapping; + else + vma_area->vma.status |= VMA_ANON_PRIVATE; + } + + list_add_tail(&vma_area->list, vma_area_list); + } + + vma_area = NULL; + ret = 0; + +err: + if (maps) + fclose(maps); + + if (map_files_dir) + closedir(map_files_dir); + + xfree(vma_area); + return ret; + +err_bogus_mapping: + pr_error("Bogus mapping %lx-%lx\n", + vma_area->vma.start, + vma_area->vma.end); + goto err; +} diff --git a/xemul/0003-Image-dumping-via-proc-file.patch b/xemul/0003-Image-dumping-via-proc-file.patch new file mode 100644 index 000000000..8e40b874c --- /dev/null +++ b/xemul/0003-Image-dumping-via-proc-file.patch @@ -0,0 +1,562 @@ +From f7e9d28188e7e2fd0f13f2696f29f20d784cb8fd Mon Sep 17 00:00:00 2001 +From: root <root@ovzept.sw.ru> +Date: Fri, 3 Jun 2011 18:16:10 +0400 +Subject: [PATCH] Image dumping via proc file + +--- + fs/proc/Kconfig | 8 + fs/proc/Makefile | 1 + fs/proc/base.c | 3 + fs/proc/img_dump.c | 397 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/binfmt_img.h | 87 +++++++++ + include/linux/proc_fs.h | 2 + 6 files changed, 498 insertions(+) + create mode 100644 fs/proc/img_dump.c + create 
mode 100644 include/linux/binfmt_img.h + +Index: linux-2.6.git/fs/proc/Kconfig +=================================================================== +--- linux-2.6.git.orig/fs/proc/Kconfig ++++ linux-2.6.git/fs/proc/Kconfig +@@ -67,3 +67,11 @@ config PROC_PAGE_MONITOR + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. ++ ++config PROC_IMG ++ default y ++ depends on PROC_FS ++ bool "Enable /proc/<pid>/dump file" ++ help ++ Say Y here if you want to be able to produce checkpoint-restore images ++ for tasks via proc +Index: linux-2.6.git/fs/proc/Makefile +=================================================================== +--- linux-2.6.git.orig/fs/proc/Makefile ++++ linux-2.6.git/fs/proc/Makefile +@@ -28,3 +28,4 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o + proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o + proc-$(CONFIG_PRINTK) += kmsg.o + proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o ++proc-$(CONFIG_PROC_IMG) += img_dump.o +Index: linux-2.6.git/fs/proc/base.c +=================================================================== +--- linux-2.6.git.orig/fs/proc/base.c ++++ linux-2.6.git/fs/proc/base.c +@@ -2983,6 +2983,9 @@ static const struct pid_entry tgid_base_ + #endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tgid_stat), ++#ifdef CONFIG_PROC_IMG ++ REG("dump", S_IRUSR|S_IWUSR, proc_pid_dump_operations), ++#endif + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), + #ifdef CONFIG_NUMA +Index: linux-2.6.git/fs/proc/img_dump.c +=================================================================== +--- /dev/null ++++ linux-2.6.git/fs/proc/img_dump.c +@@ -0,0 +1,397 @@ ++#include <linux/proc_fs.h> ++#include <linux/sched.h> ++#include <linux/uaccess.h> ++#include <linux/binfmt_img.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <linux/highmem.h> ++#include 
<linux/types.h> ++#include "internal.h" ++ ++static int img_dump_buffer(char __user *ubuf, size_t size, void *buf, int len, int pos) ++{ ++ int ret; ++ static size_t dumped = 0; ++ ++ len -= pos; ++ if (len > size) ++ len = size; ++ ++ ret = copy_to_user(ubuf, buf + pos, len); ++ if (ret) ++ return -EFAULT; ++ ++ dumped += len; ++ return len; ++} ++ ++static int img_dump_header(char __user *buf, size_t size, int pos) ++{ ++ struct binfmt_img_header hdr; ++ ++ hdr.magic = BINFMT_IMG_MAGIC; ++ hdr.version = BINFMT_IMG_VERS_0; ++ ++ return img_dump_buffer(buf, size, &hdr, sizeof(hdr), pos); ++} ++ ++static __u16 encode_segment(unsigned short seg) ++{ ++ if (seg == 0) ++ return CKPT_X86_SEG_NULL; ++ BUG_ON((seg & 3) != 3); ++ ++ if (seg == __USER_CS) ++ return CKPT_X86_SEG_USER64_CS; ++ if (seg == __USER_DS) ++ return CKPT_X86_SEG_USER64_DS; ++#ifdef CONFIG_COMPAT ++ if (seg == __USER32_CS) ++ return CKPT_X86_SEG_USER32_CS; ++ if (seg == __USER32_DS) ++ return CKPT_X86_SEG_USER32_DS; ++#endif ++ ++ if (seg & 4) ++ return CKPT_X86_SEG_LDT | (seg >> 3); ++ ++ seg >>= 3; ++ if (GDT_ENTRY_TLS_MIN <= seg && seg <= GDT_ENTRY_TLS_MAX) ++ return CKPT_X86_SEG_TLS | (seg - GDT_ENTRY_TLS_MIN); ++ ++ printk(KERN_ERR "c/r: (decode) bad segment %#hx\n", seg); ++ BUG(); ++} ++ ++static __u64 encode_tls(struct desc_struct *d) ++{ ++ return ((__u64)d->a << 32) + d->b; ++} ++ ++static int img_dump_regs(struct task_struct *p, char __user *buf, size_t size, int pos) ++{ ++ struct binfmt_regs_image regi; ++ struct pt_regs *regs; ++ int i; ++ ++ regs = task_pt_regs(p); ++ ++ regi.r15 = regs->r15; ++ regi.r14 = regs->r14; ++ regi.r13 = regs->r13; ++ regi.r12 = regs->r12; ++ regi.r11 = regs->r11; ++ regi.r10 = regs->r10; ++ regi.r9 = regs->r9; ++ regi.r8 = regs->r8; ++ regi.ax = regs->ax; ++ regi.orig_ax = regs->orig_ax; ++ regi.bx = regs->bx; ++ regi.cx = regs->cx; ++ regi.dx = regs->dx; ++ regi.si = regs->si; ++ regi.di = regs->di; ++ regi.ip = regs->ip; ++ regi.flags = regs->flags; ++ 
regi.bp = regs->bp; ++ regi.sp = regs->sp; ++ ++ /* segments */ ++ regi.gsindex = encode_segment(p->thread.gsindex); ++ regi.fsindex = encode_segment(p->thread.fsindex); ++ regi.cs = encode_segment(regs->cs); ++ regi.ss = encode_segment(regs->ss); ++ regi.ds = encode_segment(p->thread.ds); ++ regi.es = encode_segment(p->thread.es); ++ ++ BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES); ++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) ++ regi.tls[i] = encode_tls(&p->thread.tls_array[i]); ++ ++ if (p->thread.gsindex) ++ regi.gs = 0; ++ else ++ regi.gs = p->thread.gs; ++ ++ if (p->thread.fsindex) ++ regi.fs = 0; ++ else ++ regi.fs = p->thread.fs; ++ ++ return img_dump_buffer(buf, size, ®i, sizeof(regi), pos); ++} ++ ++static int img_dump_mm(struct mm_struct *mm, char __user *buf, size_t size, int pos) ++{ ++ struct binfmt_mm_image mmi; ++ ++ mmi.flags = mm->flags; ++ mmi.def_flags = mm->def_flags; ++ mmi.start_code = mm->start_code; ++ mmi.end_code = mm->end_code; ++ mmi.start_data = mm->start_data; ++ mmi.end_data = mm->end_data; ++ mmi.start_brk = mm->start_brk; ++ mmi.brk = mm->brk; ++ mmi.start_stack = mm->start_stack; ++ mmi.arg_start = mm->arg_start; ++ mmi.arg_end = mm->arg_end; ++ mmi.env_start = mm->env_start; ++ mmi.env_end = mm->env_end; ++ mmi.exe_fd = 0; ++ ++ return img_dump_buffer(buf, size, &mmi, sizeof(mmi), pos); ++} ++ ++static int img_dump_vma(struct vm_area_struct *vma, char __user *buf, size_t size, int pos) ++{ ++ struct binfmt_vma_image vmai; ++ ++ if (vma == NULL) { ++ memset(&vmai, 0, sizeof(vmai)); ++ goto dumpit; ++ } ++ ++ printk("Dumping vma %016lx-%016lx %p/%p\n", vma->vm_start, vma->vm_end, vma, vma->vm_mm); ++ ++ vmai.fd = 0; ++ vmai.prot = 0; ++ if (vma->vm_flags & VM_READ) ++ vmai.prot |= PROT_READ; ++ if (vma->vm_flags & VM_WRITE) ++ vmai.prot |= PROT_WRITE; ++ if (vma->vm_flags & VM_EXEC) ++ vmai.prot |= PROT_EXEC; ++ ++ vmai.flags = 0; ++ if (vma->vm_file == NULL) ++ vmai.flags |= MAP_ANONYMOUS; ++ if (vma->vm_flags & 
VM_MAYSHARE) ++ vmai.flags |= MAP_SHARED; ++ else ++ vmai.flags |= MAP_PRIVATE; ++ ++ vmai.start = vma->vm_start; ++ vmai.end = vma->vm_end; ++ vmai.pgoff = vma->vm_pgoff; ++ ++dumpit: ++ return img_dump_buffer(buf, size, &vmai, sizeof(vmai), pos); ++} ++ ++static int img_dump_page(unsigned long addr, void *data, char __user *buf, size_t size, int pos) ++{ ++ struct binfmt_page_image pgi; ++ int ret = 0, tmp; ++ ++ pgi.vaddr = addr; ++ ++ if (pos < sizeof(pgi)) { ++ tmp = img_dump_buffer(buf, size, &pgi, sizeof(pgi), pos); ++ if (tmp < 0) ++ return tmp; ++ ++ ret = tmp; ++ if (size <= ret) ++ return ret; ++ ++ buf += ret; ++ size -= ret; ++ pos = 0; ++ } else ++ pos -= sizeof(pgi); ++ ++ tmp = img_dump_buffer(buf, size, data, PAGE_SIZE, pos); ++ if (tmp < 0) ++ return tmp; ++ ++ return ret + tmp; ++} ++ ++static inline int is_private_vma(struct vm_area_struct *vma) ++{ ++ if (vma->vm_file == NULL) ++ return 1; ++ if (!(vma->vm_flags & VM_SHARED)) ++ return 1; ++ return 0; ++} ++ ++static ssize_t do_produce_dump(struct task_struct *p, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ size_t img_pos = 0, img_ppos; ++ size_t produced = 0; ++ int len; ++ loff_t pos = *ppos; ++ struct mm_struct *mm; ++ struct vm_area_struct *vma; ++ ++#define move_pos(); do { \ ++ buf += len; \ ++ produced += len;\ ++ size -= len; \ ++ pos += len; \ ++ } while (0) ++ ++#define seek_pos(__size); do { \ ++ img_ppos = img_pos; \ ++ img_pos += (__size); \ ++ } while (0) ++ ++ /* header */ ++ seek_pos(sizeof(struct binfmt_img_header)); ++ if (pos < img_pos) { ++ len = img_dump_header(buf, size, pos - img_ppos); ++ if (len < 0) ++ goto err; ++ ++ move_pos(); ++ if (size == 0) ++ goto out; ++ } ++ ++ /* registers */ ++ seek_pos(sizeof(struct binfmt_regs_image)); ++ if (pos < img_pos) { ++ len = img_dump_regs(p, buf, size, pos - img_ppos); ++ if (len < 0) ++ goto err; ++ ++ move_pos(); ++ if (size == 0) ++ goto out; ++ } ++ ++ /* memory */ ++ mm = get_task_mm(p); ++ if (mm == NULL) ++ 
return -EACCES; ++ ++ down_read(&mm->mmap_sem); ++ ++ seek_pos(sizeof(struct binfmt_mm_image)); ++ if (pos < img_pos) { ++ len = img_dump_mm(mm, buf, size, pos - img_ppos); ++ if (len < 0) ++ goto err_mm; ++ ++ move_pos(); ++ if (size == 0) ++ goto out_mm; ++ } ++ ++ vma = mm->mmap; ++ while (1) { ++ seek_pos(sizeof(struct binfmt_vma_image)); ++ if (pos < img_pos) { ++ len = img_dump_vma(vma, buf, size, pos - img_ppos); ++ if (len < 0) ++ goto err_mm; ++ ++ move_pos(); ++ if (size == 0) ++ goto out_mm; ++ } ++ ++ if (vma == NULL) ++ break; ++ ++ vma = vma->vm_next; ++ } ++ ++ for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { ++ /* slow and stupid */ ++ unsigned long addr; ++ struct page *page; ++ void *pg_data; ++ ++ if (!is_private_vma(vma)) ++ continue; ++ ++ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { ++ page = follow_page(vma, addr, FOLL_FORCE | FOLL_DUMP | FOLL_GET); ++ if (page == NULL) ++ continue; ++ if (IS_ERR(page)) /* huh? */ ++ continue; ++ ++ seek_pos(sizeof(struct binfmt_page_image) + PAGE_SIZE); ++ if (pos < img_pos) { ++ pg_data = kmap(page); ++ len = img_dump_page(addr, pg_data, buf, size, pos - img_ppos); ++ kunmap(page); ++ ++ if (len < 0) { ++ put_page(page); ++ goto err_mm; ++ } ++ ++ move_pos(); ++ if (size == 0) { ++ put_page(page); ++ goto out_mm; ++ } ++ } ++ ++ put_page(page); ++ } ++ } ++ ++ seek_pos(sizeof(struct binfmt_page_image)); ++ if (pos < img_pos) { ++ struct binfmt_page_image zero; ++ ++ memset(&zero, 0, sizeof(zero)); ++ len = img_dump_buffer(buf, size, &zero, sizeof(zero), pos - img_ppos); ++ if (len < 0) ++ goto err; ++ ++ move_pos(); ++ } ++ ++out_mm: ++ up_read(&mm->mmap_sem); ++ mmput(mm); ++out: ++ *ppos = pos; ++ return produced; ++ ++err_mm: ++ up_read(&mm->mmap_sem); ++ mmput(mm); ++err: ++ return len; ++} ++ ++static ssize_t img_dump_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) ++{ ++ struct task_struct *p; ++ ++ p = get_proc_task(file->f_dentry->d_inode); ++ if (p 
== NULL) ++ return -ESRCH; ++ ++ if (!(p->state & TASK_STOPPED)) { ++ put_task_struct(p); ++ return -EINVAL; ++ } ++ ++ return do_produce_dump(p, buf, size, ppos); ++} ++ ++static int img_dump_open(struct inode *inode, struct file *filp) ++{ ++ return 0; ++} ++ ++static int img_dump_release(struct inode *inode, struct file *filp) ++{ ++ return 0; ++} ++ ++const struct file_operations proc_pid_dump_operations = { ++ .open = img_dump_open, ++ .read = img_dump_read, ++ .release = img_dump_release, ++}; +Index: linux-2.6.git/include/linux/binfmt_img.h +=================================================================== +--- /dev/null ++++ linux-2.6.git/include/linux/binfmt_img.h +@@ -0,0 +1,87 @@ ++#ifndef __BINFMT_IMG_H__ ++#define __BINFMT_IMG_H__ ++ ++#include <linux/types.h> ++ ++struct binfmt_img_header { ++ __u32 magic; ++ __u32 version; ++}; ++ ++#define CKPT_TLS_ENTRIES 3 ++ ++struct binfmt_regs_image { ++ __u64 r15; ++ __u64 r14; ++ __u64 r13; ++ __u64 r12; ++ __u64 r11; ++ __u64 r10; ++ __u64 r9; ++ __u64 r8; ++ __u64 ax; ++ __u64 orig_ax; ++ __u64 bx; ++ __u64 cx; ++ __u64 dx; ++ __u64 si; ++ __u64 di; ++ __u64 ip; ++ __u64 flags; ++ __u64 bp; ++ __u64 sp; ++ ++ __u64 gs; ++ __u64 fs; ++ __u64 tls[CKPT_TLS_ENTRIES]; ++ __u16 gsindex; ++ __u16 fsindex; ++ __u16 cs; ++ __u16 ss; ++ __u16 ds; ++ __u16 es; ++}; ++ ++#define CKPT_X86_SEG_NULL 0 ++#define CKPT_X86_SEG_USER32_CS 1 ++#define CKPT_X86_SEG_USER32_DS 2 ++#define CKPT_X86_SEG_USER64_CS 3 ++#define CKPT_X86_SEG_USER64_DS 4 ++#define CKPT_X86_SEG_TLS 0x4000 ++#define CKPT_X86_SEG_LDT 0x8000 ++ ++struct binfmt_mm_image { ++ __u64 flags; ++ __u64 def_flags; ++ __u64 start_code; ++ __u64 end_code; ++ __u64 start_data; ++ __u64 end_data; ++ __u64 start_brk; ++ __u64 brk; ++ __u64 start_stack; ++ __u64 arg_start; ++ __u64 arg_end; ++ __u64 env_start; ++ __u64 env_end; ++ __u32 exe_fd; ++}; ++ ++struct binfmt_vma_image { ++ __u32 prot; ++ __u32 flags; ++ __u32 pad; ++ __u32 fd; ++ __u64 start; ++ __u64 end; ++ 
__u64 pgoff; ++}; ++ ++struct binfmt_page_image { ++ __u64 vaddr; ++}; ++ ++#define BINFMT_IMG_MAGIC 0xa75b8d43 ++#define BINFMT_IMG_VERS_0 0x00000100 ++ ++#endif +Index: linux-2.6.git/include/linux/proc_fs.h +=================================================================== +--- linux-2.6.git.orig/include/linux/proc_fs.h ++++ linux-2.6.git/include/linux/proc_fs.h +@@ -102,6 +102,8 @@ struct vmcore { + + #ifdef CONFIG_PROC_FS + ++extern const struct file_operations proc_pid_dump_operations; ++ + extern void proc_root_init(void); + + void proc_flush_task(struct task_struct *task); diff --git a/xemul/0004-Images-execution-binfmt-handler.patch b/xemul/0004-Images-execution-binfmt-handler.patch new file mode 100644 index 000000000..4e6c69e3b --- /dev/null +++ b/xemul/0004-Images-execution-binfmt-handler.patch @@ -0,0 +1,371 @@ +From 0f8e07457aa91e9461665440ca258eb9f93bf2f9 Mon Sep 17 00:00:00 2001 +From: root <root@ovzept.sw.ru> +Date: Fri, 3 Jun 2011 18:16:43 +0400 +Subject: [PATCH] Images execution binfmt handler + +--- + fs/Kconfig.binfmt | 6 + + fs/Makefile | 1 + + fs/binfmt_img.c | 324 +++++++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 331 insertions(+), 0 deletions(-) + create mode 100644 fs/binfmt_img.c + +diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt +index 79e2ca7..0b2f48e 100644 +--- a/fs/Kconfig.binfmt ++++ b/fs/Kconfig.binfmt +@@ -161,3 +161,9 @@ config BINFMT_MISC + You may say M here for module support and later load the module when + you have use for it; the module is called binfmt_misc. If you + don't know what to answer at this point, say Y. 
++ ++config BINFMT_IMG ++ tristate "Kernel support for IMG binaries" ++ depends on X86 ++ help ++ Say M/Y here to enable support for checkpoint-restore images execution +diff --git a/fs/Makefile b/fs/Makefile +index fb68c2b..8221719 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -33,6 +33,7 @@ obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o + obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o + obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o + obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o ++obj-$(CONFIG_BINFMT_IMG) += binfmt_img.o + + # binfmt_script is always there + obj-y += binfmt_script.o +diff --git a/fs/binfmt_img.c b/fs/binfmt_img.c +new file mode 100644 +index 0000000..9b09797 +--- /dev/null ++++ b/fs/binfmt_img.c +@@ -0,0 +1,324 @@ ++#include <linux/binfmt_img.h> ++#include <linux/module.h> ++#include <linux/binfmts.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <linux/highmem.h> ++#include <asm/tlbflush.h> ++#include <asm/desc.h> ++ ++/* ++ * The binary handler to save and restore a single task state ++ */ ++ ++static int img_check_header(void *buf) ++{ ++ struct binfmt_img_header *hdr = buf; ++ ++ if (hdr->magic != BINFMT_IMG_MAGIC) ++ return -ENOEXEC; ++ ++ if (hdr->version != BINFMT_IMG_VERS_0) ++ return -EINVAL; ++ ++ return sizeof(*hdr); ++} ++ ++static unsigned short decode_segment(__u16 seg) ++{ ++ if (seg == CKPT_X86_SEG_NULL) ++ return 0; ++ ++ if (seg == CKPT_X86_SEG_USER64_CS) ++ return __USER_CS; ++ if (seg == CKPT_X86_SEG_USER64_DS) ++ return __USER_DS; ++#ifdef CONFIG_COMPAT ++ if (seg == CKPT_X86_SEG_USER32_CS) ++ return __USER32_CS; ++ if (seg == CKPT_X86_SEG_USER32_DS) ++ return __USER32_DS; ++#endif ++ ++ if (seg & CKPT_X86_SEG_TLS) { ++ seg &= ~CKPT_X86_SEG_TLS; ++ return ((GDT_ENTRY_TLS_MIN + seg) << 3) | 3; ++ } ++ if (seg & CKPT_X86_SEG_LDT) { ++ seg &= ~CKPT_X86_SEG_LDT; ++ return (seg << 3) | 7; ++ } ++ BUG(); ++} ++ ++static void decode_tls(struct 
desc_struct *d, __u64 val) ++{ ++ d->a = (unsigned int)(val >> 32); ++ d->b = (unsigned int)(val & 0xFFFFFFFF); ++} ++ ++static int img_restore_regs(struct linux_binprm *bprm, loff_t off, struct pt_regs *regs) ++{ ++ int ret, i; ++ struct binfmt_regs_image regi; ++ struct thread_struct *th = ¤t->thread; ++ unsigned short seg; ++ ++ ret = kernel_read(bprm->file, off, (char *)®i, sizeof(regi)); ++ if (ret != sizeof(regi)) ++ return -EIO; ++ ++ regs->r15 = regi.r15; ++ regs->r14 = regi.r14; ++ regs->r13 = regi.r13; ++ regs->r12 = regi.r12; ++ regs->r11 = regi.r11; ++ regs->r10 = regi.r10; ++ regs->r9 = regi.r9; ++ regs->r8 = regi.r8; ++ regs->ax = regi.ax; ++ regs->orig_ax = regi.orig_ax; ++ regs->bx = regi.bx; ++ regs->cx = regi.cx; ++ regs->dx = regi.dx; ++ regs->si = regi.si; ++ regs->di = regi.di; ++ regs->ip = regi.ip; ++ regs->flags = regi.flags; ++ regs->bp = regi.bp; ++ regs->sp = regi.sp; ++ ++ regs->cs = decode_segment(regi.cs); ++ regs->ss = decode_segment(regi.ss); ++ ++ th->usersp = regi.sp; ++ th->ds = decode_segment(regi.ds); ++ th->es = decode_segment(regi.es); ++ th->fsindex = decode_segment(regi.fsindex); ++ th->gsindex = decode_segment(regi.gsindex); ++ ++ th->fs = regi.fs; ++ th->gs = regi.gs; ++ ++ BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES); ++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) ++ decode_tls(&th->tls_array[i], regi.tls[i]); ++ ++ load_TLS(th, smp_processor_id()); ++ ++ seg = th->fsindex; ++ loadsegment(fs, seg); ++ savesegment(fs, seg); ++ if (seg != th->fsindex) { ++ printk("ERROR saving fs selector want %x, has %x\n", ++ (unsigned int)th->fsindex, (unsigned int)seg); ++ return -EFAULT; ++ } ++ ++ if (th->fs) ++ wrmsrl(MSR_FS_BASE, th->fs); ++ load_gs_index(th->gsindex); ++ if (th->gs) ++ wrmsrl(MSR_KERNEL_GS_BASE, th->gs); ++ ++ return sizeof(regi); ++} ++ ++static int img_restore_mm(struct linux_binprm *bprm, loff_t off) ++{ ++ int ret; ++ struct binfmt_mm_image mmi; ++ struct mm_struct *mm = current->mm; ++ ++ ret = 
kernel_read(bprm->file, off, (char *)&mmi, sizeof(mmi)); ++ if (ret != sizeof(mmi)) ++ return -EIO; ++ ++ mm->flags = mmi.flags; ++ mm->def_flags = mmi.def_flags; ++ mm->start_code = mmi.start_code; ++ mm->end_code = mmi.end_code; ++ mm->start_data = mmi.start_data; ++ mm->end_data = mmi.end_data; ++ mm->start_brk = mmi.start_brk; ++ mm->brk = mmi.brk; ++ mm->start_stack = mmi.start_stack; ++ mm->arg_start = mmi.arg_start; ++ mm->arg_end = mmi.arg_end; ++ mm->env_start = mmi.env_start; ++ mm->env_end = mmi.env_end; ++ ++ if (mmi.exe_fd != 0) { ++ struct file *f; ++ ++ f = fget(mmi.exe_fd); ++ if (f == NULL) ++ return -EBADF; ++ ++ fput(mm->exe_file); ++ mm->exe_file = f; ++ } ++ ++ return sizeof(mmi); ++} ++ ++static int img_restore_vmas(struct linux_binprm *bprm, loff_t off) ++{ ++ int ret; ++ struct mm_struct *mm = current->mm; ++ int len = 0; ++ ++ do_munmap(mm, 0, TASK_SIZE); ++ ++ while (1) { ++ struct binfmt_vma_image vmai; ++ unsigned long addr; ++ struct file *file = NULL; ++ ++ len += sizeof(vmai); ++ ++ ret = kernel_read(bprm->file, off, (char *)&vmai, sizeof(vmai)); ++ if (ret != sizeof(vmai)) ++ return -EIO; ++ ++ if (vmai.start == 0 && vmai.end == 0) ++ break; ++ ++ if (vmai.fd != 0) { ++ file = fget(vmai.fd); ++ if (file == NULL) ++ return -EBADF; ++ } else ++ vmai.flags |= MAP_ANONYMOUS; ++ ++ if (vmai.start <= mm->start_stack && vmai.end >= mm->start_stack) ++ vmai.flags |= MAP_GROWSDOWN; ++ ++ addr = do_mmap_pgoff(file, vmai.start, vmai.end - vmai.start, ++ vmai.prot, vmai.flags | MAP_FIXED, vmai.pgoff); ++ ++ if (vmai.fd) { ++ fput(file); ++ do_close(vmai.fd); ++ } ++ ++ if ((long)addr < 0 || (addr != vmai.start)) ++ return -ENXIO; ++ ++ off += sizeof(vmai); ++ } ++ ++ return len; ++} ++ ++static int img_restore_pages(struct linux_binprm *bprm, loff_t off) ++{ ++ int ret; ++ struct mm_struct *mm = current->mm; ++ int len = 0; ++ ++ while (1) { ++ struct binfmt_page_image pgi; ++ struct vm_area_struct *vma; ++ struct page *page; ++ void *pg_data; 
++ ++ ret = kernel_read(bprm->file, off, (char *)&pgi, sizeof(pgi)); ++ if (ret != sizeof(pgi)) ++ return -EIO; ++ ++ len += sizeof(pgi); ++ if (pgi.vaddr == 0) ++ break; ++ ++ vma = find_vma(mm, pgi.vaddr); ++ if (vma == NULL) ++ return -ESRCH; ++ ++ ret = get_user_pages(current, current->mm, (unsigned long)pgi.vaddr, ++ 1, 1, 1, &page, NULL); ++ if (ret != 1) ++ return -EFAULT; ++ ++ pg_data = kmap(page); ++ ret = kernel_read(bprm->file, off + sizeof(pgi), pg_data, PAGE_SIZE); ++ kunmap(page); ++ put_page(page); ++ ++ if (ret != PAGE_SIZE) ++ return -EFAULT; ++ ++ len += PAGE_SIZE; ++ off += sizeof(pgi) + PAGE_SIZE; ++ } ++ ++ return len; ++} ++ ++static int img_restore_mem(struct linux_binprm *bprm, loff_t off) ++{ ++ int ret; ++ loff_t len = off; ++ ++ ret = img_restore_mm(bprm, len); ++ if (ret < 0) ++ return ret; ++ ++ len += ret; ++ ret = img_restore_vmas(bprm, len); ++ if (ret < 0) ++ return ret; ++ ++ len += ret; ++ ret = img_restore_pages(bprm, len); ++ if (ret < 0) ++ return ret; ++ ++ len += ret; ++ return len; ++ ++} ++ ++static int img_load_binary(struct linux_binprm * bprm, struct pt_regs * regs) ++{ ++ int ret; ++ loff_t len = 0; ++ ++ ret = img_check_header(bprm->buf); ++ if (ret < 0) ++ return ret; ++ ++ len += ret; ++ ret = img_restore_regs(bprm, len, regs); ++ if (ret < 0) ++ return ret; ++ ++ len += ret; ++ ret = img_restore_mem(bprm, len); ++ if (ret < 0) ++ return ret; ++ ++ return 0; ++} ++ ++static struct linux_binfmt img_binfmt = { ++ .module = THIS_MODULE, ++ .load_binary = img_load_binary, ++}; ++ ++static __init int img_binfmt_init(void) ++{ ++ return register_binfmt(&img_binfmt); ++} ++ ++static __exit void img_binfmt_exit(void) ++{ ++ unregister_binfmt(&img_binfmt); ++} ++ ++module_init(img_binfmt_init); ++module_exit(img_binfmt_exit); ++MODULE_LICENSE("GPL"); +-- +1.5.5.6 + diff --git a/xemul/binfmt_img.h b/xemul/binfmt_img.h new file mode 100644 index 000000000..8775d92ab --- /dev/null +++ b/xemul/binfmt_img.h @@ -0,0 +1,96 @@ 
+#ifndef __BINFMT_IMG_H__ +#define __BINFMT_IMG_H__ + +#include <linux/types.h> + +#define __packed __attribute__((packed)) + +struct binfmt_img_header { + __u32 magic; + __u32 version; + __u16 arch; + __u16 flags; +} __packed; + +#define CKPT_TLS_ENTRIES 3 + +struct binfmt_regs_image { + union { + struct { + __u64 r15; + __u64 r14; + __u64 r13; + __u64 r12; + __u64 r11; + __u64 r10; + __u64 r9; + __u64 r8; + __u64 ax; + __u64 orig_ax; + __u64 bx; + __u64 cx; + __u64 dx; + __u64 si; + __u64 di; + __u64 ip; + __u64 flags; + __u64 bp; + __u64 sp; + + __u64 gs; + __u64 fs; + __u64 tls[CKPT_TLS_ENTRIES]; + __u16 gsindex; + __u16 fsindex; + __u16 cs; + __u16 ss; + __u16 ds; + __u16 es; + } r; + __u64 dummy[32]; + }; +} __packed; + +#define CKPT_X86_SEG_NULL 0 +#define CKPT_X86_SEG_USER32_CS 1 +#define CKPT_X86_SEG_USER32_DS 2 +#define CKPT_X86_SEG_USER64_CS 3 +#define CKPT_X86_SEG_USER64_DS 4 +#define CKPT_X86_SEG_TLS 0x4000 +#define CKPT_X86_SEG_LDT 0x8000 + +struct binfmt_mm_image { + __u64 flags; + __u64 def_flags; + __u64 start_code; + __u64 end_code; + __u64 start_data; + __u64 end_data; + __u64 start_brk; + __u64 brk; + __u64 start_stack; + __u64 arg_start; + __u64 arg_end; + __u64 env_start; + __u64 env_end; + __u32 exe_fd; +} __packed; + +struct binfmt_vma_image { + __u32 prot; + __u32 flags; + __u32 pad; + __u32 fd; + __u64 start; + __u64 end; + __u64 pgoff; +} __packed; + +struct binfmt_page_image { + __u64 vaddr; +} __packed; + +#define BINFMT_IMG_MAGIC 0xa75b8d43 +#define BINFMT_IMG_VERS_0 0x00000100 + +#endif diff --git a/xemul/cr-dump.c b/xemul/cr-dump.c new file mode 100644 index 000000000..01154e9f8 --- /dev/null +++ b/xemul/cr-dump.c @@ -0,0 +1,781 @@ +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <dirent.h> +#include <string.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <errno.h> +#include <linux/kdev_t.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/vfs.h> + +#include <linux/types.h> +#include 
"img_structs.h" + +static int fdinfo_img; +static int pages_img; +static int core_img; +static int shmem_img; +static int pipes_img; + +#define PIPEFS_MAGIC 0x50495045 + +static int prep_img_files(int pid) +{ + __u32 type; + char name[64]; + + sprintf(name, "fdinfo-%d.img", pid); + fdinfo_img = open(name, O_WRONLY | O_CREAT | O_EXCL, 0600); + if (fdinfo_img < 0) { + perror("Can't open fdinfo"); + return 1; + } + + type = FDINFO_MAGIC; + write(fdinfo_img, &type, 4); + + sprintf(name, "pages-%d.img", pid); + pages_img = open(name, O_WRONLY | O_CREAT | O_EXCL, 0600); + if (pages_img < 0) { + perror("Can't open shmem"); + return 1; + } + + type = PAGES_MAGIC; + write(pages_img, &type, 4); + + sprintf(name, "core-%d.img", pid); + core_img = open(name, O_WRONLY | O_CREAT | O_EXCL, 0600); + if (core_img < 0) { + perror("Can't open core"); + return 1; + } + + sprintf(name, "shmem-%d.img", pid); + shmem_img = open(name, O_WRONLY | O_CREAT | O_EXCL, 0600); + if (shmem_img < 0) { + perror("Can't open shmem"); + return 1; + } + + type = SHMEM_MAGIC; + write(shmem_img, &type, 4); + + sprintf(name, "pipes-%d.img", pid); + pipes_img = open(name, O_WRONLY | O_CREAT | O_EXCL, 0600); + if (pipes_img < 0) { + perror("Can't open pipes"); + return 1; + } + + type = PIPES_MAGIC; + write(pipes_img, &type, 4); + + return 0; +} + +static void kill_imgfiles(int pid) +{ + /* FIXME */ +} + +static int stop_task(int pid) +{ + return kill(pid, SIGSTOP); +} + +static void continue_task(int pid) +{ + if (kill(pid, SIGCONT)) + perror("Can't cont task"); +} + +static char big_tmp_str[PATH_MAX]; + +static int read_fd_params(int pid, char *fd, unsigned long *pos, unsigned int *flags) +{ + char fd_str[128]; + int ifd; + + sprintf(fd_str, "/proc/%d/fdinfo/%s", pid, fd); + + printf("\tGetting fdinfo for fd %s\n", fd); + ifd = open(fd_str, O_RDONLY); + if (ifd < 0) { + perror("Can't open fdinfo"); + return 1; + } + + read(ifd, big_tmp_str, sizeof(big_tmp_str)); + close(ifd); + + sscanf(big_tmp_str, 
"pos:\t%lli\nflags:\t%o\n", pos, flags); + return 0; +} + +static int dump_one_reg_file(int type, unsigned long fd_name, int lfd, + int lclose, unsigned long pos, unsigned int flags) +{ + char fd_str[128]; + int len; + struct fdinfo_entry e; + + sprintf(fd_str, "/proc/self/fd/%d", lfd); + len = readlink(fd_str, big_tmp_str, sizeof(big_tmp_str) - 1); + if (len < 0) { + perror("Can't readlink fd"); + return 1; + } + + big_tmp_str[len] = '\0'; + printf("\tDumping path for %x fd via self %d [%s]\n", fd_name, lfd, big_tmp_str); + + if (lclose) + close(lfd); + + e.type = type; + e.addr = fd_name; + e.len = len; + e.pos = pos; + e.flags = flags; + + write(fdinfo_img, &e, sizeof(e)); + write(fdinfo_img, big_tmp_str, len); + + return 0; +} + +#define MAX_PIPE_BUF_SIZE 1024 /* FIXME - this is not so */ +#define SPLICE_F_NONBLOCK 0x2 + +static int dump_pipe_and_data(int lfd, struct pipes_entry *e) +{ + int steal_pipe[2]; + int ret; + + printf("\tDumping data from pipe %x\n", e->pipeid); + if (pipe(steal_pipe) < 0) { + perror("Can't create pipe for stealing data"); + return 1; + } + + ret = tee(lfd, steal_pipe[1], MAX_PIPE_BUF_SIZE, SPLICE_F_NONBLOCK); + if (ret < 0) { + if (errno != EAGAIN) { + perror("Can't pick pipe data"); + return 1; + } + + ret = 0; + } + + e->bytes = ret; + write(pipes_img, e, sizeof(*e)); + + if (ret) { + ret = splice(steal_pipe[0], NULL, pipes_img, NULL, ret, 0); + if (ret < 0) { + perror("Can't push pipe data"); + return 1; + } + } + + close(steal_pipe[0]); + close(steal_pipe[1]); + return 0; +} + +static int dump_one_pipe(int fd, int lfd, unsigned int id, unsigned int flags) +{ + struct pipes_entry e; + + printf("\tDumping pipe %d/%x flags %x\n", fd, id, flags); + + e.fd = fd; + e.pipeid = id; + e.flags = flags; + + if (flags & O_WRONLY) { + e.bytes = 0; + write(pipes_img, &e, sizeof(e)); + return 0; + } + + return dump_pipe_and_data(lfd, &e); +} + +static int dump_one_fd(int dir, char *fd_name, unsigned long pos, unsigned int flags) +{ + int fd; + 
struct stat st_buf; + struct statfs stfs_buf; + + printf("\tDumping fd %s\n", fd_name); + fd = openat(dir, fd_name, O_RDONLY); + if (fd == -1) { + printf("Tried to openat %d/%d %s\n", getpid(), dir, fd_name); + perror("Can't open fd"); + return 1; + } + + if (fstat(fd, &st_buf) < 0) { + perror("Can't stat one"); + return 1; + } + + if (S_ISREG(st_buf.st_mode)) + return dump_one_reg_file(FDINFO_FD, atoi(fd_name), fd, 1, pos, flags); + + if (S_ISFIFO(st_buf.st_mode)) { + if (fstatfs(fd, &stfs_buf) < 0) { + perror("Can't statfs one"); + return 1; + } + + if (stfs_buf.f_type == PIPEFS_MAGIC) + return dump_one_pipe(atoi(fd_name), fd, st_buf.st_ino, flags); + } + + if (!strcmp(fd_name, "0")) { + printf("\tSkipping stdin\n"); + return 0; + } + + if (!strcmp(fd_name, "1")) { + printf("\tSkipping stdout\n"); + return 0; + } + + if (!strcmp(fd_name, "2")) { + printf("\tSkipping stderr\n"); + return 0; + } + + if (!strcmp(fd_name, "3")) { + printf("\tSkipping tty\n"); + return 0; + } + + fprintf(stderr, "Can't dump file %s of that type [%x]\n", fd_name, st_buf.st_mode); + return 1; + +} + +static int dump_task_files(int pid) +{ + char pid_fd_dir[64]; + DIR *fd_dir; + struct dirent *de; + unsigned long pos; + unsigned int flags; + + printf("Dumping open files for %d\n", pid); + + sprintf(pid_fd_dir, "/proc/%d/fd", pid); + fd_dir = opendir(pid_fd_dir); + if (fd_dir == NULL) { + perror("Can't open fd dir"); + return -1; + } + + while ((de = readdir(fd_dir)) != NULL) { + if (de->d_name[0] == '.') + continue; + + if (read_fd_params(pid, de->d_name, &pos, &flags)) + return 1; + + if (dump_one_fd(dirfd(fd_dir), de->d_name, pos, flags)) + return 1; + } + + closedir(fd_dir); + return 0; +} + +#define PAGE_SIZE 4096 +#define PAGE_RSS 0x1 + +static unsigned long rawhex(char *str, char **end) +{ + unsigned long ret = 0; + + while (1) { + if (str[0] >= '0' && str[0] <= '9') { + ret <<= 4; + ret += str[0] - '0'; + } else if (str[0] >= 'a' && str[0] <= 'f') { + ret <<= 4; + ret += str[0] - 
'a' + 0xA; + } else if (str[0] >= 'A' && str[0] <= 'F') { + ret <<= 4; + ret += str[0] - 'A' + 0xA; + } else { + if (end) + *end = str; + return ret; + } + + str++; + } +} + +static void map_desc_parm(char *desc, unsigned long *pgoff, unsigned long *len) +{ + char *s; + unsigned long start, end; + + start = rawhex(desc, &s); + if (*s != '-') { + goto bug; + } + + end = rawhex(s + 1, &s); + if (*s != ' ') { + goto bug; + } + + s = strchr(s + 1, ' '); + *pgoff = rawhex(s + 1, &s); + if (*s != ' ') { + goto bug; + } + + if (start > end) + goto bug; + + *len = end - start; + + if (*len % PAGE_SIZE) { + goto bug; + } + if (*pgoff % PAGE_SIZE) { + goto bug; + } + + return; +bug: + fprintf(stderr, "BUG\n"); + exit(1); +} + +static int dump_map_pages(int lfd, unsigned long start, unsigned long pgoff, unsigned long len) +{ + unsigned int nrpages, pfn; + void *mem; + unsigned char *mc; + + printf("\t\tDumping pages start %x len %x off %x\n", start, len, pgoff); + mem = mmap(NULL, len, PROT_READ, MAP_FILE | MAP_PRIVATE, lfd, pgoff); + if (mem == MAP_FAILED) { + perror("Can't map"); + return 1; + } + + nrpages = len / PAGE_SIZE; + mc = malloc(nrpages); + if (mincore(mem, len, mc)) { + perror("Can't mincore mapping"); + return 1; + } + + for (pfn = 0; pfn < nrpages; pfn++) + if (mc[pfn] & PAGE_RSS) { + __u64 vaddr; + + vaddr = start + pfn * PAGE_SIZE; + write(pages_img, &vaddr, 8); + write(pages_img, mem + pfn * PAGE_SIZE, PAGE_SIZE); + } + + munmap(mem, len); + + return 0; +} + +static int dump_anon_private_map(char *start) +{ + printf("\tSkipping anon private mapping at %s\n", start); + return 0; +} + +static int dump_anon_shared_map(char *_start, char *mdesc, int lfd, struct stat *st) +{ + unsigned long pgoff, len; + struct shmem_entry e; + unsigned long start; + struct stat buf; + + map_desc_parm(mdesc, &pgoff, &len); + + start = rawhex(_start, NULL); + e.start = start; + e.end = start + len; + e.shmid = st->st_ino; + + write(shmem_img, &e, sizeof(e)); + + if 
(dump_map_pages(lfd, start, pgoff, len)) + return 1; + + close(lfd); + return 0; +} + +static int dump_file_shared_map(char *start, char *mdesc, int lfd) +{ + printf("\tSkipping file shared mapping at %s\n", start); + close(lfd); + return 0; +} + +static int dump_file_private_map(char *_start, char *mdesc, int lfd) +{ + unsigned long pgoff, len; + unsigned long start; + + map_desc_parm(mdesc, &pgoff, &len); + + start = rawhex(_start, NULL); + if (dump_one_reg_file(FDINFO_MAP, start, lfd, 0, 0, O_RDONLY)) + return 1; + + close(lfd); + return 0; +} + +static int dump_one_mapping(char *mdesc, DIR *mfd_dir) +{ + char *flags, *tmp; + char map_start[32]; + int lfd; + struct stat st_buf; + + tmp = strchr(mdesc, '-'); + memset(map_start, 0, sizeof(map_start)); + strncpy(map_start, mdesc, tmp - mdesc); + flags = strchr(mdesc, ' '); + flags++; + + printf("\tDumping %s\n", map_start); + lfd = openat(dirfd(mfd_dir), map_start, O_RDONLY); + if (lfd == -1) { + if (errno != ENOENT) { + perror("Can't open mapping"); + return 1; + } + + if (flags[3] != 'p') { + fprintf(stderr, "Bogus mapping [%s]\n", mdesc); + return 1; + } + + return dump_anon_private_map(map_start); + } + + if (fstat(lfd, &st_buf) < 0) { + perror("Can't stat mapping!"); + return 1; + } + + if (!S_ISREG(st_buf.st_mode)) { + perror("Can't handle non-regular mapping"); + return 1; + } + + if (MAJOR(st_buf.st_dev) == 0) { + if (flags[3] != 's') { + fprintf(stderr, "Bogus mapping [%s]\n", mdesc); + return 1; + } + + /* FIXME - this can be tmpfs visible file mapping */ + return dump_anon_shared_map(map_start, mdesc, lfd, &st_buf); + } + + if (flags[3] == 'p') + return dump_file_private_map(map_start, mdesc, lfd); + else + return dump_file_shared_map(map_start, mdesc, lfd); +} + +static int dump_task_ext_mm(int pid) +{ + char path[64]; + DIR *mfd_dir; + FILE *maps; + + printf("Dumping mappings for %d\n", pid); + + sprintf(path, "/proc/%d/mfd", pid); + mfd_dir = opendir(path); + if (mfd_dir == NULL) { + perror("Can't 
open mfd dir"); + return -1; + } + + sprintf(path, "/proc/%d/maps", pid); + maps = fopen(path, "r"); + if (maps == NULL) { + perror("Can't open maps file"); + return 1; + } + + while (fgets(big_tmp_str, sizeof(big_tmp_str), maps) != NULL) + if (dump_one_mapping(big_tmp_str, mfd_dir)) + return 1; + + fclose(maps); + closedir(mfd_dir); + return 0; +} + +static int dump_task_state(int pid) +{ + char path[64]; + int dump_fd; + void *mem; + + printf("Dumping task image for %d\n", pid); + sprintf(path, "/proc/%d/kstate_dump", pid); + dump_fd = open(path, O_RDONLY); + if (dump_fd < 0) { + perror("Can't open dump file"); + return 1; + } + + mem = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0); + if (mem == MAP_FAILED) { + perror("Can't get mem"); + return 1; + } + + while (1) { + int r, w; + + r = read(dump_fd, mem, 4096); + if (r == 0) + break; + if (r < 0) { + perror("Can't read dump file"); + return 1; + } + + w = 0; + while (w < r) { + int ret; + + ret = write(core_img, mem + w, r - w); + if (ret <= 0) { + perror("Can't write core"); + return 1; + } + + w += ret; + } + } + + munmap(mem, 4096); + close(dump_fd); + + return 0; +} + +static int dump_one_task(int pid, int stop) +{ + printf("Dumping task %d\n", pid); + + if (prep_img_files(pid)) + return 1; + + if (stop && stop_task(pid)) + goto err_task; + + if (dump_task_files(pid)) + goto err; + + if (dump_task_ext_mm(pid)) + goto err; + + if (dump_task_state(pid)) + goto err; + + if (stop) + continue_task(pid); + + printf("Dump is complete\n"); + return 0; + +err: + if (stop) + continue_task(pid); +err_task: + kill_imgfiles(pid); + return 1; +} + +static int pstree_fd; +static char big_tmp_str[4096]; +static int *pids, nr_pids; + +static char *get_children_pids(int pid) +{ + FILE *f; + int len; + char *ret, *tmp; + + sprintf(big_tmp_str, "/proc/%d/status", pid); + f = fopen(big_tmp_str, "r"); + if (f == NULL) + return NULL; + + while ((fgets(big_tmp_str, sizeof(big_tmp_str), f)) != NULL) { + if 
(strncmp(big_tmp_str, "Children:", 9)) + continue; + + tmp = big_tmp_str + 10; + len = strlen(tmp); + ret = malloc(len + 1); + strcpy(ret, tmp); + if (len) + ret[len - 1] = ' '; + + fclose(f); + return ret; + } + + fclose(f); + return NULL; +} + +static int dump_pid_and_children(int pid) +{ + struct pstree_entry e; + char *chlist, *tmp, *tmp2; + + printf("\tReading %d children list\n", pid); + chlist = get_children_pids(pid); + if (chlist == NULL) + return 1; + + printf("\t%d has children %s\n", pid, chlist); + + e.pid = pid; + e.nr_children = 0; + + pids = realloc(pids, (nr_pids + 1) * sizeof(int)); + pids[nr_pids++] = e.pid; + + tmp = chlist; + while ((tmp = strchr(tmp, ' ')) != NULL) { + tmp++; + e.nr_children++; + } + + write(pstree_fd, &e, sizeof(e)); + tmp = chlist; + while (1) { + __u32 cpid; + + cpid = strtol(tmp, &tmp, 10); + if (cpid == 0) + break; + if (*tmp != ' ') { + fprintf(stderr, "Error in string with children!\n"); + return 1; + } + + write(pstree_fd, &cpid, sizeof(cpid)); + tmp++; + } + + tmp = chlist; + while ((tmp2 = strchr(tmp, ' ')) != NULL) { + *tmp2 = '\0'; + if (dump_pid_and_children(atoi(tmp))) + return 1; + tmp = tmp2 + 1; + } + + free(chlist); + return 0; +} + +static int __dump_all_tasks(void) +{ + int i, pid; + + printf("Dumping tasks' images for"); + for (i = 0; i < nr_pids; i++) + printf(" %d", pids[i]); + printf("\n"); + + printf("Stopping tasks\n"); + for (i = 0; i < nr_pids; i++) + if (stop_task(pids[i])) + goto err; + + for (i = 0; i < nr_pids; i++) { + if (dump_one_task(pids[i], 0)) + goto err; + } + + printf("Resuming tasks\n"); + for (i = 0; i < nr_pids; i++) + continue_task(pids[i]); + + return 0; + +err: + for (i = 0; i < nr_pids; i++) + continue_task(pids[i]); + return 1; + +} + +static int dump_all_tasks(int pid) +{ + char *chlist; + __u32 type; + + pids = NULL; + nr_pids = 0; + + printf("Dumping process tree, start from %d\n", pid); + + sprintf(big_tmp_str, "pstree-%d.img", pid); + pstree_fd = open(big_tmp_str, O_WRONLY 
| O_CREAT | O_EXCL, 0600); + if (pstree_fd < 0) { + perror("Can't create pstree"); + return 1; + } + + type = PSTREE_MAGIC; + write(pstree_fd, &type, sizeof(type)); + + if (dump_pid_and_children(pid)) + return 1; + + close(pstree_fd); + + return __dump_all_tasks(); +} + +int main(int argc, char **argv) +{ + if (argc != 3) + goto usage; + if (argv[1][0] != '-') + goto usage; + if (argv[1][1] == 'p') + return dump_one_task(atoi(argv[2]), 1); + if (argv[1][1] == 't') + return dump_all_tasks(atoi(argv[2])); + +usage: + printf("Usage: %s (-p|-t) <pid>\n", argv[0]); + return 1; +} diff --git a/xemul/cr-restore.c b/xemul/cr-restore.c new file mode 100644 index 000000000..d8cedb01f --- /dev/null +++ b/xemul/cr-restore.c @@ -0,0 +1,1115 @@ +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <dirent.h> +#include <string.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <errno.h> +#include <linux/kdev_t.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/sendfile.h> + +#define PAGE_SIZE 4096 + +#include <linux/types.h> +#include "img_structs.h" +#include "binfmt_img.h" + +struct fmap_fd { + unsigned long start; + int fd; + struct fmap_fd *next; +}; + +static struct fmap_fd *fmap_fds; + +struct shmem_info { + unsigned long start; + unsigned long end; + unsigned long id; + int pid; + int real_pid; +}; + +static struct shmem_info *shmems; +static int nr_shmems; + +struct pipes_info { + unsigned int id; + int pid; + int real_pid; + int read_fd; + int write_fd; + int users; +}; + +static struct pipes_info *pipes; +static int nr_pipes; + +static int restore_task_with_children(int my_pid, char *pstree_path); + +#define CLONE_CHILD_USEPID 0x02000000 + +static void show_saved_shmems(void) +{ + int i; + + printf("\tSaved shmems:\n"); + for (i = 0; i < nr_shmems; i++) + printf("\t\t%016lx %lx %d\n", shmems[i].start, shmems[i].id, shmems[i].pid); +} + +static void show_saved_pipes(void) +{ + int i; + + printf("\tSaved pipes:\n"); + for (i = 0; i < 
nr_pipes; i++) + printf("\t\t%x -> %d\n", pipes[i].id, pipes[i].pid); +} + +static struct shmem_info *search_shmem(unsigned long addr, unsigned long id) +{ + int i; + + for (i = 0; i < nr_shmems; i++) { + struct shmem_info *si; + + si = shmems + i; + if (si->start <= addr && si->end >= addr && si->id == id) + return si; + } + + return NULL; +} + +static struct pipes_info *search_pipes(unsigned int pipeid) +{ + int i; + + for (i = 0; i < nr_pipes; i++) { + struct pipes_info *pi; + + pi = pipes + i; + if (pi->id == pipeid) + return pi; + } + + return NULL; +} + +static void shmem_update_real_pid(int vpid, int rpid) +{ + int i; + + for (i = 0; i < nr_shmems; i++) + if (shmems[i].pid == vpid) + shmems[i].real_pid = rpid; +} + +static int shmem_wait_and_open(struct shmem_info *si) +{ + /* FIXME - not good */ + char path[128]; + unsigned long time = 1000; + + sleep(1); + + while (si->real_pid == 0) + usleep(time); + + sprintf(path, "/proc/%d/mfd/0x%lx", si->real_pid, si->start); + while (1) { + int ret; + + ret = open(path, O_RDWR); + if (ret > 0) + return ret; + + if (ret < 0 && errno != ENOENT) { + perror(" Can't stat shmem"); + return -1; + } + + printf("Waiting for [%s] to appear\n", path); + if (time < 20000000) + time <<= 1; + usleep(time); + } +} + +static int try_to_add_shmem(int pid, struct shmem_entry *e) +{ + int i; + + for (i = 0; i < nr_shmems; i++) { + if (shmems[i].start != e->start || shmems[i].id != e->shmid) + continue; + + if (shmems[i].end != e->end) { + printf("Bogus shmem\n"); + return 1; + } + + if (shmems[i].pid > pid) + shmems[i].pid = pid; + + return 0; + } + + if ((nr_shmems + 1) * sizeof(struct shmem_info) >= 4096) { + printf("OOM storing shmems\n"); + return 1; + } + + shmems[nr_shmems].start = e->start; + shmems[nr_shmems].end = e->end; + shmems[nr_shmems].id = e->shmid; + shmems[nr_shmems].pid = pid; + shmems[nr_shmems].real_pid = 0; + nr_shmems++; + + return 0; +} + +static int try_to_add_pipe(int pid, struct pipes_entry *e, int p_fd) +{ + 
int i; + + for (i = 0; i < nr_pipes; i++) { + if (pipes[i].id != e->pipeid) + continue; + + if (pipes[i].pid > pid) + pipes[i].pid = pid; + pipes[i].users++; + + return 0; + } + + if ((nr_pipes + 1) * sizeof(struct pipes_info) >= 4096) { + printf("OOM storing pipes\n"); + return 1; + } + + pipes[nr_pipes].id = e->pipeid; + pipes[nr_pipes].pid = pid; + pipes[nr_pipes].real_pid = 0; + pipes[nr_pipes].read_fd = 0; + pipes[nr_pipes].write_fd = 0; + pipes[nr_pipes].users = 1; + nr_pipes++; + + return 0; +} + +static int prepare_shmem_pid(int pid) +{ + char path[64]; + int sh_fd; + __u32 type = 0; + + sprintf(path, "shmem-%d.img", pid); + sh_fd = open(path, O_RDONLY); + if (sh_fd < 0) { + perror("Can't open shmem info"); + return 1; + } + + read(sh_fd, &type, sizeof(type)); + if (type != SHMEM_MAGIC) { + perror("Bad shmem magic"); + return 1; + } + + while (1) { + struct shmem_entry e; + int ret; + + ret = read(sh_fd, &e, sizeof(e)); + if (ret == 0) + break; + if (ret != sizeof(e)) { + perror("Can't read shmem entry"); + return 1; + } + + if (try_to_add_shmem(pid, &e)) + return 1; + } + + close(sh_fd); + return 0; +} + +static int prepare_pipes_pid(int pid) +{ + char path[64]; + int p_fd; + __u32 type = 0; + + sprintf(path, "pipes-%d.img", pid); + p_fd = open(path, O_RDONLY); + if (p_fd < 0) { + perror("Can't open pipes image"); + return 1; + } + + read(p_fd, &type, sizeof(type)); + if (type != PIPES_MAGIC) { + perror("Bad pipes magin"); + return 1; + } + + while (1) { + struct pipes_entry e; + int ret; + + ret = read(p_fd, &e, sizeof(e)); + if (ret == 0) + break; + if (ret != sizeof(e)) { + fprintf(stderr, "Read pipes for %s failed %d of %d read\n", + path, ret, sizeof(e)); + perror("Can't read pipes entry"); + return 1; + } + + if (try_to_add_pipe(pid, &e, p_fd)) + return 1; + + lseek(p_fd, e.bytes, SEEK_CUR); + } + + close(p_fd); + return 0; +} + +static int prepare_shared(int ps_fd) +{ + printf("Preparing info about shared resources\n"); + + nr_shmems = 0; + shmems = 
mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, 0, 0); + if (shmems == MAP_FAILED) { + perror("Can't map shmems"); + return 1; + } + + pipes = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, 0, 0); + if (pipes == MAP_FAILED) { + perror("Can't map pipes"); + return 1; + } + + while (1) { + struct pstree_entry e; + int ret; + + ret = read(ps_fd, &e, sizeof(e)); + if (ret == 0) + break; + + if (ret != sizeof(e)) { + perror("Can't read ps"); + return 1; + } + + if (prepare_shmem_pid(e.pid)) + return 1; + + if (prepare_pipes_pid(e.pid)) + return 1; + + lseek(ps_fd, e.nr_children * sizeof(__u32), SEEK_CUR); + } + + lseek(ps_fd, sizeof(__u32), SEEK_SET); + + show_saved_shmems(); + show_saved_pipes(); + + return 0; +} + +static struct fmap_fd *pop_fmap_fd(unsigned long start) +{ + struct fmap_fd **p, *r; + + printf("Looking for %lx : ", start); + + for (p = &fmap_fds; *p != NULL; p = &(*p)->next) { + if ((*p)->start != start) + continue; + + r = *p; + *p = r->next; + printf("found\n"); + return r; + } + + printf("not found\n"); + return NULL; +} + +static int open_fe_fd(struct fdinfo_entry *fe, int fd) +{ + char path[PATH_MAX]; + int tmp; + + if (read(fd, path, fe->len) != fe->len) { + fprintf(stderr, "Error reading path"); + return -1; + } + + path[fe->len] = '\0'; + + tmp = open(path, fe->flags); + if (tmp < 0) { + perror("Can't open file"); + return -1; + } + + lseek(tmp, fe->pos, SEEK_SET); + + return tmp; +} + +static int reopen_fd(int old_fd, int new_fd) +{ + int tmp; + + if (old_fd != new_fd) { + tmp = dup2(old_fd, new_fd); + if (tmp < 0) + return tmp; + + close(old_fd); + } + + return new_fd; +} + +static int open_fd(int pid, struct fdinfo_entry *fe, int *cfd) +{ + int fd, tmp; + + if (*cfd == (int)fe->addr) { + tmp = dup(*cfd); + if (tmp < 0) { + perror("Can't dup file"); + return 1; + } + + printf("%s: Dup for %d\n", __func__, tmp); + + *cfd = tmp; + } + + tmp = open_fe_fd(fe, *cfd); + if (tmp < 0) + return 1; + + fd = 
reopen_fd(tmp, (int)fe->addr); + if (fd < 0) { + perror("Can't dup"); + return 1; + } + + return 0; +} + +static int open_fmap(int pid, struct fdinfo_entry *fe, int fd) +{ + int tmp; + struct fmap_fd *new; + + tmp = open_fe_fd(fe, fd); + if (tmp < 0) + return 1; + + printf("%d:\t\tWill map %lx to %d\n", pid, (unsigned long)fe->addr, tmp); + new = malloc(sizeof(*new)); + new->start = fe->addr; + new->fd = tmp; + new->next = fmap_fds; + fmap_fds = new; + + return 0; +} + +static int prepare_fds(int pid) +{ + __u32 mag; + char path[64]; + int fdinfo_fd; + + printf("%d: Opening files\n", pid); + + sprintf(path, "fdinfo-%d.img", pid); + fdinfo_fd = open(path, O_RDONLY); + if (fdinfo_fd < 0) { + perror("Can't open fdinfo"); + return 1; + } + + read(fdinfo_fd, &mag, 4); + if (mag != FDINFO_MAGIC) { + fprintf(stderr, "Bad file\n"); + return 1; + } + + while (1) { + int ret; + struct fdinfo_entry fe; + + ret = read(fdinfo_fd, &fe, sizeof(fe)); + if (ret == 0) { + close(fdinfo_fd); + return 0; + } + + if (ret < 0) { + perror("Can't read file"); + return 1; + } + if (ret != sizeof(fe)) { + fprintf(stderr, "Error reading\n"); + return 1; + } + + printf("\t%d: Got fd for %lx type %d namelen %d\n", pid, + (unsigned long)fe.addr, fe.type, fe.len); + switch (fe.type) { + case FDINFO_FD: + if (open_fd(pid, &fe, &fdinfo_fd)) + return 1; + + break; + case FDINFO_MAP: + if (open_fmap(pid, &fe, fdinfo_fd)) + return 1; + + break; + default: + fprintf(stderr, "Some bullshit in a file\n"); + return 1; + } + } +} + +struct shmem_to_id { + unsigned long addr; + unsigned long end; + unsigned long id; + struct shmem_to_id *next; +}; + +static struct shmem_to_id *my_shmem_ids; + +static unsigned long find_shmem_id(unsigned long addr) +{ + struct shmem_to_id *si; + + for (si = my_shmem_ids; si != NULL; si = si->next) + if (si->addr <= addr && si->end >= addr) + return si->id; + + return 0; +} + +static void save_shmem_id(struct shmem_entry *e) +{ + struct shmem_to_id *si; + + si = 
malloc(sizeof(*si)); + si->addr = e->start; + si->end = e->end; + si->id = e->shmid; + si->next = my_shmem_ids; + my_shmem_ids = si; +} + +static int prepare_shmem(int pid) +{ + char path[64]; + int sh_fd; + __u32 type = 0; + + sprintf(path, "shmem-%d.img", pid); + sh_fd = open(path, O_RDONLY); + if (sh_fd < 0) { + perror("Can't open shmem info"); + return 1; + } + + read(sh_fd, &type, sizeof(type)); + if (type != SHMEM_MAGIC) { + perror("Bad shmem magic"); + return 1; + } + + while (1) { + struct shmem_entry e; + int ret; + + ret = read(sh_fd, &e, sizeof(e)); + if (ret == 0) + break; + if (ret != sizeof(e)) { + perror("Can't read shmem entry"); + return 1; + } + + save_shmem_id(&e); + } + + close(sh_fd); + return 0; +} + +static int try_fixup_file_map(int pid, struct binfmt_vma_image *vi, int fd) +{ + struct fmap_fd *fmfd; + + fmfd = pop_fmap_fd(vi->start); + if (fmfd != NULL) { + printf("%d: Fixing %lx vma to %d fd\n", pid, vi->start, fmfd->fd); + lseek(fd, -sizeof(*vi), SEEK_CUR); + printf("%d: Wrote %d\n", fmfd->fd); + vi->fd = fmfd->fd; + if (write(fd, vi, sizeof(*vi)) != sizeof(*vi)) { + perror("Can't write img"); + return 1; + } + free(fmfd); + } + + return 0; +} + +static int try_fixup_shared_map(int pid, struct binfmt_vma_image *vi, int fd) +{ + struct shmem_info *si; + unsigned long id; + + id = find_shmem_id(vi->start); + if (id == 0) + return 0; + + si = search_shmem(vi->start, id); + printf("%d: Search for %016lx shmem %p/%d\n", pid, vi->start, si, si ? 
si->pid : -1); + + if (si == NULL) { + fprintf(stderr, "Can't find my shmem %016lx\n", vi->start); + return 1; + } + + if (si->pid != pid) { + int sh_fd; + + sh_fd = shmem_wait_and_open(si); + printf("%d: Fixing %lx vma to %x/%d shmem -> %d\n", pid, vi->start, si->id, si->pid, sh_fd); + if (fd < 0) { + perror("Can't open shmem"); + return 1; + } + + lseek(fd, -sizeof(*vi), SEEK_CUR); + vi->fd = sh_fd; + if (write(fd, vi, sizeof(*vi)) != sizeof(*vi)) { + perror("Can't write img"); + return 1; + } + } + + return 0; +} + +static int fixup_vma_fds(int pid, int fd) +{ + int offset = + sizeof(struct binfmt_img_header) + + sizeof(struct binfmt_regs_image) + + sizeof(struct binfmt_mm_image); + + printf("Seek for: %li bytes\n", offset); + lseek(fd, offset, SEEK_SET); + + while (1) { + struct binfmt_vma_image vi; + + if (read(fd, &vi, sizeof(vi)) != sizeof(vi)) { + perror("Can't read"); + return 1; + } + + if (vi.start == 0 && vi.end == 0) + return 0; + + printf("%d: Fixing %016lx-%016lx %016lx vma\n", pid, vi.start, vi.end, vi.pgoff); + if (try_fixup_file_map(pid, &vi, fd)) + return 1; + + if (try_fixup_shared_map(pid, &vi, fd)) + return 1; + } +} + +static inline int should_restore_page(int pid, unsigned long vaddr) +{ + struct shmem_info *si; + unsigned long id; + + id = find_shmem_id(vaddr); + if (id == 0) + return 1; + + si = search_shmem(vaddr, id); + return si->pid == pid; +} + +static int fixup_pages_data(int pid, int fd) +{ + char path[128]; + int shfd; + __u32 mag; + __u64 vaddr; + + sprintf(path, "pages-%d.img", pid); + shfd = open(path, O_RDONLY); + if (shfd < 0) { + perror("Can't open shmem image"); + return 1; + } + + read(shfd, &mag, sizeof(mag)); + if (mag != PAGES_MAGIC) { + fprintf(stderr, "Bad shmem image\n"); + return 1; + } + + lseek(fd, -sizeof(struct binfmt_page_image), SEEK_END); + read(fd, &vaddr, sizeof(vaddr)); + if (vaddr != 0) { + printf("SHIT %lx\n", (unsigned long)vaddr); + return 1; + } + lseek(fd, -sizeof(struct binfmt_page_image), SEEK_END); 
+ + while (1) { + int ret; + + ret = read(shfd, &vaddr, sizeof(vaddr)); + if (ret == 0) + break; + + if (ret < 0 || ret != sizeof(vaddr)) { + perror("Can't read vaddr"); + return 1; + } + + if (vaddr == 0) + break; + + if (!should_restore_page(pid, vaddr)) { + lseek(shfd, PAGE_SIZE, SEEK_CUR); + continue; + } + +// printf("Copy page %lx to image\n", (unsigned long)vaddr); + write(fd, &vaddr, sizeof(vaddr)); + sendfile(fd, shfd, NULL, PAGE_SIZE); + } + + close(shfd); + vaddr = 0; + write(fd, &vaddr, sizeof(vaddr)); + return 0; +} + +static int prepare_image_maps(int fd, int pid) +{ + printf("%d: Fixing maps before executing image\n", pid); + + if (fixup_vma_fds(pid, fd)) + return 1; + + if (fixup_pages_data(pid, fd)) + return 1; + + close(fd); + return 0; +} + +static int execute_image(int pid) +{ + char path[128]; + int fd, fd_new; + struct stat buf; + + sprintf(path, "core-%d.img", pid); + fd = open(path, O_RDONLY); + if (fd < 0) { + perror("Can't open exec image"); + return 1; + } + + if (fstat(fd, &buf)) { + perror("Can't stat"); + return 1; + } + + sprintf(path, "core-%d.img.out", pid); + fd_new = open(path, O_RDWR | O_CREAT | O_EXCL, 0700); + if (fd_new < 0) { + perror("Can't open new image"); + return 1; + } + + printf("%d: Preparing execution image (%li bytes)\n", pid, buf.st_size); + sendfile(fd_new, fd, NULL, buf.st_size); + close(fd); + + if (fchmod(fd_new, 0700)) { + perror("Can't prepare exec image"); + return 1; + } + + if (prepare_image_maps(fd_new, pid)) + return 1; + + sync(); + + printf("%d/%d EXEC IMAGE\n", pid, getpid()); + return execl(path, path, NULL); +} + +static int create_pipe(int pid, struct pipes_entry *e, struct pipes_info *pi, int pipes_fd) +{ + int pfd[2], tmp; + unsigned long time = 1000; + + printf("\t%d: Creating pipe %x\n", pid, e->pipeid); + + if (pipe(pfd) < 0) { + perror("Can't create pipe"); + return 1; + } + + if (e->bytes) { + printf("\t%d: Splicing data to %d\n", pid, pfd[1]); + + tmp = splice(pipes_fd, NULL, pfd[1], NULL, 
e->bytes, 0); + if (tmp != e->bytes) { + fprintf(stderr, "Wanted to restore %ld bytes, but got %ld\n", + e->bytes, tmp); + if (tmp < 0) + perror("Error splicing data"); + return 1; + } + } + + pi->read_fd = pfd[0]; + pi->write_fd = pfd[1]; + pi->real_pid = getpid(); + + printf("\t%d: Done, waiting for others on %d pid with r:%d w:%d\n", + pid, pi->real_pid, pfd[0], pfd[1]); + + while (1) { + if (pi->users == 1) /* only I left */ + break; + + printf("\t%d: Waiting for %x pipe to attach (%d users left)\n", + pid, e->pipeid, pi->users - 1); + if (time < 20000000) + time <<= 1; + usleep(time); + } + + printf("\t%d: All is ok - reopening pipe for %d\n", pid, e->fd); + if (e->flags & O_WRONLY) { + close(pfd[0]); + tmp = reopen_fd(pfd[1], e->fd); + } else { + close(pfd[1]); + tmp = reopen_fd(pfd[0], e->fd); + } + + if (tmp < 0) { + perror("Can't dup pipe fd"); + return 1; + } + + return 0; +} + +static int attach_pipe(int pid, struct pipes_entry *e, struct pipes_info *pi) +{ + char path[128]; + int tmp, fd; + + printf("\t%d: Wating for pipe %x to appear\n", pid, e->pipeid); + + while (pi->real_pid == 0) + usleep(1000); + + if (e->flags & O_WRONLY) + tmp = pi->write_fd; + else + tmp = pi->read_fd; + + sprintf(path, "/proc/%d/fd/%d", pi->real_pid, tmp); + printf("\t%d: Attaching pipe %s\n", pid, path); + + fd = open(path, e->flags); + if (fd < 0) { + perror("Can't attach pipe"); + return 1; + } + + printf("\t%d: Done, reopening for %d\n", pid, e->fd); + pi->users--; + tmp = reopen_fd(fd, e->fd); + if (tmp < 0) { + perror("Can't dup to attach pipe"); + return 1; + } + + return 0; + +} + +static int open_pipe(int pid, struct pipes_entry *e, int *pipes_fd) +{ + struct pipes_info *pi; + + printf("\t%d: Opening pipe %x on fd %d\n", pid, e->pipeid, e->fd); + if (e->fd == *pipes_fd) { + int tmp; + + tmp = dup(*pipes_fd); + if (tmp < 0) { + perror("Can't dup file"); + return 1; + } + + *pipes_fd = tmp; + } + + pi = search_pipes(e->pipeid); + if (pi == NULL) { + fprintf(stderr, 
"BUG: can't find my pipe %x\n", e->pipeid); + return 1; + } + + if (pi->pid == pid) + return create_pipe(pid, e, pi, *pipes_fd); + else + return attach_pipe(pid, e, pi); +} + +static int prepare_pipes(int pid) +{ + char path[64]; + int pipes_fd; + __u32 type = 0; + + printf("%d: Opening pipes\n", pid); + + sprintf(path, "pipes-%d.img", pid); + pipes_fd = open(path, O_RDONLY); + if (pipes_fd < 0) { + perror("Can't open pipes img"); + return 1; + } + + read(pipes_fd, &type, sizeof(type)); + if (type != PIPES_MAGIC) { + perror("Bad pipes file"); + return 1; + } + + while (1) { + struct pipes_entry e; + int ret; + + ret = read(pipes_fd, &e, sizeof(e)); + if (ret == 0) { + close(pipes_fd); + return 0; + } + if (ret != sizeof(e)) { + perror("Bad pipes entry"); + return 1; + } + + if (open_pipe(pid, &e, &pipes_fd)) + return 1; + } +} + +static int restore_one_task(int pid) +{ + printf("%d: Restoring resources\n", pid); + + if (prepare_pipes(pid)) + return 1; + + if (prepare_fds(pid)) + return 1; + + if (prepare_shmem(pid)) + return 1; + + return execute_image(pid); +} + +static int do_child(void *arg) +{ + return restore_task_with_children(getpid(), arg); +} + +static inline int fork_with_pid(int pid, char *pstree_path) +{ + int ret = 0; + void *stack; + + stack = mmap(0, 4 * 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0); + if (stack == MAP_FAILED) + return -1; + + stack += 4 * 4096; + ret = clone(do_child, stack, SIGCHLD | CLONE_CHILD_USEPID, pstree_path, NULL, NULL, &pid); + + return ret; +} + +static int restore_task_with_children(int my_pid, char *pstree_path) +{ + int *pids; + int fd, ret, i; + struct pstree_entry e; + + printf("%d: Starting restore\n", my_pid); + + fd = open(pstree_path, O_RDONLY); + if (fd < 0) { + perror("Can't reopen pstree image"); + exit(1); + } + + lseek(fd, sizeof(__u32), SEEK_SET); + while (1) { + ret = read(fd, &e, sizeof(e)); + if (ret != sizeof(e)) { + fprintf(stderr, "%d: Read returned %d\n", my_pid, ret); 
+ if (ret < 0) + perror("Can't read pstree"); + exit(1); + } + + if (e.pid != my_pid) { + lseek(fd, e.nr_children * sizeof(__u32), SEEK_CUR); + continue; + } + + break; + } + + if (e.nr_children > 0) { + i = e.nr_children * sizeof(int); + pids = malloc(i); + ret = read(fd, pids, i); + if (ret != i) { + perror("Can't read children pids"); + exit(1); + } + + close(fd); + + printf("%d: Restoring %d children:\n", my_pid, e.nr_children); + for (i = 0; i < e.nr_children; i++) { + printf("\tFork %d from %d\n", pids[i], my_pid); + ret = fork_with_pid(pids[i], pstree_path); + if (ret < 0) { + perror("Can't fork kid"); + exit(1); + } + } + } else + close(fd); + + shmem_update_real_pid(my_pid, getpid()); + + return restore_one_task(my_pid); +} + +static int restore_root_task(char *pstree_path, int fd) +{ + struct pstree_entry e; + int ret; + + ret = read(fd, &e, sizeof(e)); + if (ret != sizeof(e)) { + perror("Can't read root pstree entry"); + return 1; + } + + close(fd); + + printf("Forking root with %d pid\n", e.pid); + ret = fork_with_pid(e.pid, pstree_path); + if (ret < 0) { + perror("Can't fork root"); + return 1; + } + + wait(NULL); + return 0; +} + +static int restore_all_tasks(char *pid) +{ + char path[128]; + int pstree_fd; + __u32 type = 0; + + sprintf(path, "pstree-%s.img", pid); + pstree_fd = open(path, O_RDONLY); + if (pstree_fd < 0) { + perror("Can't open pstree image"); + return 1; + } + + read(pstree_fd, &type, sizeof(type)); + if (type != PSTREE_MAGIC) { + perror("Bad pstree magic"); + return 1; + } + + if (prepare_shared(pstree_fd)) + return 1; + + return restore_root_task(path, pstree_fd); +} + +int main(int argc, char **argv) +{ + if (argc != 3) + goto usage; + if (argv[1][0] != '-') + goto usage; + if (argv[1][1] == 'p') + return restore_one_task(atoi(argv[2])); + if (argv[1][1] == 't') + return restore_all_tasks(argv[2]); + +usage: + printf("Usage: %s (-t|-p) <pid>\n", argv[0]); + return 1; +} diff --git a/xemul/img-show.c b/xemul/img-show.c new file mode 
100644
index 000000000..4d1ad22f8
--- /dev/null
+++ b/xemul/img-show.c
@@ -0,0 +1,407 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <linux/types.h>
+#include <string.h>
+#include "img_structs.h"
+#include "binfmt_img.h"
+
+/*
+ * Print every fdinfo entry (an open file or a file mapping) of the image.
+ * Returns 0 at end of file, 1 on a short read or an unknown entry type.
+ */
+static int show_fdinfo(int fd)
+{
+	char data[1024];
+	struct fdinfo_entry e;
+
+	while (1) {
+		int ret;
+
+		ret = read(fd, &e, sizeof(e));
+		if (ret == 0)
+			break;
+		if (ret != sizeof(e)) {
+			perror("Can't read");
+			return 1;
+		}
+
+		ret = read(fd, data, e.len);
+		if (ret != e.len) {
+			perror("Can't read");
+			return 1;
+		}
+
+		/* e.len is a __u8, so the terminator always fits into data[] */
+		data[e.len] = '\0';
+		switch (e.type) {
+		case FDINFO_FD:
+			/* pos is a __u32: widen it to match the %lx conversion */
+			printf("fd %d [%s] pos %lx flags %o\n", (int)e.addr, data,
+			       (unsigned long)e.pos, e.flags);
+			break;
+		case FDINFO_MAP:
+			printf("map %llx [%s] flags %o\n", (unsigned long long)e.addr,
+			       data, e.flags);
+			break;
+		default:
+			fprintf(stderr, "Unknown fdinfo entry type %d\n", e.type);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+#define PAGE_SIZE 4096
+
+/*
+ * Walk (64-bit address, page contents) records and print the first and
+ * the last word of each page. End of file or a zero address ends the list.
+ * Returns 1 if a page is cut short.
+ */
+static int show_mem(int fd)
+{
+	__u64 vaddr;
+	unsigned int data[2];
+
+	while (1) {
+		if (read(fd, &vaddr, sizeof(vaddr)) != sizeof(vaddr))
+			break;
+		if (vaddr == 0)
+			break;
+
+		/* read the first word, skip the middle, read the last word */
+		if (read(fd, &data[0], sizeof(data[0])) != sizeof(data[0]) ||
+		    lseek(fd, PAGE_SIZE - 2 * sizeof(data[0]), SEEK_CUR) < 0 ||
+		    read(fd, &data[1], sizeof(data[1])) != sizeof(data[1])) {
+			fprintf(stderr, "Truncated page 0x%lx\n",
+				(unsigned long)vaddr);
+			return 1;
+		}
+
+		printf("\tpage 0x%lx [%x...%x]\n", (unsigned long)vaddr, data[0], data[1]);
+	}
+
+	return 0;
+}
+
+/* A pages image holds nothing but a page list */
+static int show_pages(int fd)
+{
+	return show_mem(fd);
+}
+
+/* Print the address range and the id of every shared memory entry */
+static int show_shmem(int fd)
+{
+	int r;
+	struct shmem_entry e;
+
+	while (1) {
+		r = read(fd, &e, sizeof(e));
+		if (r == 0)
+			return 0;
+		if (r != sizeof(e)) {
+			perror("Can't read shmem entry");
+			return 1;
+		}
+
+		/* shmid is a __u64 as well, %x would print only part of it */
+		printf("%016llx-%016llx %016llx\n", (unsigned long long)e.start,
+		       (unsigned long long)e.end, (unsigned long long)e.shmid);
+	}
+}
+
+/* Translate a saved segment selector code into a short name */
+static char *segval(__u16 seg)
+{
+	switch (seg) {
+	case CKPT_X86_SEG_NULL: return "nul";
+	case CKPT_X86_SEG_USER32_CS: return "cs32";
+	case CKPT_X86_SEG_USER32_DS: return "ds32";
+	case CKPT_X86_SEG_USER64_CS: return "cs64";
+	case CKPT_X86_SEG_USER64_DS: return "ds64";
+	}
+
+	if (seg & CKPT_X86_SEG_TLS)
+		return "tls";
+	if (seg & CKPT_X86_SEG_LDT)
+		return "ldt";
+
+	return "[unknown]";
+}
+
+/* Read the register block of a core image and print it; 1 on a short read */
+static int show_regs(int fd)
+{
+	struct binfmt_regs_image ri;
+
+	if (read(fd, &ri, sizeof(ri)) != sizeof(ri)) {
+		perror("Can't read registers from image");
+		return 1;
+	}
+
+	printf("Registers:\n");
+
+	printf("\tr15: %016lx\n", ri.r.r15);
+	printf("\tr14: %016lx\n", ri.r.r14);
+	printf("\tr13: %016lx\n", ri.r.r13);
+	printf("\tr12: %016lx\n", ri.r.r12);
+	printf("\tr11: %016lx\n", ri.r.r11);
+	printf("\tr10: %016lx\n", ri.r.r10);
+	printf("\tr9: %016lx\n", ri.r.r9);
+	printf("\tr8: %016lx\n", ri.r.r8);
+	printf("\tax: %016lx\n", ri.r.ax);
+	printf("\torig_ax: %016lx\n", ri.r.orig_ax);
+	printf("\tbx: %016lx\n", ri.r.bx);
+	printf("\tcx: %016lx\n", ri.r.cx);
+	printf("\tdx: %016lx\n", ri.r.dx);
+	printf("\tsi: %016lx\n", ri.r.si);
+	printf("\tdi: %016lx\n", ri.r.di);
+	printf("\tip: %016lx\n", ri.r.ip);
+	printf("\tflags: %016lx\n", ri.r.flags);
+	printf("\tbp: %016lx\n", ri.r.bp);
+	printf("\tsp: %016lx\n", ri.r.sp);
+	printf("\tgs: %016lx\n", ri.r.gs);
+	printf("\tfs: %016lx\n", ri.r.fs);
+	printf("\tgsindex: %s\n", segval(ri.r.gsindex));
+	printf("\tfsindex: %s\n", segval(ri.r.fsindex));
+	printf("\tcs: %s\n", segval(ri.r.cs));
+	printf("\tss: %s\n", segval(ri.r.ss));
+	printf("\tds: %s\n", segval(ri.r.ds));
+	printf("\tes: %s\n", segval(ri.r.es));
+
+	printf("\ttls0 %016lx\n", ri.r.tls[0]);
+	printf("\ttls1 %016lx\n", ri.r.tls[1]);
+	printf("\ttls2 %016lx\n", ri.r.tls[2]);
+
+	return 0;
+}
+
+/*
+ * Print the mm layout of a core image and pass the stack start back via
+ * @stack, so that the VMA listing can mark the stack area.
+ */
+static int show_mm(int fd, unsigned long *stack)
+{
+	struct binfmt_mm_image mi;
+
+	if (read(fd, &mi, sizeof(mi)) != sizeof(mi)) {
+		perror("Can't read mm from image");
+		return 1;
+	}
+
+	printf("MM:\n");
+	printf("\tflags: %016lx\n", mi.flags);
+	printf("\tdef_flags: %016lx\n", mi.def_flags);
+	printf("\tstart_code: %016lx\n", mi.start_code);
+	printf("\tend_code: %016lx\n", mi.end_code);
+	printf("\tstart_data: %016lx\n", mi.start_data);
+	printf("\tend_data: %016lx\n", mi.end_data);
+	printf("\tstart_brk: %016lx\n", mi.start_brk);
+	printf("\tbrk: %016lx\n", mi.brk);
+	printf("\tstart_stack: %016lx\n", mi.start_stack);
+	printf("\targ_start: %016lx\n", mi.arg_start);
+	printf("\targ_end: %016lx\n", mi.arg_end);
+	printf("\tenv_start: %016lx\n", mi.env_start);
+	printf("\tenv_end: %016lx\n", mi.env_end);
+
+	*stack = mi.start_stack;
+
+	return 0;
+}
+
+/* List the VMAs up to the all-zero terminator, marking the stack one */
+static int show_vmas(int fd, unsigned long stack)
+{
+	struct binfmt_vma_image vi;
+
+	printf("VMAs:\n");
+	while (1) {
+		char *note = "";
+
+		if (read(fd, &vi, sizeof(vi)) != sizeof(vi)) {
+			perror("Can't read vma from image");
+			return 1;
+		}
+
+		if (vi.start == 0 && vi.end == 0)
+			return 0;
+
+		if (vi.start <= stack && vi.end >= stack)
+			note = "[stack]";
+
+		printf("\t%016lx-%016lx file %d %016lx prot %x flags %x %s\n",
+				vi.start, vi.end, vi.fd, vi.pgoff,
+				vi.prot, vi.flags, note);
+	}
+}
+
+/* Private pages of a core image are stored as a page list too */
+static int show_privmem(int fd)
+{
+	printf("Pages:\n");
+	return show_mem(fd);
+}
+
+/* Show a core image: version word, pad, registers, mm, VMAs, pages */
+static int show_core(int fd)
+{
+	__u32 version = 0;
+	unsigned long stack;
+
+	read(fd, &version, 4);
+	if (version != BINFMT_IMG_VERS_0) {
+		printf("Unsupported version %d\n", version);
+		return 1;
+	}
+
+	/* the pad */
+	read(fd, &version, 4);
+
+	printf("Showing version 0\n");
+
+	if (show_regs(fd))
+		return 1;
+
+	if (show_mm(fd, &stack))
+		return 1;
+
+	if (show_vmas(fd, stack))
+		return 1;
+
+	if (show_privmem(fd))
+		return 1;
+
+	return 0;
+}
+
+/* Print every process of the tree followed by the pids of its children */
+static int show_pstree(int fd)
+{
+	int ret;
+	struct pstree_entry e;
+
+	while (1) {
+		int i;
+		__u32 *ch;
+
+		ret = read(fd, &e, sizeof(e));
+		if (ret == 0)
+			return 0;
+		if (ret != sizeof(e)) {
+			perror("Can't read processes entry");
+			return 1;
+		}
+
+		printf("%d:", e.pid);
+		i = e.nr_children * sizeof(__u32);
+		ch = malloc(i);
+		if (!ch && i) {
+			perror("Can't allocate children list");
+			return 1;
+		}
+
+		ret = read(fd, ch, i);
+		if (ret != i) {
+			perror("Can't read children list");
+			free(ch);
+			return 1;
+		}
+
+		for (i = 0; i < e.nr_children; i++)
+			printf(" %d", ch[i]);
+		printf("\n");
+
+		/* a new list is allocated for each entry, so drop this one */
+		free(ch);
+	}
+}
+
+/*
+ * Print every pipe entry. A read end is followed by up to 16 bytes of its
+ * buffered data; a write end that carries data is reported as bogus.
+ */
+static int show_pipes(int fd)
+{
+	struct pipes_entry e;
+	int ret;
+	char buf[17];
+
+	while (1) {
+		ret = read(fd, &e, sizeof(e));
+		if (ret == 0)
+			break;
+		if (ret != sizeof(e)) {
+			perror("Can't read pipe entry");
+			return 1;
+		}
+
+		/* pipeid is a __u32 like the other fields, hence %x */
+		printf("%d: %x %o %d ", e.fd, e.pipeid, e.flags, e.bytes);
+		if (e.flags & O_WRONLY) {
+			printf("\n");
+
+			if (e.bytes) {
+				printf("Bogus pipe\n");
+				return 1;
+			}
+
+			continue;
+		}
+
+		memset(buf, 0, sizeof(buf));
+		/* clamp while still unsigned: a huge count must not go negative */
+		ret = e.bytes > 16 ? 16 : e.bytes;
+
+		read(fd, buf, ret);
+		printf("\t[%s", buf);
+		if (ret < e.bytes)
+			printf("...");
+		printf("]\n");
+		lseek(fd, e.bytes - ret, SEEK_CUR);
+	}
+
+	return 0;
+}
+
+/* Read the magic of the image named by argv[1] and run the matching dumper */
+int main(int argc, char **argv)
+{
+	__u32 type = 0;
+	int fd;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s <image>\n", argv[0]);
+		return 1;
+	}
+
+	fd = open(argv[1], O_RDONLY);
+	if (fd < 0) {
+		perror("Can't open");
+		return 1;
+	}
+
+	if (read(fd, &type, sizeof(type)) != sizeof(type)) {
+		fprintf(stderr, "Can't read image magic\n");
+		return 1;
+	}
+
+	if (type == FDINFO_MAGIC)
+		return show_fdinfo(fd);
+	if (type == PAGES_MAGIC)
+		return show_pages(fd);
+	if (type == SHMEM_MAGIC)
+		return show_shmem(fd);
+	if (type == PSTREE_MAGIC)
+		return show_pstree(fd);
+	if (type == PIPES_MAGIC)
+		return show_pipes(fd);
+	if (type == BINFMT_IMG_MAGIC)
+		return show_core(fd);
+
+	printf("Unknown file type 0x%x\n", type);
+	return 1;
+}
diff --git a/xemul/img_structs.h b/xemul/img_structs.h
new file mode 100644
index 000000000..9e52d5da6
--- /dev/null
+++ b/xemul/img_structs.h
@@ -0,0 +1,39 @@
+
+#define FDINFO_MAGIC	0x01010101
+
+struct fdinfo_entry {
+	__u8	type;	/* FDINFO_FD or FDINFO_MAP */
+	__u8	len;	/* bytes of data that follow the entry */
+	__u16	flags;
+	__u32	pos;
+	__u64	addr;
+};
+
+#define FDINFO_FD	1
+#define FDINFO_MAP	2
+
+#define PAGES_MAGIC	0x20202020
+
+#define SHMEM_MAGIC	0x03300330
+
+struct shmem_entry {
+	__u64	start;
+	__u64	end;
+	__u64	shmid;
+};
+
+#define PSTREE_MAGIC	0x40044004
+
+struct pstree_entry {
+	__u32	pid;
+	__u32	nr_children;	/* number of __u32 pids after the entry */
+};
+
+#define PIPES_MAGIC	0x05055050
+
+struct pipes_entry {
+	__u32	fd;
+	__u32	pipeid;
+	__u32	flags;
+	__u32	bytes;	/* bytes of pipe data after a read-end entry */
+};
diff --git a/xemul/readme b/xemul/readme
new file mode 100644
index
000000000..7a7c1c3e1 --- /dev/null +++ b/xemul/readme @@ -0,0 +1,2 @@ +Previous version of C/R -- it uses an in-kernel dumper/restorer. +It is kept here for reference only and is not used by crtools itself. |