Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCyrill Gorcunov <gorcunov@gmail.com>2011-09-23 12:00:45 +0400
committerCyrill Gorcunov <gorcunov@gmail.com>2011-09-23 12:00:45 +0400
commit523de236244946a0de127dfc9954369963819ef7 (patch)
treeb6001e027216b31c278d2ab15ef72ce7d58c3c9a
Initial commitinit
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
-rw-r--r--.gitignore9
-rw-r--r--Makefile171
-rw-r--r--README15
-rw-r--r--cr-dump.c977
-rw-r--r--cr-restore.c1144
-rw-r--r--cr-show.c389
-rw-r--r--crtools.c280
-rw-r--r--elf.c213
-rw-r--r--gen-offsets.sh22
-rw-r--r--include/bitops.h54
-rw-r--r--include/compiler.h57
-rw-r--r--include/crtools.h105
-rw-r--r--include/elf.h507
-rw-r--r--include/image.h191
-rw-r--r--include/list.h286
-rw-r--r--include/parasite-syscall.h46
-rw-r--r--include/parasite.h68
-rw-r--r--include/rbtree.h79
-rw-r--r--include/syscall.h181
-rw-r--r--include/types.h132
-rw-r--r--include/util.h178
-rw-r--r--kernel/binfmt-elf-for-cr-4636
-rw-r--r--kernel/cr-clone-with-pid-support172
-rw-r--r--kernel/cr-proc-add-children46
-rw-r--r--kernel/cr-proc-map-files-21522
-rw-r--r--kernel/cr-statfs-callback-for-pipefs27
-rw-r--r--kernel/fs-add-do-close86
-rw-r--r--kernel/fs-proc-add-tls45
-rw-r--r--kernel/fs-proc-switch-to-dentry108
-rw-r--r--kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch28
-rw-r--r--kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch255
-rw-r--r--kernel/proc-force-dcache-drop-on-unauthorized-access.patch118
-rw-r--r--kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch26
-rw-r--r--kernel/readme5
-rw-r--r--kernel/series12
-rw-r--r--parasite-elf.lds.S19
-rw-r--r--parasite-syscall.c514
-rw-r--r--parasite.c339
-rw-r--r--parasite.lds.S19
-rw-r--r--rbtree.c322
-rw-r--r--testee-static.c112
-rw-r--r--testee-threads.c74
-rw-r--r--testee-unlinked.c92
-rw-r--r--testee.c231
-rw-r--r--util.c412
-rw-r--r--xemul/0003-Image-dumping-via-proc-file.patch562
-rw-r--r--xemul/0004-Images-execution-binfmt-handler.patch371
-rw-r--r--xemul/binfmt_img.h96
-rw-r--r--xemul/cr-dump.c781
-rw-r--r--xemul/cr-restore.c1115
-rw-r--r--xemul/img-show.c354
-rw-r--r--xemul/img_structs.h39
-rw-r--r--xemul/readme2
53 files changed, 12644 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..1a537e27d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+*.o
+*.d
+*.img
+*.bin
+*.elf
+*.out
+cscope*
+tags
+TAGS
diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..8972f07d7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,171 @@
+# Build verbosity.
+#   V empty/unset: $(E) prints a short progress tag and $(Q) hides the
+#                  real command line.
+#   V non-empty:   $(E) turns the tag into a silent shell comment and
+#                  $(Q) is empty, so make echoes the full command.
+ifneq ($(strip $(V)),)
+  E = @\#
+  Q =
+else
+  E = @echo
+  Q = @
+endif
+export E Q
+
+FIND := find
+CSCOPE := cscope
+TAGS := ctags
+RM := rm
+LD := ld
+HEXDUMP := hexdump
+CC := gcc
+ECHO := echo
+NM := nm
+AWK := awk
+SH := sh
+
+CFLAGS += -I./include
+CFLAGS += -O0 -ggdb3
+
+LIBS += -lrt
+
+# Machine type as reported by uname(1), with i386..i686 folded to i386.
+uname_M := $(shell uname -m | sed -e s/i.86/i386/)
+
+# Fallback ARCH (only assigned when ARCH is not already defined): the
+# machine type normalised to a generic architecture name.
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
+		-e s/arm.*/arm/ -e s/sa110/arm/ \
+		-e s/s390x/s390/ -e s/parisc64/parisc/ \
+		-e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
+		-e s/sh[234].*/sh/ )
+
+# Additional ARCH settings for x86: both 32 and 64 bit hosts are "x86",
+# the word size is passed to the sources through a CONFIG_ define.
+ifeq ($(uname_M),i386)
+  ARCH := x86
+  DEFINES += -DCONFIG_X86_32
+endif
+ifeq ($(uname_M),x86_64)
+  ARCH := x86
+  DEFINES += -DCONFIG_X86_64
+endif
+
+DEFINES += -D_FILE_OFFSET_BITS=64
+DEFINES += -D_GNU_SOURCE
+
+ifneq ($(WERROR),0)
+ WARNINGS += -Werror
+endif
+
+WARNINGS += -Wall -Wno-unused
+CFLAGS += $(WARNINGS) $(DEFINES)
+
+PROGRAM := crtools
+TESTEE := testee
+TESTEE-TH := testee-threads
+TESTEE-STATIC := testee-static
+
+# Default goal: the checkpoint/restore tool plus all test programs.
+# Declared phony so that a stray file named "all" cannot mask it.
+all: $(PROGRAM) $(TESTEE) $(TESTEE-TH) $(TESTEE-STATIC)
+.PHONY: all
+
+OBJS += crtools.o
+OBJS += parasite-syscall.o
+OBJS += cr-dump.o
+OBJS += cr-restore.o
+OBJS += cr-show.o
+OBJS += util.o
+OBJS += rbtree.o
+OBJS += elf.o
+
+OBJS-TESTEE += testee.o
+
+OBJS-TESTEE-TH += testee-threads.o
+
+OBJS-BLOB += parasite.o
+
+DEPS := $(patsubst %.o,%.d,$(OBJS))
+DEPS-TESTEE := $(patsubst %.o,%.d,$(OBJS-TESTEE))
+DEPS-TESTEE-TH := $(patsubst %.o,%.d,$(OBJS-TESTEE-TH))
+DEPS-BLOB := $(patsubst %.o,%.d,$(OBJS-BLOB))
+
+SRCS-BLOB += $(patsubst %.o,%.c,$(OBJS-BLOB))
+
+HEAD-BLOB := $(patsubst %.o,%.h,$(OBJS-BLOB))
+HEAD-BLOB-GEN := $(patsubst %.o,%-blob.h,$(OBJS-BLOB))
+HEAD-BIN := $(patsubst %.o,%.bin,$(OBJS-BLOB))
+HEAD-LDS := $(patsubst %.o,%.lds.S,$(OBJS-BLOB))
+
+HEAD-IDS := $(patsubst %.h,%_h__,$(subst -,_,$(HEAD-BLOB)))
+
+$(OBJS-BLOB): $(SRCS-BLOB) $(DEPS-BLOB)
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) -fpic $< -o $@
+
+$(HEAD-BIN): $(OBJS-BLOB) $(HEAD-LDS)
+%.bin: %.o
+ $(E) " GEN " $@
+ $(Q) $(LD) -T $(patsubst %.bin,%.lds.S,$@) $< -o $@
+ $(Q) $(LD) -T $(patsubst %.bin,%-elf.lds.S,$@) $< -o $@.o
+
+# Generated blob header (<name>-blob.h), produced by gen-offsets.sh from
+# the linked blob.
+#
+# The rule target is the file the recipe really writes. Previously the
+# target was <name>.h while the output was redirected to <name>-blob.h,
+# so the target never appeared on disk and the header (and every object
+# depending on it) was regenerated on each invocation of make.
+#
+# The identifiers handed to gen-offsets.sh are unchanged:
+# <name>_h__, <name>_blob_offset__ and <name>_blob (with '-' mapped to '_').
+$(HEAD-BLOB-GEN): $(DEPS-BLOB) $(HEAD-BIN)
+%-blob.h: %.bin
+	$(E) " GEN " $@
+	$(Q) $(SH) gen-offsets.sh \
+		$(subst -,_,$*)_h__ \
+		$(subst -,_,$*)_blob_offset__ \
+		$(subst -,_,$*)_blob \
+		$*.o \
+		$*.bin > $@
+
+# Objects of the main program need the generated blob header first.
+$(OBJS): $(HEAD-BLOB-GEN) $(DEPS)
+$(OBJS-TESTEE): $(DEPS-TESTEE)
+$(OBJS-TESTEE-TH): $(DEPS-TESTEE-TH)
+%.o: %.c
+	$(E) " CC " $@
+	$(Q) $(CC) -c $(CFLAGS) $< -o $@
+
+# Link rules. Each binary is linked from exactly the objects listed as
+# its prerequisites, so $^ names the same files as the object variable.
+
+# Main tool, linked against $(LIBS).
+$(PROGRAM): $(OBJS)
+	$(E) " LINK " $@
+	$(Q) $(CC) $^ $(LIBS) -o $@
+
+# Plain test program.
+$(TESTEE): $(OBJS-TESTEE)
+	$(E) " LINK " $@
+	$(Q) $(CC) $^ -o $@
+
+# Multi-threaded test program, needs libpthread.
+$(TESTEE-TH): $(OBJS-TESTEE-TH)
+	$(E) " LINK " $@
+	$(Q) $(CC) $^ -lpthread -o $@
+
+# Statically linked test program. It has explicit rules of its own
+# because it is compiled and linked with -static and without the common
+# CFLAGS. Uses $(CC) like every other rule, and the real ./include
+# directory (the former -I./.include pointed at a path that is not part
+# of the tree).
+$(TESTEE-STATIC).o: testee-static.c
+	$(E) " CC " $@
+	$(Q) $(CC) -c -static -I./include -o $@ $<
+
+$(TESTEE-STATIC): $(TESTEE-STATIC).o
+	$(E) " LINK " $@
+	$(Q) $(CC) -o $@ -static $<
+
+$(DEPS):
+$(DEPS-TESTEE):
+$(DEPS-TESTEE-TH):
+$(DEPS-BLOB):
+%.d: %.c
+ $(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@
+
+# Remove build products, dump images and the tags/cscope indexes.
+clean:
+	$(E) " CLEAN"
+	$(Q) $(RM) -f ./*.o ./*.d ./*.img ./*.elf ./*.out ./*.bin
+	$(Q) $(RM) -f ./tags ./cscope*
+	$(Q) $(RM) -f ./$(PROGRAM) ./$(TESTEE) ./$(TESTEE-STATIC) ./$(TESTEE-TH)
+	$(Q) $(RM) -f ./$(HEAD-BLOB) ./$(HEAD-BLOB-GEN)
+.PHONY: clean
+
+# Rebuild the ctags index from scratch over all C sources, headers and
+# .S files. The tool comes from $(TAGS) so that it can be overridden
+# like the other tools defined at the top of this file.
+tags:
+	$(E) " GEN" $@
+	$(Q) $(RM) -f tags
+	$(Q) $(FIND) . -name '*.[hcS]' -print | xargs $(TAGS) -a
+.PHONY: tags
+
+cscope:
+ $(E) " GEN" $@
+ $(Q) $(FIND) . -name '*.[hcS]' -print > cscope.files
+ $(Q) $(CSCOPE) -bkqu
+.PHONY: cscope
diff --git a/README b/README
new file mode 100644
index 000000000..ab7717722
--- /dev/null
+++ b/README
@@ -0,0 +1,15 @@
+crtools
+=======
+
+A utility to checkpoint/restore tasks.
+
+Some code is borrowed from
+
+ - Linux kernel (http://kernel.org/)
+ - git (http://git-scm.com/)
+ - kvm-tools (https://github.com/penberg/linux-kvm.git)
+ - ptrace-parasite (https://code.google.com/p/ptrace-parasite/)
+
+Many thanks to these projects.
+
+Licensed under GPLv2
diff --git a/cr-dump.c b/cr-dump.c
new file mode 100644
index 000000000..57036d105
--- /dev/null
+++ b/cr-dump.c
@@ -0,0 +1,977 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <dirent.h>
+
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+
+#include <sys/sendfile.h>
+
+#include "types.h"
+#include "list.h"
+
+#include "compiler.h"
+#include "crtools.h"
+#include "syscall.h"
+#include "util.h"
+
+#include "image.h"
+
+#include "parasite.h"
+#include "parasite-syscall.h"
+#include "parasite-blob.h"
+
+#ifndef CONFIG_X86_64
+# error No x86-32 support yet
+#endif
+
+static LIST_HEAD(vma_area_list);
+static LIST_HEAD(pstree_list);
+
+static char big_buffer[PATH_MAX];
+static struct parasite_ctl *parasite_ctl;
+
+static char loc_buf[PAGE_SIZE];
+
+/*
+ * Drop every entry queued on pstree_list, releasing the per-entry
+ * children array first, and leave the list head re-initialised (empty).
+ */
+static void free_pstree(void)
+{
+	struct pstree_item *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &pstree_list, list) {
+		xfree(cur->children);
+		xfree(cur);
+	}
+
+	INIT_LIST_HEAD(&pstree_list);
+}
+
+/*
+ * Release all collected VMA descriptors: close the backing file
+ * descriptor where one is recorded (value above zero), free the
+ * descriptor and reset vma_area_list to an empty list.
+ */
+static void free_mappings(void)
+{
+	struct vma_area *area, *tmp;
+
+	list_for_each_entry_safe(area, tmp, &vma_area_list, list) {
+		if (area->vm_file_fd > 0)
+			close(area->vm_file_fd);
+
+		free(area);
+	}
+
+	INIT_LIST_HEAD(&vma_area_list);
+}
+
+/*
+ * Parse the memory mappings of task @pid into vma_area_list and log
+ * the resulting list.
+ *
+ * Returns 0 on success or the non-zero result of parse_maps().
+ *
+ * The former "err_bogus_mapping" label was dropped: nothing jumped to
+ * it and it printed fields through a pointer that was never assigned.
+ */
+static int collect_mappings(pid_t pid)
+{
+	int ret;
+
+	pr_info("\n");
+	pr_info("Collecting mappings (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	ret = parse_maps(pid, &vma_area_list);
+	if (ret)
+		return ret;
+
+	pr_info_vma_list(&vma_area_list);
+
+	pr_info("----------------------------------------\n");
+
+	return 0;
+}
+
+/*
+ * Write one fdinfo record for a regular file to the CR_FD_FDINFO image.
+ *
+ * @type:     record type stored in the entry (FDINFO_FD or FDINFO_MAP
+ *            at the call sites in this file)
+ * @fd_name:  value stored in the entry's addr field (descriptor number
+ *            or mapping start address, depending on the caller)
+ * @lfd:      local descriptor referring to the file; its path is
+ *            resolved through /proc/self/fd/<lfd>
+ * @do_close: when true, @lfd is closed here on every path
+ * @pos:      file position to record
+ * @flags:    open flags to record
+ *
+ * The entry is followed in the image by the path itself (e.len bytes,
+ * without a terminating NUL). Returns 0 on success, -1 on failure.
+ */
+static int dump_one_reg_file(int type, unsigned long fd_name, int lfd,
+			     bool do_close, unsigned long pos, unsigned int flags,
+			     struct cr_fdset *cr_fdset)
+{
+	struct fdinfo_entry e;
+	char fd_str[128];
+	int len;
+	int ret = -1;
+
+	snprintf(fd_str, sizeof(fd_str), "/proc/self/fd/%d", lfd);
+	len = readlink(fd_str, big_buffer, sizeof(big_buffer) - 1);
+	if (len < 0) {
+		/* Report first: close() below may overwrite errno. */
+		pr_perror("Can't readlink %s\n", fd_str);
+		if (do_close)
+			close(lfd);
+		goto err;
+	}
+
+	/* readlink() does not terminate the string. */
+	big_buffer[len] = '\0';
+	pr_info("Dumping path for %lx fd via self %d [%s]\n",
+		fd_name, lfd, big_buffer);
+
+	if (do_close)
+		close(lfd);
+
+	e.type	= type;
+	e.len	= len;
+	e.flags	= flags;
+	e.pos	= pos;
+	e.addr	= fd_name;
+
+	/* pos is an unsigned long, hence %lx. */
+	pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
+		type, len, flags, pos, fd_name);
+
+	write_ptr_safe(cr_fdset->desc[CR_FD_FDINFO].fd, &e, err);
+	write_safe(cr_fdset->desc[CR_FD_FDINFO].fd, big_buffer, e.len, err);
+
+	ret = 0;
+err:
+	return ret;
+}
+
+/*
+ * Write pipe entry @e plus the data currently queued in the pipe to
+ * the CR_FD_PIPES image.
+ *
+ * The data is duplicated with tee(2) into a temporary pipe, so the
+ * original pipe keeps its contents, and then moved into the image
+ * with splice(2). @e->bytes is set to the number of bytes captured.
+ *
+ * Returns 0 on success, negative on failure. The temporary pipe is
+ * closed on every path once it has been created.
+ */
+static int dump_pipe_and_data(int lfd, struct pipe_entry *e,
+			      struct cr_fdset *cr_fdset)
+{
+	int fd_pipes;
+	int steal_pipe[2];
+	int pipe_size;
+	int has_bytes;
+	int ret = -1;
+
+	fd_pipes = cr_fdset->desc[CR_FD_PIPES].fd;
+
+	pr_info("Dumping data from pipe %x\n", e->pipeid);
+	if (pipe(steal_pipe) < 0) {
+		pr_perror("Can't create pipe for stealing data\n");
+		goto err;
+	}
+
+	pipe_size = fcntl(lfd, F_GETPIPE_SZ);
+	if (pipe_size < 0) {
+		pr_error("Can't obtain piped data size\n");
+		/* steal_pipe is already open here, release it. */
+		goto err_close;
+	}
+
+	has_bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK);
+	if (has_bytes < 0) {
+		if (errno != EAGAIN) {
+			pr_perror("Can't pick pipe data\n");
+			goto err_close;
+		}
+		/* EAGAIN: nothing queued in the pipe right now. */
+		has_bytes = 0;
+	}
+
+	e->bytes = has_bytes;
+	write_ptr_safe(fd_pipes, e, err_close);
+
+	if (has_bytes) {
+		ret = splice(steal_pipe[0], NULL, fd_pipes,
+			     NULL, has_bytes, 0);
+		if (ret < 0) {
+			pr_perror("Can't push pipe data\n");
+			goto err_close;
+		}
+	}
+
+	ret = 0;
+
+err_close:
+	close(steal_pipe[0]);
+	close(steal_pipe[1]);
+
+err:
+	return ret;
+}
+
+/*
+ * Dump one pipe end.
+ *
+ * @fd:    descriptor number as seen by the dumped task
+ * @lfd:   local descriptor referring to the same pipe
+ * @id:    pipe identifier stored in the entry
+ * @flags: open flags of the descriptor
+ *
+ * A write end (O_WRONLY) is recorded with zero bytes of data; for any
+ * other end the queued data is dumped as well through
+ * dump_pipe_and_data(). Returns 0 on success, non-zero on failure.
+ */
+static int dump_one_pipe(int fd, int lfd, unsigned int id, unsigned int flags,
+			 struct cr_fdset *cr_fdset)
+{
+	struct pipe_entry e;
+	int ret = -1;
+
+	pr_info("Dumping pipe %d/%x flags %x\n", fd, id, flags);
+
+	e.fd		= fd;
+	e.pipeid	= id;
+	e.flags		= flags;
+
+	if (!(flags & O_WRONLY)) {
+		ret = dump_pipe_and_data(lfd, &e, cr_fdset);
+	} else {
+		e.bytes = 0;
+		write_ptr_safe(cr_fdset->desc[CR_FD_PIPES].fd, &e, err);
+		ret = 0;
+	}
+
+err:
+	if (ret)
+		pr_error("Dumping pipe %d/%x flags %x\n", fd, id, flags);
+	else
+		pr_info("Dumped pipe: fd: %8lx pipeid: %8lx flags: %8lx bytes: %8lx\n",
+			e.fd, e.pipeid, e.flags, e.bytes);
+
+	return ret;
+}
+
+/*
+ * Dump a single descriptor of the target task.
+ *
+ * @pid_fd_dir: path of the task's fd directory (used in messages only)
+ * @dir:        descriptor of that directory, base for openat()
+ * @fd_name:    directory entry name, i.e. the descriptor number as text
+ * @pos/@flags: position and flags previously read from fdinfo
+ *
+ * Regular files and pipefs pipes are dumped; descriptors 0..3 of any
+ * other type are skipped. Returns 0 on success or skip, -1 on a system
+ * call failure and 1 for an unsupported file type.
+ *
+ * The descriptor opened here is closed on every path. For regular
+ * files dump_one_reg_file() is asked to close it (do_close = 1); all
+ * other paths go through the common exit below.
+ */
+static int dump_one_fd(char *pid_fd_dir, int dir, char *fd_name, unsigned long pos,
+		       unsigned int flags, struct cr_fdset *cr_fdset)
+{
+	struct statfs stfs_buf;
+	struct stat st_buf;
+	int ret = -1;
+	int fd;
+
+	fd = openat(dir, fd_name, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("Failed to openat %s/%d %s\n", pid_fd_dir, dir, fd_name);
+		return -1;
+	}
+
+	if (fstat(fd, &st_buf) < 0) {
+		pr_perror("Can't get stat on %s\n", fd_name);
+		goto out_close;
+	}
+
+	if (S_ISREG(st_buf.st_mode))
+		return dump_one_reg_file(FDINFO_FD, atol(fd_name),
+					 fd, 1, pos, flags, cr_fdset);
+
+	if (S_ISFIFO(st_buf.st_mode)) {
+		if (fstatfs(fd, &stfs_buf) < 0) {
+			pr_perror("Can't fstatfs on %s\n", fd_name);
+			goto out_close;
+		}
+
+		if (stfs_buf.f_type == PIPEFS_MAGIC) {
+			ret = dump_one_pipe(atol(fd_name), fd,
+					    st_buf.st_ino, flags, cr_fdset);
+			goto out_close;
+		}
+	}
+
+	ret = 0;
+	if (!strcmp(fd_name, "0")) {
+		pr_info("... Skipping stdin ...\n");
+	} else if (!strcmp(fd_name, "1")) {
+		pr_info("... Skipping stdout ...\n");
+	} else if (!strcmp(fd_name, "2")) {
+		pr_info("... Skipping stderr ...\n");
+	} else if (!strcmp(fd_name, "3")) {
+		pr_info("... Skipping tty ...\n");
+	} else {
+		pr_error("Can't dump file %s of that type [%x]\n", fd_name, st_buf.st_mode);
+		ret = 1;
+	}
+
+out_close:
+	close(fd);
+	return ret;
+}
+
+/*
+ * Read position and open flags of descriptor @fd (given as text) of
+ * task @pid from /proc/<pid>/fdinfo/<fd>.
+ *
+ * On success stores the values in *@pos and *@flags and returns 0.
+ * Returns -1 when the file cannot be opened, read or parsed; in that
+ * case the output values must not be used.
+ */
+static int read_fd_params(pid_t pid, char *fd, unsigned long *pos, unsigned int *flags)
+{
+	char fd_str[128];
+	ssize_t len;
+	int ifd;
+
+	snprintf(fd_str, sizeof(fd_str), "/proc/%d/fdinfo/%s", pid, fd);
+
+	ifd = open(fd_str, O_RDONLY);
+	if (ifd < 0) {
+		pr_perror("Can't open %s\n", fd_str);
+		return -1;
+	}
+
+	/* Leave room for the terminator sscanf() relies on. */
+	len = read(ifd, big_buffer, sizeof(big_buffer) - 1);
+	if (len < 0) {
+		pr_perror("Can't read %s\n", fd_str);
+		close(ifd);
+		return -1;
+	}
+	close(ifd);
+	big_buffer[len] = '\0';
+
+	/* Both fields must be present, otherwise the caller gets garbage. */
+	if (sscanf(big_buffer, "pos:\t%lu\nflags:\t%o\n", pos, flags) != 2) {
+		pr_error("Can't parse %s\n", fd_str);
+		return -1;
+	}
+
+	/* flags is an unsigned int, hence %x rather than %lx. */
+	pr_info("%s: pos: %16lx flags: %16x\n", fd_str, *pos, *flags);
+
+	return 0;
+}
+
+/*
+ * Dump all descriptors listed in /proc/<pid>/fd.
+ *
+ * Entries whose name starts with '.' are ignored. For every other
+ * entry the position/flags are read from fdinfo and the descriptor is
+ * handed to dump_one_fd(). Returns 0 on success, -1 as soon as any
+ * step fails. The directory stream is closed on every path.
+ */
+static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
+{
+	char pid_fd_dir[64];
+	struct dirent *de;
+	unsigned long pos;
+	unsigned int flags;
+	DIR *fd_dir;
+	int ret = -1;
+
+	pr_info("\n");
+	pr_info("Dumping opened files (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	snprintf(pid_fd_dir, sizeof(pid_fd_dir), "/proc/%d/fd", pid);
+	fd_dir = opendir(pid_fd_dir);
+	if (!fd_dir) {
+		pr_perror("Can't open %s\n", pid_fd_dir);
+		return -1;
+	}
+
+	while ((de = readdir(fd_dir))) {
+		if (de->d_name[0] == '.')
+			continue;
+		if (read_fd_params(pid, de->d_name, &pos, &flags))
+			goto err;
+		if (dump_one_fd(pid_fd_dir, dirfd(fd_dir), de->d_name, pos, flags, cr_fdset))
+			goto err;
+	}
+
+	pr_info("----------------------------------------\n");
+	ret = 0;
+
+err:
+	closedir(fd_dir);
+	return ret;
+}
+
+/*
+ * Dump descriptions of the task's regular mappings.
+ *
+ * Walks vma_area_list and, for every area marked VMA_AREA_REGULAR:
+ *  - anonymous shared areas get a shmem entry in the CR_FD_SHMEM image;
+ *  - file backed areas (private or shared) get an fdinfo record via
+ *    dump_one_reg_file(), opened O_RDWR only for writable shared
+ *    mappings and O_RDONLY otherwise.
+ * Areas whose flags are neither MAP_SHARED nor MAP_PRIVATE are treated
+ * as an error.
+ *
+ * Returns 0 on success, -1 on failure. The result stays at -1 until
+ * the whole list has been processed, so a failure that follows an
+ * earlier successful file dump is no longer reported as success.
+ */
+static int dump_task_mappings(pid_t pid, struct cr_fdset *cr_fdset)
+{
+	struct vma_area *vma_area;
+	int ret = -1;
+
+	pr_info("\n");
+	pr_info("Dumping mappings (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	list_for_each_entry(vma_area, &vma_area_list, list) {
+
+		struct vma_entry *vma = &vma_area->vma;
+
+		if (!(vma->status & VMA_AREA_REGULAR))
+			continue;
+
+		pr_info_vma(vma_area);
+
+		switch (vma->flags) {
+		case MAP_SHARED:
+		case MAP_PRIVATE:
+
+			if ((vma->status & VMA_ANON_SHARED)) {
+				struct shmem_entry e;
+
+				e.start	= vma->start;
+				e.end	= vma->end;
+				e.shmid	= vma_area->shmid;
+
+				pr_info("shmem: s: %16lx e: %16lx shmid: %16lx\n",
+					e.start, e.end, e.shmid);
+
+				write_ptr_safe(cr_fdset->desc[CR_FD_SHMEM].fd, &e, err);
+			} else if ((vma->status & VMA_FILE_PRIVATE) ||
+				   (vma->status & VMA_FILE_SHARED)) {
+
+				unsigned int flags;
+
+				if (vma->prot & PROT_WRITE && (vma->status & VMA_FILE_SHARED))
+					flags = O_RDWR;
+				else
+					flags = O_RDONLY;
+
+				/* do_close = 0: the descriptor stays owned by the vma area. */
+				if (dump_one_reg_file(FDINFO_MAP,
+						      vma->start,
+						      vma_area->vm_file_fd,
+						      0, 0, flags,
+						      cr_fdset))
+					goto err;
+			}
+			break;
+		default:
+			pr_panic("Unknown VMA (pid: %d)\n", pid);
+			goto err;
+		}
+	}
+
+	ret = 0;
+
+	pr_info("----------------------------------------\n");
+
+err:
+	return ret;
+}
+
+#define assign_reg(dst, src, e) dst.e = (__typeof__(dst.e))src.e
+#define assign_array(dst, src, e) memcpy(&dst.e, &src.e, sizeof(dst.e))
+
+/*
+ * Read the execution domain of task @pid from /proc/<pid>/personality
+ * into *@personality.
+ *
+ * The kernel prints the value in hexadecimal without a "0x" prefix,
+ * so it is parsed with base 16; atoi() would read it as decimal and
+ * stop at the first hex letter.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or read.
+ */
+static int get_task_personality(pid_t pid, u32 *personality)
+{
+	FILE *file = NULL;
+	int ret = -1;
+
+	snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/personality", pid);
+	file = fopen(loc_buf, "r");
+	if (!file) {
+		perror("Can't open task personality");
+		goto err;
+	}
+
+	if (!fgets(loc_buf, sizeof(loc_buf), file)) {
+		perror("Can't read task personality");
+		goto err;
+	}
+
+	*personality = (u32)strtoul(loc_buf, NULL, 16);
+	ret = 0;
+
+err:
+	if (file)
+		fclose(file);
+	return ret;
+}
+
+/*
+ * Read the TLS descriptors of task @pid from /proc/<pid>/tls into
+ * @tls_array.
+ *
+ * @size must equal GDT_ENTRY_TLS_ENTRIES, and the file must provide
+ * exactly that many lines, each holding two hexadecimal words that are
+ * stored in the .a and .b members of the corresponding entry.
+ *
+ * Returns 0 on success, -1 on any mismatch or I/O error.
+ */
+static int dump_task_tls(pid_t pid, struct desc_struct *tls_array, int size)
+{
+	FILE *file = NULL;
+	int ret = -1;
+
+	if (size != GDT_ENTRY_TLS_ENTRIES) {
+		pr_error("Wrong TLS storage size: %d\n", size);
+		goto err;
+	}
+
+	snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/tls", pid);
+	file = fopen(loc_buf, "r");
+	if (!file) {
+		perror("Can't open task tls");
+		goto err;
+	}
+
+	/* While reading, ret counts the entries parsed so far. */
+	ret = 0;
+	while (fgets(loc_buf, sizeof(loc_buf), file)) {
+		u32 a, b;
+		if (sscanf(loc_buf, "%x %x", &a, &b) != 2) {
+			/* The %s needs the offending line as its argument. */
+			pr_error("Can't parse tls entry: %s\n", loc_buf);
+			ret = -1;
+			goto err;
+		}
+		if (ret >= GDT_ENTRY_TLS_ENTRIES) {
+			pr_error("Too many entries in tls\n");
+			ret = -1;
+			goto err;
+		}
+		tls_array[ret].a = a;
+		tls_array[ret].b = b;
+
+		ret++;
+	}
+
+	if (ret != GDT_ENTRY_TLS_ENTRIES) {
+		pr_error("tls returened %i entries instead of %i\n",
+			 ret, GDT_ENTRY_TLS_ENTRIES);
+		ret = -1;
+		goto err;
+	}
+
+	ret = 0;
+
+err:
+	if (file)
+		fclose(file);
+	return ret;
+}
+/*
+ * Dump the core part of task @pid into the CR_FD_CORE image: general
+ * purpose and FPU registers (fetched with ptrace), TLS descriptors,
+ * personality and the image header. The whole core_entry is written
+ * with a single write_ptr_safe() once every field has been filled in.
+ *
+ * The only caller in this file, dump_one_task(), invokes this after
+ * seize_task() succeeded.
+ *
+ * Returns 0 on success and non-zero on failure; the heap copy of the
+ * entry is released on every path where it was allocated.
+ *
+ * NOTE(review): jerr() is defined elsewhere; it is assumed to jump to
+ * the given label when the wrapped call fails -- confirm.
+ */
+static int dump_task_core_seized(pid_t pid, struct cr_fdset *cr_fdset)
+{
+ struct core_entry *core = xzalloc(sizeof(*core));
+ user_fpregs_struct_t fpregs = {-1};
+ user_regs_struct_t regs = {-1};
+ int fd_core = cr_fdset->desc[CR_FD_CORE].fd;
+ int ret = -1;
+
+ pr_info("\n");
+ pr_info("Dumping core (pid: %d)\n", pid);
+ pr_info("----------------------------------------\n");
+
+ if (!core)
+ goto err;
+
+ lseek(fd_core, MAGIC_OFFSET, SEEK_SET); /* the entry is written at MAGIC_OFFSET */
+
+ jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_free);
+ jerr(ptrace(PTRACE_GETFPREGS, pid, NULL, &fpregs), err_free);
+
+ pr_info("Dumping GP/FPU registers ... ");
+
+ assign_reg(core->gpregs, regs, r15); /* field-by-field copy, cast to the image field type */
+ assign_reg(core->gpregs, regs, r14);
+ assign_reg(core->gpregs, regs, r13);
+ assign_reg(core->gpregs, regs, r12);
+ assign_reg(core->gpregs, regs, bp);
+ assign_reg(core->gpregs, regs, bx);
+ assign_reg(core->gpregs, regs, r11);
+ assign_reg(core->gpregs, regs, r10);
+ assign_reg(core->gpregs, regs, r9);
+ assign_reg(core->gpregs, regs, r8);
+ assign_reg(core->gpregs, regs, ax);
+ assign_reg(core->gpregs, regs, cx);
+ assign_reg(core->gpregs, regs, dx);
+ assign_reg(core->gpregs, regs, si);
+ assign_reg(core->gpregs, regs, di);
+ assign_reg(core->gpregs, regs, orig_ax);
+ assign_reg(core->gpregs, regs, ip);
+ assign_reg(core->gpregs, regs, cs);
+ assign_reg(core->gpregs, regs, flags);
+ assign_reg(core->gpregs, regs, sp);
+ assign_reg(core->gpregs, regs, ss);
+ assign_reg(core->gpregs, regs, fs_base);
+ assign_reg(core->gpregs, regs, gs_base);
+ assign_reg(core->gpregs, regs, ds);
+ assign_reg(core->gpregs, regs, es);
+ assign_reg(core->gpregs, regs, fs);
+ assign_reg(core->gpregs, regs, gs);
+
+ assign_reg(core->fpregs, fpregs, cwd);
+ assign_reg(core->fpregs, fpregs, swd);
+ assign_reg(core->fpregs, fpregs, twd);
+ assign_reg(core->fpregs, fpregs, fop);
+ assign_reg(core->fpregs, fpregs, rip);
+ assign_reg(core->fpregs, fpregs, rdp);
+ assign_reg(core->fpregs, fpregs, mxcsr);
+ assign_reg(core->fpregs, fpregs, mxcsr_mask);
+
+ assign_array(core->fpregs, fpregs, st_space); /* array members are copied with memcpy */
+ assign_array(core->fpregs, fpregs, xmm_space);
+ assign_array(core->fpregs, fpregs, padding);
+
+ pr_info("OK\n");
+
+ pr_info("Obtainting TLS ... ");
+ ret = dump_task_tls(pid, core->tls_array, ARRAY_SIZE(core->tls_array));
+ if (ret)
+ goto err_free;
+ pr_info("OK\n");
+
+ pr_info("Obtainting personality ... ");
+ ret = get_task_personality(pid, &core->personality);
+ if (ret)
+ goto err_free;
+ pr_info("OK\n");
+
+ pr_info("Dumping header ... ");
+ core->hdr.version = HEADER_VERSION;
+ core->hdr.arch = HEADER_ARCH_X86_64;
+ core->hdr.flags = 0;
+
+ write_ptr_safe(fd_core, core, err_free);
+
+ pr_info("OK\n");
+ ret = 0;
+
+err_free:
+ free(core);
+err:
+ pr_info("----------------------------------------\n");
+
+ return ret;
+}
+
+/*
+ * Build a process tree item for task @pid.
+ *
+ * Looks for the "Children:" line in /proc/<pid>/status and collects
+ * the pids listed on it. Returns a newly allocated item (with its
+ * children array) or NULL on failure; the caller owns the result.
+ *
+ * Fixes against the previous version:
+ *  - the children array is grown in bytes (count * element size); it
+ *    used to be sized by the element count alone.
+ *    NOTE(review): assumes xrealloc_safe() takes a byte size like
+ *    realloc() -- confirm against its definition.
+ *  - tokenising starts right after the 9-character "Children:" marker
+ *    and treats tabs as separators too, instead of skipping one more
+ *    character blindly, which could step past the string terminator.
+ */
+static struct pstree_item *find_children(pid_t pid)
+{
+	struct pstree_item *item = NULL;
+	u32 *children = NULL;
+	u32 nr_allocated = 0;
+	u32 nr_children = 0;
+	bool found = false;
+	FILE *file;
+	char *tok;
+
+	pr_debug("pid: %d\n", pid);
+
+	snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/status", pid);
+	file = fopen(loc_buf, "r");
+	if (!file) {
+		perror("Can't open task status");
+		goto err;
+	}
+
+	while (fgets(loc_buf, sizeof(loc_buf), file)) {
+		if (!strncmp(loc_buf, "Children:", 9)) {
+			found = true;
+			break;
+		}
+	}
+
+	fclose(file);
+	if (!found) {
+		pr_error("Children marker is not found\n");
+		goto err;
+	}
+
+	item = xzalloc(sizeof(*item));
+	if (!item)
+		goto err;
+
+	tok = strtok(&loc_buf[9], " \t\n");
+	while (tok) {
+		u32 child_pid = atoi(tok);
+
+		pr_debug("child_pid: %d\n", child_pid);
+
+		if (nr_allocated <= nr_children) {
+			/* Grow in steps of 64 entries. */
+			nr_allocated += 64;
+			if (xrealloc_safe((void **)&children,
+					  nr_allocated * sizeof(*children))) {
+				xfree(children);
+				xfree(item);
+				item = NULL;
+				goto err;
+			}
+		}
+
+		children[nr_children++] = child_pid;
+		tok = strtok(NULL, " \t\n");
+	}
+
+	item->pid		= pid;
+	item->nr_children	= nr_children;
+	item->children		= children;
+
+err:
+	return item;
+}
+
+/*
+ * Collect the process tree rooted at @pid onto pstree_list.
+ *
+ * The item for @pid is appended first, then each of its children is
+ * processed recursively, so the list ends up in depth-first order.
+ * Returns 0 on success; -1 if the item for @pid cannot be built, or
+ * the non-zero result of the first failing child.
+ */
+static int collect_pstree(pid_t pid)
+{
+	struct pstree_item *node;
+	unsigned long idx;
+
+	node = find_children(pid);
+	if (!node)
+		return -1;
+
+	list_add_tail(&node->list, &pstree_list);
+
+	for (idx = 0; idx < node->nr_children; idx++) {
+		int err = collect_pstree(node->children[idx]);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Write the collected process tree to the CR_FD_PSTREE image.
+ *
+ * For every item on pstree_list a pstree_entry (pid and number of
+ * children) is written, followed by the children pids one by one.
+ * Returns 0 on success, -1 if any write fails.
+ */
+static int dump_pstree(pid_t pid, struct cr_fdset *cr_fdset)
+{
+	struct pstree_item *node;
+	struct pstree_entry entry;
+	unsigned long idx;
+	int ret = -1;
+
+	pr_info("\n");
+	pr_info("Dumping pstree (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	list_for_each_entry(node, &pstree_list, list) {
+
+		pr_info("Process: %d (%d children)\n",
+			node->pid, node->nr_children);
+
+		entry.pid		= node->pid;
+		entry.nr_children	= node->nr_children;
+
+		write_ptr_safe(cr_fdset->desc[CR_FD_PSTREE].fd, &entry, err);
+
+		pr_info("Children:");
+		idx = 0;
+		while (idx < node->nr_children) {
+			pr_info(" %d", node->children[idx]);
+			write_ptr_safe(cr_fdset->desc[CR_FD_PSTREE].fd,
+				       &node->children[idx], err);
+			idx++;
+		}
+		pr_info("\n");
+	}
+	ret = 0;
+
+err:
+	pr_info("----------------------------------------\n");
+	return ret;
+}
+
+/*
+ * Return the first area on vma_area_list for which in_vma_area()
+ * accepts @addr, or NULL when no area matches.
+ */
+static struct vma_area *find_vma_by_addr(unsigned long addr)
+{
+	struct vma_area *hit = NULL;
+	struct vma_area *cur;
+
+	list_for_each_entry(cur, &vma_area_list, list) {
+		if (!in_vma_area(cur, addr))
+			continue;
+
+		hit = cur;
+		break;
+	}
+
+	return hit;
+}
+
+/*
+ * Kernel expects a special format in core file.
+ *
+ * Appends to the core image, right after the core_entry:
+ *   1. every collected vma entry, terminated by an all-zero entry;
+ *   2. the pages previously stored in the CR_FD_PAGES image, each as
+ *      <virtual address><PAGE_SIZE bytes>.
+ * Pages of private areas (anonymous or file backed) go to the core
+ * image, pages of anonymous shared areas go to the CR_FD_PAGES_SHMEM
+ * image, pages of any other area are skipped. A zero address in the
+ * pages image ends the stream and is propagated to both outputs as
+ * zero_page_entry.
+ *
+ * Returns 0 on success, non-zero on failure.
+ *
+ * The unreachable "err_strno" label of the previous version was
+ * removed, and 64-bit addresses are cast explicitly to match the %lx
+ * conversions they are printed with.
+ */
+static int finalize_core(pid_t pid, struct cr_fdset *cr_fdset)
+{
+	int fd_pages, fd_pages_shmem, fd_core;
+	unsigned long num, num_anon;
+	struct vma_area *vma_area;
+	struct vma_entry ve;
+	int ret = -1;
+	u64 va;
+
+	pr_info("\n");
+	pr_info("Finalizing core (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	fd_core		= cr_fdset->desc[CR_FD_CORE].fd;
+	fd_pages	= cr_fdset->desc[CR_FD_PAGES].fd;
+	fd_pages_shmem	= cr_fdset->desc[CR_FD_PAGES_SHMEM].fd;
+
+	pr_debug("dsc: fd_core %d fd_pages %d fd_pages_shmem %d\n",
+		 fd_core, fd_pages, fd_pages_shmem);
+
+	lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
+	lseek(fd_pages, MAGIC_OFFSET, SEEK_SET);
+	lseek(fd_pages_shmem, MAGIC_OFFSET, SEEK_SET);
+
+	num = 0;
+	pr_info("Appending VMAs ... ");
+
+	/* All VMAs first */
+
+	list_for_each_entry(vma_area, &vma_area_list, list) {
+		ret = write(fd_core, &vma_area->vma, sizeof(vma_area->vma));
+		if (ret != sizeof(vma_area->vma)) {
+			pr_perror("\nUnable to write vma entry (%li written)\n", num);
+			goto err;
+		}
+		num++;
+	}
+
+	/* Ending marker */
+	memset(&ve, 0, sizeof(ve));
+	write_ptr_safe(fd_core, &ve, err);
+
+	pr_info("OK (%li written)\n", num);
+
+	num = 0;
+	num_anon = 0;
+
+	pr_info("Appending pages ... ");
+	while (1) {
+		ret = read(fd_pages, &va, sizeof(va));
+		if (!ret)
+			break;
+		if (ret != sizeof(va)) {
+			pr_perror("\nUnable to read VA of page (%li written)\n", num);
+			goto err;
+		}
+
+		/* Ending marker */
+		if (va == 0) {
+			write_ptr_safe(fd_core, &zero_page_entry, err);
+			write_ptr_safe(fd_pages_shmem, &zero_page_entry, err);
+			break;
+		}
+
+		vma_area = find_vma_by_addr((unsigned long)va);
+		if (!vma_area) {
+			pr_panic("\nA page with address %lx is unknown\n",
+				 (unsigned long)va);
+			goto err;
+		}
+
+		/*
+		 * Just in case if someone broke parasite page
+		 * dumper code.
+		 */
+		if (!vma_area_has(vma_area, VMA_AREA_REGULAR)) {
+			pr_panic("\nA page with address %lx has a wrong status\n",
+				 (unsigned long)va);
+			goto err;
+		}
+
+		if (vma_area_has(vma_area, VMA_ANON_PRIVATE) ||
+		    vma_area_has(vma_area, VMA_FILE_PRIVATE)) {
+			/*
+			 * A failed or short write/sendfile makes the sum
+			 * differ from the expected size.
+			 */
+			ret  = write(fd_core, &va, sizeof(va));
+			ret += sendfile(fd_core, fd_pages, NULL, PAGE_SIZE);
+			if (ret != sizeof(va) + PAGE_SIZE) {
+				pr_perror("\nUnable to write VMA_FILE_PRIVATE|VMA_ANON_PRIVATE "
+					  "page (%li, %li written)\n",
+					  num, num_anon);
+				goto err;
+			}
+			num++;
+		} else if (vma_area_has(vma_area, VMA_ANON_SHARED)) {
+			ret  = write(fd_pages_shmem, &va, sizeof(va));
+			ret += sendfile(fd_pages_shmem, fd_pages, NULL, PAGE_SIZE);
+			if (ret != sizeof(va) + PAGE_SIZE) {
+				pr_perror("\nUnable to write VMA_ANON_SHARED "
+					  "page (%li, %li written)\n",
+					  num, num_anon);
+				goto err;
+			}
+			num_anon++;
+		} else {
+			/* skip the page */
+			lseek(fd_pages, PAGE_SIZE, SEEK_CUR);
+		}
+	}
+	ret = 0;
+
+	pr_info("OK (%li written)\n", num + num_anon);
+
+err:
+	pr_info("----------------------------------------\n");
+	return ret;
+}
+
+/*
+ * Dump a single task into the images of @cr_fdset.
+ *
+ * Steps, in order: collect mappings, seize the task, dump its core,
+ * infect it with the parasite, dump pages through the parasite, cure
+ * and unseize it, then dump files and mappings and finalize the core
+ * image. The first failing step aborts the sequence.
+ *
+ * Returns 0 on success, non-zero on failure. The collected mappings
+ * are released on every path.
+ *
+ * A failed parasite infection now yields -1. Previously ret still
+ * held the 0 of the preceding step, so the failure was reported as
+ * success.
+ */
+static int dump_one_task(pid_t pid, struct cr_fdset *cr_fdset)
+{
+	int ret = 0;
+
+	pr_info("========================================\n");
+	pr_info("Dumping task (pid: %d)\n", pid);
+	pr_info("========================================\n");
+
+	ret = collect_mappings(pid);
+	if (ret) {
+		pr_error("Collect mappings (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+	ret = seize_task(pid);
+	if (ret) {
+		pr_error("Failed to seize task (pid: %d) with %d\n",
+			 pid, ret);
+		goto err;
+	}
+
+	ret = dump_task_core_seized(pid, cr_fdset);
+	if (ret) {
+		pr_error("Dump core (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+	parasite_ctl = parasite_infect_seized(pid, NULL, &vma_area_list);
+	if (!parasite_ctl) {
+		pr_error("Can't infect (pid: %d) with parasite\n", pid);
+		ret = -1;
+		goto err;
+	}
+
+	ret = parasite_dump_pages_seized(parasite_ctl, &vma_area_list,
+					 cr_fdset, CR_FD_PAGES);
+	if (ret) {
+		pr_error("Can't dump pages (pid: %d) with parasite\n", pid);
+		goto err;
+	}
+
+	ret = parasite_cure_seized(&parasite_ctl, &vma_area_list);
+	if (ret) {
+		pr_error("Can't cure (pid: %d) from parasite\n", pid);
+		goto err;
+	}
+
+	ret = unseize_task(pid);
+	if (ret) {
+		pr_error("Can't unsieze (pid: %d) task\n", pid);
+		goto err;
+	}
+
+	ret = dump_task_files(pid, cr_fdset);
+	if (ret) {
+		pr_error("Dump files (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+	ret = dump_task_mappings(pid, cr_fdset);
+	if (ret) {
+		pr_error("Dump mappings (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+	ret = finalize_core(pid, cr_fdset);
+	if (ret) {
+		pr_error("Finalizing core (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+err:
+	free_mappings();
+	return ret;
+}
+
+/*
+ * Entry point of the dump: checkpoint task @pid and, unless
+ * @leader_only is set, all tasks collected below it.
+ *
+ * The process tree is collected and the tasks are stopped first. The
+ * tree is then written to its own image set (opened for @pid), after
+ * which every task is dumped into a per-task image set. With
+ * @leader_only only the first list entry is stopped, dumped and later
+ * resumed. Unless @leave_stopped is non-zero the tasks are continued
+ * on the way out, on success and on failure alike.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int cr_dump_tasks(pid_t pid, bool leader_only, int leave_stopped)
+{
+	struct cr_fdset *cr_fdset = NULL;
+	struct pstree_item *task;
+	int ret = -1;
+
+	if (!leader_only) {
+		pr_info("========================================\n");
+		pr_info("Dumping process group (pid: %d)\n", pid);
+		pr_info("========================================\n");
+	}
+
+	if (collect_pstree(pid))
+		goto err;
+
+	list_for_each_entry(task, &pstree_list, list) {
+		stop_task(task->pid);
+		if (leader_only)
+			break;
+	}
+
+	/* Dump the process tree first */
+	cr_fdset = alloc_cr_fdset(pid);
+	if (!cr_fdset)
+		goto err;
+
+	if (prep_cr_fdset_for_dump(cr_fdset, CR_FD_DESC_USE(CR_FD_PSTREE)))
+		goto err;
+
+	if (dump_pstree(pid, cr_fdset))
+		goto err;
+
+	close_cr_fdset(cr_fdset);
+	free_cr_fdset(&cr_fdset);
+
+	/* Now all other data */
+	list_for_each_entry(task, &pstree_list, list) {
+
+		cr_fdset = alloc_cr_fdset(task->pid);
+		if (!cr_fdset)
+			goto err;
+
+		if (prep_cr_fdset_for_dump(cr_fdset, CR_FD_DESC_NOPSTREE))
+			goto err;
+
+		if (dump_one_task(task->pid, cr_fdset))
+			goto err;
+
+		close_cr_fdset(cr_fdset);
+		free_cr_fdset(&cr_fdset);
+
+		if (leader_only)
+			break;
+	}
+
+	ret = 0;
+
+err:
+	if (!leave_stopped) {
+		list_for_each_entry(task, &pstree_list, list) {
+			continue_task(task->pid);
+			if (leader_only)
+				break;
+		}
+	}
+
+	free_pstree();
+	close_cr_fdset(cr_fdset);
+	free_cr_fdset(&cr_fdset);
+
+	return ret;
+}
diff --git a/cr-restore.c b/cr-restore.c
new file mode 100644
index 000000000..99f1ed1b7
--- /dev/null
+++ b/cr-restore.c
@@ -0,0 +1,1144 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <dirent.h>
+#include <string.h>
+
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+
+#include <sched.h>
+
+#include <sys/sendfile.h>
+
+#include "compiler.h"
+#include "types.h"
+
+#include "image.h"
+#include "util.h"
+
+#include "crtools.h"
+
+/*
+ * A file opened for a file-backed mapping.  open_fmap() fills @start with
+ * the address taken from the fdinfo entry and @fd with the descriptor it
+ * opened; entries are chained through @next on the fmap_fds list.
+ */
+struct fmap_fd {
+ struct fmap_fd *next;
+ unsigned long start;
+ int fd;
+};
+
+/*
+ * One entry of the shmems table filled by try_to_add_shmem().
+ *
+ * @start, @end and @id are copied from a shmem image entry.  @pid is the
+ * lowest pid seen so far among the tasks whose image lists the region.
+ * @real_pid is 0 until shmem_update_real_pid() stores a pid there.
+ */
+struct shmem_info {
+ unsigned long start;
+ unsigned long end;
+ unsigned long id;
+ int pid;
+ int real_pid;
+};
+
+/*
+ * One entry of the pipes table filled by try_to_add_pipe().
+ *
+ * @id is the pipe id from the image and @pid the pid recorded for it.
+ * @real_pid, @read_fd and @write_fd are zero until create_pipe() stores
+ * getpid() and both ends of the freshly created pipe.  @users is waited
+ * on by create_pipe() and decremented by attach_pipe().
+ */
+struct pipe_info {
+ unsigned int id;
+ int pid;
+ int real_pid;
+ int read_fd;
+ int write_fd;
+ int users;
+};
+
+
+/* List of files opened by open_fmap(), consumed by pop_fmap_fd() */
+static struct fmap_fd *fmap_fds;
+
+/* Both tables below are mapped MAP_SHARED | MAP_ANON (one 4096 byte page each) by prepare_shared() */
+static struct shmem_info *shmems;
+static int nr_shmems;
+
+static struct pipe_info *pipes;
+static int nr_pipes;
+
+/* Forward declaration: do_child() needs it before the definition */
+static int restore_task_with_children(int my_pid, char *pstree_path);
+
+/* Log start address, id and pid of every entry in the shmems table. */
+static void show_saved_shmems(void)
+{
+	int idx = 0;
+
+	pr_info("\tSaved shmems:\n");
+
+	while (idx < nr_shmems) {
+		const struct shmem_info *si = &shmems[idx++];
+
+		pr_info("\t\tstart: %016lx id: %lx pid: %d\n",
+			si->start, si->id, si->pid);
+	}
+}
+
+/* Log id and pid of every entry in the pipes table. */
+static void show_saved_pipes(void)
+{
+	int idx = 0;
+
+	pr_info("\tSaved pipes:\n");
+
+	while (idx < nr_pipes) {
+		const struct pipe_info *pi = &pipes[idx++];
+
+		pr_info("\t\tid: %x -> pid: %d\n", pi->id, pi->pid);
+	}
+}
+
+/*
+ * Find the shmems entry with the given @id whose [start, end] range
+ * (both bounds inclusive) contains @addr.  Returns NULL if there is none.
+ */
+static struct shmem_info *search_shmem(unsigned long addr, unsigned long id)
+{
+	int idx;
+
+	for (idx = 0; idx < nr_shmems; idx++) {
+		struct shmem_info *cur = &shmems[idx];
+
+		if (addr < cur->start || addr > cur->end)
+			continue;
+		if (cur->id != id)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* Find the pipes entry whose id equals @pipeid; NULL if there is none. */
+static struct pipe_info *search_pipe(unsigned int pipeid)
+{
+	int idx = 0;
+
+	while (idx < nr_pipes) {
+		struct pipe_info *cur = &pipes[idx];
+
+		if (cur->id == pipeid)
+			return cur;
+
+		idx++;
+	}
+
+	return NULL;
+}
+
+/* Store @rpid as real_pid in every shmems entry whose pid is @vpid. */
+static void shmem_update_real_pid(int vpid, int rpid)
+{
+	int idx;
+
+	for (idx = 0; idx < nr_shmems; idx++) {
+		struct shmem_info *cur = &shmems[idx];
+
+		if (cur->pid != vpid)
+			continue;
+
+		cur->real_pid = rpid;
+	}
+}
+
+/*
+ * Wait until the shmem region @si has a real_pid, then open it through
+ * /proc/<real_pid>/map_files/<start>-<end>.
+ *
+ * The open is retried with a growing delay for as long as it fails with
+ * ENOENT.  Returns the opened descriptor, or -1 on any other error.
+ */
+static int shmem_wait_and_open(struct shmem_info *si)
+{
+	/* FIXME - not good */
+	char path[128];
+	unsigned long time = 1000;
+
+	sleep(1);
+
+	/* real_pid lives in memory shared with the other restoring tasks */
+	while (si->real_pid == 0)
+		usleep(time);
+
+	snprintf(path, sizeof(path), "/proc/%d/map_files/%lx-%lx",
+		 si->real_pid, si->start, si->end);
+
+	while (1) {
+		int ret = open(path, O_RDWR);
+
+		/* 0 is a valid descriptor too, do not treat it as a failure */
+		if (ret >= 0)
+			return ret;
+
+		if (errno != ENOENT) {
+			perror(" Can't stat shmem");
+			return -1;
+		}
+
+		pr_info("Waiting for [%s] to appear\n", path);
+		if (time < 20000000)
+			time <<= 1;
+		usleep(time);
+	}
+}
+
+/*
+ * Record the shmem region described by @e for task @pid.
+ *
+ * A region already in the table (same start and id) must have the same
+ * end; its pid is lowered to @pid when @pid is smaller.  Otherwise a new
+ * entry is appended, as long as the table still fits into 4096 bytes.
+ *
+ * Returns 0 on success and 1 on a mismatching end or a full table.
+ */
+static int try_to_add_shmem(int pid, struct shmem_entry *e)
+{
+	struct shmem_info *slot;
+	int idx;
+
+	for (idx = 0; idx < nr_shmems; idx++) {
+		slot = &shmems[idx];
+
+		if (slot->start == e->start && slot->id == e->shmid) {
+			if (slot->end != e->end) {
+				pr_info("Bogus shmem\n");
+				return 1;
+			}
+
+			if (pid < slot->pid)
+				slot->pid = pid;
+
+			return 0;
+		}
+	}
+
+	if ((nr_shmems + 1) * sizeof(struct shmem_info) >= 4096) {
+		pr_panic("OOM storing shmems\n");
+		return 1;
+	}
+
+	slot = &shmems[nr_shmems];
+	slot->start = e->start;
+	slot->end = e->end;
+	slot->id = e->shmid;
+	slot->pid = pid;
+	slot->real_pid = 0;
+
+	nr_shmems++;
+
+	return 0;
+}
+
+/*
+ * Record the pipe described by @e for task @pid in the pipes table.
+ *
+ * A pipe already in the table keeps its entry; a new pipe is appended
+ * with users set to 1, as long as the table still fits into 4096 bytes.
+ * @p_fd is not used.
+ *
+ * Returns 0 on success and 1 when the table is full.
+ */
+static int try_to_add_pipe(int pid, struct pipe_entry *e, int p_fd)
+{
+ int i;
+
+ for (i = 0; i < nr_pipes; i++) {
+ if (pipes[i].id != e->pipeid)
+ continue;
+
+ /*
+ * NOTE(review): users is bumped only when @pid is lower than the
+ * recorded one, while create_pipe() waits for users to drop to 1
+ * and attach_pipe() decrements it -- confirm that ends seen with a
+ * higher or equal pid are meant to stay uncounted.
+ */
+ if (pipes[i].pid > pid) {
+ pipes[i].pid = pid;
+ pipes[i].users++;
+ }
+
+ return 0;
+ }
+
+ if ((nr_pipes + 1) * sizeof(struct pipe_info) >= 4096) {
+ pr_info("OOM storing pipes\n");
+ return 1;
+ }
+
+ /* Zeroes real_pid, read_fd and write_fd; create_pipe() fills them later */
+ memset(&pipes[nr_pipes], 0, sizeof(pipes[nr_pipes]));
+
+ pipes[nr_pipes].id = e->pipeid;
+ pipes[nr_pipes].pid = pid;
+ pipes[nr_pipes].users = 1;
+
+ nr_pipes++;
+
+ return 0;
+}
+
+/*
+ * Read shmem-<pid>.img and feed every entry to try_to_add_shmem().
+ *
+ * Returns 0 on success and 1 if the image cannot be opened, has a bad
+ * magic, is truncated, or an entry is rejected.  The image is closed on
+ * every path.
+ */
+static int prepare_shmem_pid(int pid)
+{
+	char path[64];
+	int sh_fd, err = 1;
+	u32 type = 0;
+
+	snprintf(path, sizeof(path), "shmem-%d.img", pid);
+	sh_fd = open(path, O_RDONLY);
+	if (sh_fd < 0) {
+		perror("Can't open shmem info");
+		return 1;
+	}
+
+	/* A short read leaves errno meaningless, so do not use perror() */
+	if (read(sh_fd, &type, sizeof(type)) != (ssize_t)sizeof(type) ||
+	    type != SHMEM_MAGIC) {
+		fprintf(stderr, "Bad shmem magic in %s\n", path);
+		goto out;
+	}
+
+	while (1) {
+		struct shmem_entry e;
+		ssize_t nr;
+
+		nr = read(sh_fd, &e, sizeof(e));
+		if (nr == 0) {
+			err = 0;
+			break;
+		}
+
+		if (nr != (ssize_t)sizeof(e)) {
+			if (nr < 0)
+				perror("Can't read shmem entry");
+			else
+				fprintf(stderr, "Truncated shmem entry in %s\n", path);
+			break;
+		}
+
+		if (try_to_add_shmem(pid, &e))
+			break;
+	}
+
+out:
+	close(sh_fd);
+	return err;
+}
+
+/*
+ * Read pipes-<pid>.img and feed every entry to try_to_add_pipe(),
+ * skipping the e.bytes of pipe data stored after each entry.
+ *
+ * Returns 0 on success and 1 if the image cannot be opened, has a bad
+ * magic, is truncated, or an entry is rejected.  The image is closed on
+ * every path.
+ */
+static int prepare_pipes_pid(int pid)
+{
+	char path[64];
+	int p_fd, err = 1;
+	u32 type = 0;
+
+	snprintf(path, sizeof(path), "pipes-%d.img", pid);
+	p_fd = open(path, O_RDONLY);
+	if (p_fd < 0) {
+		perror("Can't open pipes image");
+		return 1;
+	}
+
+	/* A short read leaves errno meaningless, so do not use perror() */
+	if (read(p_fd, &type, sizeof(type)) != (ssize_t)sizeof(type) ||
+	    type != PIPES_MAGIC) {
+		fprintf(stderr, "Bad pipes magic in %s\n", path);
+		goto out;
+	}
+
+	while (1) {
+		struct pipe_entry e;
+		ssize_t nr;
+
+		nr = read(p_fd, &e, sizeof(e));
+		if (nr == 0) {
+			err = 0;
+			break;
+		}
+		if (nr != (ssize_t)sizeof(e)) {
+			fprintf(stderr, "Read pipes for %s failed %zd of %zu read\n",
+				path, nr, sizeof(e));
+			if (nr < 0)
+				perror("Can't read pipes entry");
+			break;
+		}
+
+		if (try_to_add_pipe(pid, &e, p_fd))
+			break;
+
+		/* Pipe data follows the entry, step over it */
+		if (e.bytes && lseek(p_fd, e.bytes, SEEK_CUR) < 0) {
+			perror("Can't skip pipe data");
+			break;
+		}
+	}
+
+out:
+	close(p_fd);
+	return err;
+}
+
+/*
+ * Build the shmems and pipes tables for the whole process tree.
+ *
+ * Both tables are one-page MAP_SHARED | MAP_ANON mappings.  @ps_fd is
+ * the pstree image positioned right after its magic; every entry is
+ * read from it and the shmem and pipes images of that pid are scanned.
+ * On success @ps_fd is rewound to just after the magic.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int prepare_shared(int ps_fd)
+{
+	pr_info("Preparing info about shared resources\n");
+
+	/* Anonymous mappings take fd -1; some systems insist on it */
+	nr_shmems = 0;
+	shmems = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	if (shmems == MAP_FAILED) {
+		perror("Can't map shmems");
+		return 1;
+	}
+
+	nr_pipes = 0;
+	pipes = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	if (pipes == MAP_FAILED) {
+		perror("Can't map pipes");
+		return 1;
+	}
+
+	while (1) {
+		struct pstree_entry e;
+		ssize_t nr;
+
+		nr = read(ps_fd, &e, sizeof(e));
+		if (nr == 0)
+			break;
+
+		if (nr != (ssize_t)sizeof(e)) {
+			if (nr < 0)
+				perror("Can't read ps");
+			else
+				fprintf(stderr, "Truncated pstree entry\n");
+			return 1;
+		}
+
+		if (prepare_shmem_pid(e.pid))
+			return 1;
+
+		if (prepare_pipes_pid(e.pid))
+			return 1;
+
+		/* The children pids follow the entry, step over them */
+		if (lseek(ps_fd, e.nr_children * sizeof(u32), SEEK_CUR) < 0) {
+			perror("Can't skip children pids");
+			return 1;
+		}
+	}
+
+	/* Leave the image positioned at the first entry for the caller */
+	if (lseek(ps_fd, sizeof(u32), SEEK_SET) < 0) {
+		perror("Can't rewind pstree image");
+		return 1;
+	}
+
+	show_saved_shmems();
+	show_saved_pipes();
+
+	return 0;
+}
+
+/*
+ * Unlink and return the fmap_fds entry whose start equals @start, or
+ * NULL if there is none.  The caller owns the returned entry.
+ */
+static struct fmap_fd *pop_fmap_fd(unsigned long start)
+{
+	struct fmap_fd **link = &fmap_fds;
+
+	pr_info("Looking for %lx : ", start);
+
+	while (*link != NULL) {
+		struct fmap_fd *hit = *link;
+
+		if (hit->start == start) {
+			*link = hit->next;
+			pr_info("found\n");
+			return hit;
+		}
+
+		link = &hit->next;
+	}
+
+	pr_info("not found\n");
+	return NULL;
+}
+
+/*
+ * Read the fe->len byte path that follows @fe in the image @fd, open it
+ * with fe->flags and seek it to fe->pos.
+ *
+ * Returns the new descriptor, or -1 if the path is too long, cannot be
+ * read, or cannot be opened.
+ */
+static int open_fe_fd(struct fdinfo_entry *fe, int fd)
+{
+	char path[PATH_MAX];
+	int tmp;
+
+	/* The length comes from the image; keep room for the terminator */
+	if (fe->len >= sizeof(path)) {
+		fprintf(stderr, "Path in image is too long\n");
+		return -1;
+	}
+
+	if (read(fd, path, fe->len) != fe->len) {
+		fprintf(stderr, "Error reading path\n");
+		return -1;
+	}
+
+	path[fe->len] = '\0';
+
+	tmp = open(path, fe->flags);
+	if (tmp < 0) {
+		pr_perror("Can't open file %s", path);
+		return -1;
+	}
+
+	/* Result ignored on purpose: not every file is seekable */
+	lseek(tmp, fe->pos, SEEK_SET);
+
+	return tmp;
+}
+
+/*
+ * Open the file described by @fe and install it as descriptor fe->addr.
+ *
+ * @cfd points at the descriptor of the fdinfo image being read.  If the
+ * image itself sits on the wanted number it is dup()ed first and *@cfd
+ * is updated to the copy.  @pid is not used.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int open_fd(int pid, struct fdinfo_entry *fe, int *cfd)
+{
+ int fd, tmp;
+
+ /* The image occupies the target number: keep reading from a copy */
+ if (*cfd == (int)fe->addr) {
+ tmp = dup(*cfd);
+ if (tmp < 0) {
+ perror("Can't dup file");
+ return 1;
+ }
+
+ pr_info("%s: Dup for %d\n", __func__, tmp);
+
+ *cfd = tmp;
+ }
+
+ tmp = open_fe_fd(fe, *cfd);
+ if (tmp < 0)
+ return 1;
+
+ /* presumably moves tmp onto the number fe->addr -- confirm in reopen_fd_as() */
+ fd = reopen_fd_as((int)fe->addr, tmp);
+ if (fd < 0) {
+ perror("Can't dup");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Open the file described by @fe and remember it on the fmap_fds list,
+ * keyed by the address fe->addr, for try_fixup_file_map() to pick up.
+ *
+ * Returns 0 on success and 1 if the file cannot be opened or there is
+ * no memory for the list entry.
+ */
+static int open_fmap(int pid, struct fdinfo_entry *fe, int fd)
+{
+	int tmp;
+	struct fmap_fd *new;
+
+	tmp = open_fe_fd(fe, fd);
+	if (tmp < 0)
+		return 1;
+
+	pr_info("%d:\t\tWill map %lx to %d\n", pid, (unsigned long)fe->addr, tmp);
+
+	new = malloc(sizeof(*new));
+	if (new == NULL) {
+		perror("Can't allocate fmap entry");
+		close(tmp);
+		return 1;
+	}
+
+	new->start = fe->addr;
+	new->fd = tmp;
+	new->next = fmap_fds;
+	fmap_fds = new;
+
+	return 0;
+}
+
+/*
+ * Restore the files of task @pid from fdinfo-<pid>.img.
+ *
+ * Entries of type FDINFO_FD are opened onto their descriptor number by
+ * open_fd(); FDINFO_MAP entries are opened by open_fmap().
+ *
+ * Returns 0 once the whole image is processed and 1 on any error.  The
+ * image is closed on every path.
+ */
+static int prepare_fds(int pid)
+{
+	u32 mag = 0;
+	char path[64];
+	int fdinfo_fd, err = 1;
+
+	pr_info("%d: Opening files\n", pid);
+
+	snprintf(path, sizeof(path), "fdinfo-%d.img", pid);
+	fdinfo_fd = open(path, O_RDONLY);
+	if (fdinfo_fd < 0) {
+		perror("Can't open fdinfo");
+		return 1;
+	}
+
+	if (read(fdinfo_fd, &mag, sizeof(mag)) != (ssize_t)sizeof(mag) ||
+	    mag != FDINFO_MAGIC) {
+		fprintf(stderr, "Bad file\n");
+		goto out;
+	}
+
+	while (1) {
+		int ret;
+		struct fdinfo_entry fe;
+
+		ret = read(fdinfo_fd, &fe, sizeof(fe));
+		if (ret == 0) {
+			err = 0;
+			goto out;
+		}
+
+		if (ret < 0) {
+			perror("Can't read file");
+			goto out;
+		}
+		if (ret != sizeof(fe)) {
+			fprintf(stderr, "Error reading\n");
+			goto out;
+		}
+
+		pr_info("\t%d: Got fd for %lx type %d namelen %d\n", pid,
+			(unsigned long)fe.addr, fe.type, fe.len);
+		switch (fe.type) {
+		case FDINFO_FD:
+			/* open_fd() may move the image to another descriptor */
+			if (open_fd(pid, &fe, &fdinfo_fd))
+				goto out;
+
+			break;
+		case FDINFO_MAP:
+			if (open_fmap(pid, &fe, fdinfo_fd))
+				goto out;
+
+			break;
+		default:
+			fprintf(stderr, "Some bullshit in a file\n");
+			goto out;
+		}
+	}
+
+out:
+	close(fdinfo_fd);
+	return err;
+}
+
+/*
+ * A shmem region of the task being restored: the address range
+ * [addr, end] and its id, as copied from a shmem image entry by
+ * save_shmem_id().
+ */
+struct shmem_to_id {
+ unsigned long addr;
+ unsigned long end;
+ unsigned long id;
+ struct shmem_to_id *next;
+};
+
+/* List of the regions above, searched by find_shmem_id() */
+static struct shmem_to_id *my_shmem_ids;
+
+/*
+ * Return the id of the region on my_shmem_ids whose [addr, end] range
+ * (both bounds inclusive) contains @addr, or 0 if no region does.
+ */
+static unsigned long find_shmem_id(unsigned long addr)
+{
+	struct shmem_to_id *cur = my_shmem_ids;
+
+	while (cur != NULL) {
+		if (addr >= cur->addr && addr <= cur->end)
+			return cur->id;
+
+		cur = cur->next;
+	}
+
+	return 0;
+}
+
+/*
+ * Put the region described by @e at the head of my_shmem_ids.
+ *
+ * Returns 0 on success and -1 when there is no memory for the entry.
+ */
+static int save_shmem_id(struct shmem_entry *e)
+{
+	struct shmem_to_id *si;
+
+	si = malloc(sizeof(*si));
+	if (si == NULL) {
+		perror("Can't allocate shmem id");
+		return -1;
+	}
+
+	si->addr = e->start;
+	si->end = e->end;
+	si->id = e->shmid;
+	si->next = my_shmem_ids;
+
+	my_shmem_ids = si;
+	return 0;
+}
+
+/*
+ * Read shmem-<pid>.img and remember every region in it on my_shmem_ids.
+ *
+ * Returns 0 on success and 1 if the image cannot be opened, has a bad
+ * magic, is truncated, or memory runs out.  The image is closed on
+ * every path.
+ */
+static int prepare_shmem(int pid)
+{
+	char path[64];
+	int sh_fd, err = 1;
+	u32 type = 0;
+
+	snprintf(path, sizeof(path), "shmem-%d.img", pid);
+	sh_fd = open(path, O_RDONLY);
+	if (sh_fd < 0) {
+		perror("Can't open shmem info");
+		return 1;
+	}
+
+	/* A short read leaves errno meaningless, so do not use perror() */
+	if (read(sh_fd, &type, sizeof(type)) != (ssize_t)sizeof(type) ||
+	    type != SHMEM_MAGIC) {
+		fprintf(stderr, "Bad shmem magic in %s\n", path);
+		goto out;
+	}
+
+	while (1) {
+		struct shmem_entry e;
+		ssize_t nr;
+
+		nr = read(sh_fd, &e, sizeof(e));
+		if (nr == 0) {
+			err = 0;
+			break;
+		}
+		if (nr != (ssize_t)sizeof(e)) {
+			if (nr < 0)
+				perror("Can't read shmem entry");
+			else
+				fprintf(stderr, "Truncated shmem entry in %s\n", path);
+			break;
+		}
+
+		if (save_shmem_id(&e))
+			break;
+	}
+
+out:
+	close(sh_fd);
+	return err;
+}
+
+/*
+ * If a file was opened for the mapping starting at vi->start, store its
+ * descriptor in vi->fd and rewrite the entry in the image @fd.
+ *
+ * @fd must be positioned right after the entry @vi was read from; it is
+ * left at the same position.  Returns 0 on success (including when no
+ * file is recorded for the mapping) and 1 on an I/O error.
+ */
+static int try_fixup_file_map(int pid, struct vma_entry *vi, int fd)
+{
+	struct fmap_fd *fmfd;
+
+	fmfd = pop_fmap_fd(vi->start);
+	if (fmfd == NULL)
+		return 0;
+
+	pr_info("%d: Fixing %lx vma to %d fd\n", pid, vi->start, fmfd->fd);
+	vi->fd = fmfd->fd;
+	/* The list entry is ours after the pop; release it on every path */
+	free(fmfd);
+
+	/* Negate as off_t: negating the unsigned sizeof would wrap around */
+	if (lseek(fd, -(off_t)sizeof(*vi), SEEK_CUR) < 0) {
+		perror("Can't seek img");
+		return 1;
+	}
+
+	if (write(fd, vi, sizeof(*vi)) != (ssize_t)sizeof(*vi)) {
+		perror("Can't write img");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Handle an anonymous shared mapping starting at vi->start.
+ *
+ * Nothing is done when the address belongs to no known shmem region, or
+ * when the region's recorded pid is @pid.  Otherwise the region is
+ * opened with shmem_wait_and_open(), the descriptor is stored in vi->fd
+ * and the entry is rewritten in the image @fd, which must be positioned
+ * right after the entry @vi was read from.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int try_fixup_shared_map(int pid, struct vma_entry *vi, int fd)
+{
+	struct shmem_info *si;
+	unsigned long id;
+	int sh_fd;
+
+	id = find_shmem_id(vi->start);
+	if (id == 0)
+		return 0;
+
+	si = search_shmem(vi->start, id);
+	pr_info("%d: Search for %016lx shmem %p/%d\n", pid, vi->start, si, si ? si->pid : -1);
+
+	if (si == NULL) {
+		fprintf(stderr, "Can't find my shmem %016lx\n", vi->start);
+		return 1;
+	}
+
+	if (si->pid == pid)
+		return 0;
+
+	sh_fd = shmem_wait_and_open(si);
+	pr_info("%d: Fixing %lx vma to %lx/%d shmem -> %d\n", pid, vi->start, si->id, si->pid, sh_fd);
+	/* Check the descriptor just opened, not the image one */
+	if (sh_fd < 0) {
+		perror("Can't open shmem");
+		return 1;
+	}
+
+	/* Negate as off_t: negating the unsigned sizeof would wrap around */
+	if (lseek(fd, -(off_t)sizeof(*vi), SEEK_CUR) < 0) {
+		perror("Can't seek img");
+		return 1;
+	}
+
+	vi->fd = sh_fd;
+	if (write(fd, vi, sizeof(*vi)) != (ssize_t)sizeof(*vi)) {
+		perror("Can't write img");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the vma entries stored in the image @fd after the magic and the
+ * core entry, and fix up the file-backed and anonymous shared ones.
+ * The walk ends at the entry whose start and end are both zero.
+ *
+ * Returns 0 on success and 1 on a read or fixup failure.
+ */
+static int fixup_vma_fds(int pid, int fd)
+{
+	int offset = sizeof(struct core_entry) + sizeof(u32);
+	struct vma_entry vi;
+
+	pr_info("Seek for: %d bytes\n", offset);
+	lseek(fd, offset, SEEK_SET);
+
+	for (;;) {
+		if (read(fd, &vi, sizeof(vi)) != sizeof(vi)) {
+			perror("Can't read");
+			return 1;
+		}
+
+		if (vi.start == 0 && vi.end == 0)
+			break;
+
+		if (!(vi.status & VMA_AREA_REGULAR))
+			continue;
+
+		if (vi.status & (VMA_FILE_SHARED | VMA_FILE_PRIVATE)) {
+			pr_info("%d: Fixing %016lx-%016lx %016lx vma\n", pid, vi.start, vi.end, vi.pgoff);
+			if (try_fixup_file_map(pid, &vi, fd))
+				return 1;
+		}
+
+		if ((vi.status & VMA_ANON_SHARED) &&
+		    try_fixup_shared_map(pid, &vi, fd))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Tell whether task @pid has to restore the page at @vaddr itself.
+ *
+ * Returns 1 when the address lies in none of the task's shmem regions,
+ * when the region is missing from the shared table, or when the table
+ * names @pid for it; returns 0 otherwise.
+ */
+static inline int should_restore_page(int pid, unsigned long vaddr)
+{
+	struct shmem_info *si;
+	unsigned long id;
+
+	id = find_shmem_id(vaddr);
+	if (id == 0)
+		return 1;
+
+	si = search_shmem(vaddr, id);
+	/*
+	 * The shared table is only built by prepare_shared(), which the
+	 * leader-only restore never calls; nobody else will bring the
+	 * page in then, so restore it here.
+	 */
+	if (si == NULL)
+		return 1;
+
+	return si->pid == pid;
+}
+
+/* Written by fixup_pages_data() as the data of the closing, zero-address page entry; never modified in this file */
+static char zpage[PAGE_SIZE];
+
+/*
+ * Append the shared-memory pages of task @pid to the core image @fd.
+ *
+ * The image must end with a page entry whose address is zero.  That
+ * entry is overwritten with the pages from pages-shmem-<pid>.img that
+ * should_restore_page() accepts, and a new closing entry (zero address
+ * plus a zero page) is written after them.
+ *
+ * Returns 0 on success and 1 on any error.  The shmem image is closed
+ * on every path.
+ */
+static int fixup_pages_data(int pid, int fd)
+{
+	char path[128];
+	int shfd, err = 1;
+	u32 mag = 0;
+	u64 vaddr;
+
+	snprintf(path, sizeof(path), "pages-shmem-%d.img", pid);
+	shfd = open(path, O_RDONLY);
+	if (shfd < 0) {
+		perror("Can't open shmem image");
+		return 1;
+	}
+
+	if (read(shfd, &mag, sizeof(mag)) != (ssize_t)sizeof(mag) ||
+	    mag != PAGES_MAGIC) {
+		fprintf(stderr, "Bad shmem image\n");
+		goto out;
+	}
+
+	/* Find out the last page, which is zero one */
+	if (lseek(fd, -(off_t)sizeof(struct page_entry), SEEK_END) < 0 ||
+	    read(fd, &vaddr, sizeof(vaddr)) != (ssize_t)sizeof(vaddr)) {
+		fprintf(stderr, "Can't read the last page entry\n");
+		goto out;
+	}
+	if (vaddr != 0) {
+		pr_info("SHIT %lx\n", (unsigned long)vaddr);
+		goto out;
+	}
+	/* Step back again so the closing entry gets overwritten */
+	if (lseek(fd, -(off_t)sizeof(struct page_entry), SEEK_END) < 0) {
+		perror("Can't seek to the last page entry");
+		goto out;
+	}
+
+	while (1) {
+		ssize_t nr;
+
+		nr = read(shfd, &vaddr, sizeof(vaddr));
+		if (nr == 0)
+			break;
+
+		if (nr != (ssize_t)sizeof(vaddr)) {
+			perror("Can't read vaddr");
+			goto out;
+		}
+
+		if (vaddr == 0)
+			break;
+
+		if (!should_restore_page(pid, vaddr)) {
+			if (lseek(shfd, PAGE_SIZE, SEEK_CUR) < 0) {
+				perror("Can't skip shmem page");
+				goto out;
+			}
+			continue;
+		}
+
+		if (write(fd, &vaddr, sizeof(vaddr)) != (ssize_t)sizeof(vaddr) ||
+		    sendfile(fd, shfd, NULL, PAGE_SIZE) != (ssize_t)PAGE_SIZE) {
+			perror("Can't copy shmem page");
+			goto out;
+		}
+	}
+
+	vaddr = 0;
+	if (write(fd, &vaddr, sizeof(vaddr)) != (ssize_t)sizeof(vaddr) ||
+	    write(fd, zpage, sizeof(zpage)) != (ssize_t)sizeof(zpage)) {
+		perror("Can't write the last page entry");
+		goto out;
+	}
+
+	err = 0;
+out:
+	close(shfd);
+	return err;
+}
+
+/*
+ * Fix up the vma entries and then the page data of the core image @fd
+ * for task @pid.  The page step is skipped when the vma step fails.
+ * @fd is left open.  Returns 0 on success and 1 on failure.
+ */
+static int prepare_image_maps(int fd, int pid)
+{
+	int failed;
+
+	pr_info("%d: Fixing maps before executing image\n", pid);
+
+	failed = fixup_vma_fds(pid, fd) || fixup_pages_data(pid, fd);
+
+	return failed ? 1 : 0;
+}
+
+/*
+ * Turn core-<pid>.img into an executable and exec it.
+ *
+ * The image is copied to core-<pid>.img.out, the copy is fixed up by
+ * prepare_image_maps() and converted into core-<pid>.elf, which is then
+ * executed.  Stale .out and .elf files are removed first.
+ *
+ * Only returns on failure: 1 for a preparation error, or the result of
+ * the failed execl().  Both descriptors are closed on every path.
+ */
+static int execute_image(int pid)
+{
+	char path[128], elf_path[128];
+	int fd, fd_new = -1;
+	struct stat buf;
+
+	snprintf(path, sizeof(path), "core-%d.img", pid);
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		perror("Can't open exec image");
+		return 1;
+	}
+
+	if (fstat(fd, &buf)) {
+		perror("Can't stat");
+		goto err;
+	}
+
+	snprintf(path, sizeof(path), "core-%d.img.out", pid);
+	unlink(path);
+
+	fd_new = open(path, O_RDWR | O_CREAT | O_EXCL, 0700);
+	if (fd_new < 0) {
+		perror("Can't open new image");
+		goto err;
+	}
+
+	snprintf(elf_path, sizeof(elf_path), "core-%d.elf", pid);
+	unlink(elf_path);
+
+	pr_info("%d: Preparing execution image %s (%li bytes)\n", pid, path, buf.st_size);
+	if (sendfile(fd_new, fd, NULL, buf.st_size) != buf.st_size) {
+		pr_perror("sendfile failed\n");
+		goto err;
+	}
+	close(fd);
+	fd = -1;
+
+	if (fchmod(fd_new, 0700)) {
+		perror("Can't prepare exec image");
+		goto err;
+	}
+
+	if (fstat(fd_new, &buf)) {
+		perror("Can't stat");
+		goto err;
+	}
+
+	pr_info("fd_new: %li bytes\n", buf.st_size);
+
+	if (prepare_image_maps(fd_new, pid))
+		goto err;
+
+	sync();
+
+	if (convert_to_elf(elf_path, fd_new))
+		goto err;
+
+	sync();
+	close(fd_new);
+
+	pr_info("%d/%d EXEC ELF-IMAGE\n", pid, getpid());
+	/* The variadic list must end with a null pointer of type char * */
+	return execl(elf_path, elf_path, (char *)NULL);
+
+err:
+	if (fd >= 0)
+		close(fd);
+	if (fd_new >= 0)
+		close(fd_new);
+	return 1;
+}
+
+/*
+ * Create the pipe described by @e in this task and publish it in @pi so
+ * that other tasks can attach to it.
+ *
+ * If the entry carries data, e->bytes are spliced from the image
+ * @pipes_fd into the pipe.  Both ends and getpid() are stored in @pi,
+ * then the function polls until pi->users equals 1.  Finally the end
+ * selected by e->flags is moved to descriptor e->fd and the other end
+ * is closed.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int create_pipe(int pid, struct pipe_entry *e, struct pipe_info *pi, int pipes_fd)
+{
+ int pfd[2], tmp;
+ unsigned long time = 1000;
+
+ pr_info("\t%d: Creating pipe %x\n", pid, e->pipeid);
+
+ if (pipe(pfd) < 0) {
+ perror("Can't create pipe");
+ return 1;
+ }
+
+ if (e->bytes) {
+ pr_info("\t%d: Splicing data to %d\n", pid, pfd[1]);
+
+ tmp = splice(pipes_fd, NULL, pfd[1], NULL, e->bytes, 0);
+ if (tmp != e->bytes) {
+ fprintf(stderr, "Wanted to restore %d bytes, but got %d\n",
+ e->bytes, tmp);
+ if (tmp < 0)
+ perror("Error splicing data");
+ return 1;
+ }
+ }
+
+ /* real_pid goes last: attach_pipe() polls it before reading the descriptors */
+ pi->read_fd = pfd[0];
+ pi->write_fd = pfd[1];
+ pi->real_pid = getpid();
+
+ pr_info("\t%d: Done, waiting for others on %d pid with r:%d w:%d\n",
+ pid, pi->real_pid, pfd[0], pfd[1]);
+
+ /* Poll with a doubling delay; attach_pipe() decrements users */
+ while (1) {
+ if (pi->users == 1) /* only I left */
+ break;
+
+ pr_info("\t%d: Waiting for %x pipe to attach (%d users left)\n",
+ pid, e->pipeid, pi->users - 1);
+ if (time < 20000000)
+ time <<= 1;
+ usleep(time);
+ }
+
+ pr_info("\t%d: All is ok - reopening pipe for %d\n", pid, e->fd);
+ /* Keep only the end this entry describes */
+ if (e->flags & O_WRONLY) {
+ close(pfd[0]);
+ tmp = reopen_fd_as(e->fd, pfd[1]);
+ } else {
+ close(pfd[1]);
+ tmp = reopen_fd_as(e->fd, pfd[0]);
+ }
+
+ if (tmp < 0) {
+ perror("Can't dup pipe fd");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Attach to a pipe created by another task.
+ *
+ * Waits until @pi carries the creator's pid, opens the end selected by
+ * e->flags through /proc/<real_pid>/fd/<n>, decrements pi->users and
+ * moves the new descriptor to e->fd.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int attach_pipe(int pid, struct pipe_entry *e, struct pipe_info *pi)
+{
+	char path[128];
+	int tmp, fd;
+
+	pr_info("\t%d: Wating for pipe %x to appear\n", pid, e->pipeid);
+
+	while (pi->real_pid == 0)
+		usleep(1000);
+
+	if (e->flags & O_WRONLY)
+		tmp = pi->write_fd;
+	else
+		tmp = pi->read_fd;
+
+	snprintf(path, sizeof(path), "/proc/%d/fd/%d", pi->real_pid, tmp);
+	pr_info("\t%d: Attaching pipe %s\n", pid, path);
+
+	fd = open(path, e->flags);
+	if (fd < 0) {
+		perror("Can't attach pipe");
+		return 1;
+	}
+
+	pr_info("\t%d: Done, reopening for %d\n", pid, e->fd);
+	/*
+	 * The pipes table is shared between the restoring tasks, several
+	 * of which may attach at once; a plain decrement could lose an
+	 * update and leave the creator waiting forever.
+	 */
+	__sync_fetch_and_sub(&pi->users, 1);
+	tmp = reopen_fd_as(e->fd, fd);
+	if (tmp < 0) {
+		perror("Can't dup to attach pipe");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the pipe end described by @e for task @pid.
+ *
+ * When the image descriptor *@pipes_fd sits on the number the pipe end
+ * needs, the image is dup()ed and *@pipes_fd is updated to the copy.
+ * The task recorded in the pipes table creates the pipe; any other
+ * task attaches to it.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int open_pipe(int pid, struct pipe_entry *e, int *pipes_fd)
+{
+	struct pipe_info *pi;
+
+	pr_info("\t%d: Opening pipe %x on fd %d\n", pid, e->pipeid, e->fd);
+	if (e->fd == *pipes_fd) {
+		int moved = dup(*pipes_fd);
+
+		if (moved < 0) {
+			perror("Can't dup file");
+			return 1;
+		}
+
+		*pipes_fd = moved;
+	}
+
+	pi = search_pipe(e->pipeid);
+	if (!pi) {
+		fprintf(stderr, "BUG: can't find my pipe %x\n", e->pipeid);
+		return 1;
+	}
+
+	if (pi->pid != pid)
+		return attach_pipe(pid, e, pi);
+
+	return create_pipe(pid, e, pi, *pipes_fd);
+}
+
+/*
+ * Restore the pipes of task @pid from pipes-<pid>.img, one open_pipe()
+ * call per entry.
+ *
+ * Returns 0 once the whole image is processed and 1 on any error.  The
+ * image is closed on every path.
+ */
+static int prepare_pipes(int pid)
+{
+	char path[64];
+	int pipes_fd, err = 1;
+	u32 type = 0;
+
+	pr_info("%d: Opening pipes\n", pid);
+
+	snprintf(path, sizeof(path), "pipes-%d.img", pid);
+	pipes_fd = open(path, O_RDONLY);
+	if (pipes_fd < 0) {
+		perror("Can't open pipes img");
+		return 1;
+	}
+
+	/* A short read leaves errno meaningless, so do not use perror() */
+	if (read(pipes_fd, &type, sizeof(type)) != (ssize_t)sizeof(type) ||
+	    type != PIPES_MAGIC) {
+		fprintf(stderr, "Bad pipes file %s\n", path);
+		goto out;
+	}
+
+	while (1) {
+		struct pipe_entry e;
+		ssize_t nr;
+
+		nr = read(pipes_fd, &e, sizeof(e));
+		if (nr == 0) {
+			err = 0;
+			break;
+		}
+		if (nr != (ssize_t)sizeof(e)) {
+			if (nr < 0)
+				perror("Bad pipes entry");
+			else
+				fprintf(stderr, "Truncated pipes entry in %s\n", path);
+			break;
+		}
+
+		/* open_pipe() may move the image to another descriptor */
+		if (open_pipe(pid, &e, &pipes_fd))
+			break;
+	}
+
+out:
+	close(pipes_fd);
+	return err;
+}
+
+/*
+ * Restore pipes, files and the shmem id list of task @pid, in that
+ * order, then execute its image.  Stops at the first failing step and
+ * returns 1; otherwise returns whatever execute_image() returns.
+ */
+static int restore_one_task(int pid)
+{
+	pr_info("%d: Restoring resources\n", pid);
+
+	if (prepare_pipes(pid) || prepare_fds(pid) || prepare_shmem(pid))
+		return 1;
+
+	return execute_image(pid);
+}
+
+/*
+ * Entry point handed to clone() by fork_with_pid().  @arg is the pstree
+ * image path; the task restores itself under the pid getpid() reports.
+ */
+static int do_child(void *arg)
+{
+ return restore_task_with_children(getpid(), arg);
+}
+
+/*
+ * Start a child running do_child() on @pstree_path, passing @pid to
+ * clone() together with the CLONE_CHILD_USEPID flag.
+ *
+ * Returns what clone() returned on success, and a negative value if the
+ * stack cannot be mapped or clone() fails.
+ */
+static inline int fork_with_pid(int pid, char *pstree_path)
+{
+	const size_t stack_size = 4 * 4096;
+	int ret = 0;
+	void *stack;
+
+	/* Anonymous mappings take fd -1; some systems insist on it */
+	stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, -1, 0);
+	if (stack == MAP_FAILED) {
+		pr_perror("mmap failed");
+		return -1;
+	}
+
+	/* clone() wants the top of the stack area */
+	ret = clone(do_child, (char *)stack + stack_size,
+		    SIGCHLD | CLONE_CHILD_USEPID, pstree_path, NULL, NULL, &pid);
+	if (ret < 0)
+		pr_perror("clone failed\n");
+
+	/*
+	 * CLONE_VM is not passed, so the child runs on its own copy of the
+	 * mapping and the parent's copy can go.
+	 */
+	munmap(stack, stack_size);
+
+	return ret;
+}
+
+/*
+ * Restore task @my_pid and start all of its children.
+ *
+ * The pstree image @pstree_path is scanned for the entry of @my_pid,
+ * one child is forked per pid listed there, and then this task restores
+ * itself with restore_one_task().
+ *
+ * The process exits with status 1 on any error before that last step;
+ * otherwise the result of restore_one_task() is returned.
+ */
+static int restore_task_with_children(int my_pid, char *pstree_path)
+{
+	int *pids;
+	int fd, ret, i;
+	struct pstree_entry e;
+
+	pr_info("%d: Starting restore\n", my_pid);
+
+	fd = open(pstree_path, O_RDONLY);
+	if (fd < 0) {
+		perror("Can't reopen pstree image");
+		exit(1);
+	}
+
+	/* Skip the magic, then look for our own entry */
+	lseek(fd, sizeof(u32), SEEK_SET);
+	while (1) {
+		ret = read(fd, &e, sizeof(e));
+		if (ret != sizeof(e)) {
+			fprintf(stderr, "%d: Read returned %d\n", my_pid, ret);
+			if (ret < 0)
+				perror("Can't read pstree");
+			exit(1);
+		}
+
+		if (e.pid != my_pid) {
+			lseek(fd, e.nr_children * sizeof(u32), SEEK_CUR);
+			continue;
+		}
+
+		break;
+	}
+
+	if (e.nr_children > 0) {
+		size_t size = e.nr_children * sizeof(int);
+
+		pids = malloc(size);
+		if (pids == NULL) {
+			perror("Can't allocate children pids");
+			exit(1);
+		}
+
+		ret = read(fd, pids, size);
+		if (ret < 0 || (size_t)ret != size) {
+			perror("Can't read children pids");
+			exit(1);
+		}
+
+		close(fd);
+
+		pr_info("%d: Restoring %d children:\n", my_pid, e.nr_children);
+		for (i = 0; i < e.nr_children; i++) {
+			pr_info("\tFork %d from %d\n", pids[i], my_pid);
+			ret = fork_with_pid(pids[i], pstree_path);
+			if (ret < 0)
+				exit(1);
+		}
+
+		free(pids);
+	} else
+		close(fd);
+
+	shmem_update_real_pid(my_pid, getpid());
+
+	return restore_one_task(my_pid);
+}
+
+/*
+ * Start the root task of the tree.
+ *
+ * Reads the next pstree entry from @fd, closes @fd, forks a child with
+ * the pid from that entry and waits for one child to change state.
+ *
+ * Returns 0 once wait() returns, and 1 if the entry cannot be read or
+ * the fork fails.
+ */
+static int restore_root_task(char *pstree_path, int fd)
+{
+ struct pstree_entry e;
+ int ret;
+
+ ret = read(fd, &e, sizeof(e));
+ if (ret != sizeof(e)) {
+ perror("Can't read root pstree entry");
+ return 1;
+ }
+
+ /* The child reopens the image by path, this descriptor is done */
+ close(fd);
+
+ pr_info("Forking root with %d pid\n", e.pid);
+ ret = fork_with_pid(e.pid, pstree_path);
+ if (ret < 0)
+ return 1;
+
+ /* NOTE(review): the child's exit status is discarded -- confirm that is intended */
+ wait(NULL);
+ return 0;
+}
+
+/*
+ * Restore the whole process tree saved in pstree-<pid>.img.
+ *
+ * Checks the image magic, builds the shared tables with
+ * prepare_shared() and hands the image over to restore_root_task().
+ *
+ * Returns 1 on a preparation failure, otherwise the result of
+ * restore_root_task().
+ */
+static int restore_all_tasks(pid_t pid)
+{
+	char path[128];
+	int pstree_fd;
+	u32 type = 0;
+
+	snprintf(path, sizeof(path), "pstree-%d.img", pid);
+	pstree_fd = open(path, O_RDONLY);
+	if (pstree_fd < 0) {
+		perror("Can't open pstree image");
+		return 1;
+	}
+
+	/* A short read leaves errno meaningless, so do not use perror() */
+	if (read(pstree_fd, &type, sizeof(type)) != (ssize_t)sizeof(type) ||
+	    type != PSTREE_MAGIC) {
+		fprintf(stderr, "Bad pstree magic in %s\n", path);
+		close(pstree_fd);
+		return 1;
+	}
+
+	if (prepare_shared(pstree_fd)) {
+		close(pstree_fd);
+		return 1;
+	}
+
+	return restore_root_task(path, pstree_fd);
+}
+
+/*
+ * Restore only task @pid when @leader_only is set, the whole tree saved
+ * for @pid otherwise.  @leave_stopped is not used.
+ */
+int cr_restore_tasks(pid_t pid, bool leader_only, int leave_stopped)
+{
+	return leader_only ? restore_one_task(pid) : restore_all_tasks(pid);
+}
diff --git a/cr-show.c b/cr-show.c
new file mode 100644
index 000000000..dd3128f1d
--- /dev/null
+++ b/cr-show.c
@@ -0,0 +1,389 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <dirent.h>
+
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+
+#include "types.h"
+#include "list.h"
+
+#include "compiler.h"
+#include "crtools.h"
+#include "syscall.h"
+#include "util.h"
+
+#include "image.h"
+
+#ifndef CONFIG_X86_64
+# error No x86-32 support yet
+#endif
+
+/*
+ * Print four (pr_regs4) or three (pr_regs3) members of the structure @s
+ * on one line through pr_info().  Each member is shown as its name
+ * followed by its value formatted with %16lx.
+ */
+#define pr_regs4(s, n1, n2, n3, n4) \
+ pr_info("%8s: %16lx " \
+ "%8s: %16lx " \
+ "%8s: %16lx " \
+ "%8s: %16lx\n", \
+ #n1, s.n1, \
+ #n2, s.n2, \
+ #n3, s.n3, \
+ #n4, s.n4)
+
+#define pr_regs3(s, n1, n2, n3) \
+ pr_info("%8s: %16lx " \
+ "%8s: %16lx " \
+ "%8s: %16lx\n", \
+ #n1, s.n1, \
+ #n2, s.n2, \
+ #n3, s.n3)
+
+/* Scratch buffer show_files() reads file names into */
+static char local_buf[PAGE_SIZE];
+/* Process tree items, filled by collect_pstree() and released by free_pstree() */
+static LIST_HEAD(pstree_list);
+
+/* FIXME: same as dump -- unify */
+/* Free every item on pstree_list together with its children array, then reset the list head. */
+static void free_pstree(void)
+{
+ struct pstree_item *item, *p;
+
+ /* The _safe iterator is needed because each item is freed inside the loop */
+ list_for_each_entry_safe(item, p, &pstree_list, list) {
+ xfree(item->children);
+ xfree(item);
+ }
+
+ INIT_LIST_HEAD(&pstree_list);
+}
+
+/*
+ * Print the general purpose registers and the TLS descriptors stored in
+ * the core image of @cr_fdset.  Does nothing if the core image is not
+ * open.
+ */
+static void show_regs(struct cr_fdset *cr_fdset)
+{
+ struct user_regs_entry regs;
+ struct desc_struct tls;
+ int fd_core, i;
+
+ fd_core = cr_fdset->desc[CR_FD_CORE].fd;
+ if (fd_core < 0)
+ goto err;
+
+ pr_info("\n\t---[GP registers set]---\n");
+
+ lseek(fd_core, GET_FILE_OFF(struct core_entry, gpregs), SEEK_SET);
+
+ /* presumably jumps to err when the read fails -- confirm in read_ptr_safe() */
+ read_ptr_safe(fd_core, &regs, err);
+
+ pr_regs4(regs, cs, ip, ds, es);
+ pr_regs4(regs, ss, sp, fs, gs);
+ pr_regs4(regs, di, si, dx, cx);
+ pr_regs4(regs, ax, r8, r9, r10);
+ pr_regs4(regs, r11, r12, r13, r14);
+ pr_regs3(regs, r15, bp, bx);
+ pr_regs4(regs, orig_ax, flags, fs_base, gs_base);
+
+ pr_info("\n\t---[TLS area]---\n");
+
+ lseek(fd_core, GET_FILE_OFF(struct core_entry, tls_array), SEEK_SET);
+
+ /* The descriptors are read one after another from the seek position */
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
+ read_ptr_safe(fd_core, &tls, err);
+ pr_info("tls[%2i] = %x %x\n", i, tls.a, tls.b);
+ }
+
+err:
+ return;
+}
+
+/*
+ * Print every entry of the fdinfo image of @cr_fdset, together with the
+ * file name stored after the entry when its length is non-zero.
+ * Printing stops at the end of the image or at the first bad read.
+ */
+static void show_files(struct cr_fdset *cr_fdset)
+{
+ struct fdinfo_entry e;
+ int fd_files, ret;
+
+ pr_info("\n");
+ pr_info("CR_FD_FDINFO: %s\n", cr_fdset->desc[CR_FD_FDINFO].name);
+ pr_info("----------------------------------------\n");
+
+ fd_files = cr_fdset->desc[CR_FD_FDINFO].fd;
+
+ /* Entries start right after the magic */
+ lseek(fd_files, MAGIC_OFFSET, SEEK_SET);
+
+ while (1) {
+ ret = read(fd_files, &e, sizeof(e));
+ if (!ret)
+ goto err;
+ if (ret != sizeof(e)) {
+ pr_perror("Can't read fdinfo entry");
+ goto err;
+ }
+
+ if (e.len) {
+ /*
+ * NOTE(review): e.len comes from the image and indexes the
+ * PAGE_SIZE byte local_buf unchecked -- confirm the field is
+ * too narrow to reach past the buffer.
+ */
+ ret = read(fd_files, local_buf, e.len);
+ if (ret != e.len) {
+ pr_perror("Can't read %d bytes\n", e.len);
+ goto err;
+ }
+ local_buf[e.len] = 0;
+ pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx --> %s\n",
+ e.type, e.len, e.flags, e.pos, e.addr, local_buf);
+ } else
+ pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx\n",
+ e.type, e.len, e.flags, e.pos, e.addr);
+ }
+
+ /* Reached at the normal end of the image too */
+err:
+ pr_info("----------------------------------------\n");
+}
+
+/*
+ * Print every entry of the pipes image of @cr_fdset, stepping over the
+ * pipe data stored after each entry.  Printing stops at the end of the
+ * image or at the first bad read.
+ */
+static void show_pipes(struct cr_fdset *cr_fdset)
+{
+	struct pipe_entry e;
+	int fd_pipes, ret;
+
+	pr_info("\n");
+	pr_info("CR_FD_PIPES: %s\n", cr_fdset->desc[CR_FD_PIPES].name);
+	pr_info("----------------------------------------\n");
+
+	fd_pipes = cr_fdset->desc[CR_FD_PIPES].fd;
+
+	lseek(fd_pipes, MAGIC_OFFSET, SEEK_SET);
+
+	for (;;) {
+		ret = read(fd_pipes, &e, sizeof(e));
+		if (ret == 0)
+			break;
+
+		if (ret != sizeof(e)) {
+			pr_perror("Can't read pipe entry\n");
+			break;
+		}
+
+		pr_info("fd: %8lx pipeid: %8lx flags: %8lx bytes: %8lx\n",
+			e.fd, e.pipeid, e.flags, e.bytes);
+
+		if (e.bytes)
+			lseek(fd_pipes, e.bytes, SEEK_CUR);
+	}
+
+	pr_info("----------------------------------------\n");
+}
+
+/*
+ * Print the core image of @cr_fdset: the registers, then every vma
+ * entry up to the ending one, then the address of every saved page.
+ * The page list ends at a zero address or at the end of the image.
+ */
+static void show_core(struct cr_fdset *cr_fdset)
+{
+ struct vma_area vma_area = {};
+ struct vma_entry ve;
+ int fd_core, ret;
+ u64 va;
+
+ pr_info("\n");
+ pr_info("CR_FD_CORE: %s\n", cr_fdset->desc[CR_FD_CORE].name);
+ pr_info("----------------------------------------\n");
+
+ fd_core = cr_fdset->desc[CR_FD_CORE].fd;
+ if (fd_core < 0)
+ goto out;
+
+ show_regs(cr_fdset);
+
+ /* show_regs() moved the file position, seek to the first vma entry */
+ lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
+
+ /*
+ * Start with VMA, then pages.
+ */
+ pr_info("\n\t---[VMA areas]---\n");
+ while (1) {
+ ret = read(fd_core, &ve, sizeof(ve));
+ if (!ret)
+ break;
+ if (ret != sizeof(ve)) {
+ pr_perror("Unable to read VMA\n");
+ goto out;
+ }
+
+ /* The pages follow the ending vma; this inner loop never falls through */
+ if (is_ending_vma(&ve)) {
+ pr_info("\n\t---[Pages]---\n");
+ while (1) {
+ ret = read(fd_core, &va, sizeof(va));
+ if (!ret)
+ goto out;
+ if (ret != sizeof(va)) {
+ pr_perror("Unable to read VA\n");
+ goto out;
+ }
+ if (va == 0)
+ goto out;
+ pr_info("page va: %16lx\n", va);
+ /* Only the address is shown, step over the page data */
+ lseek(fd_core, PAGE_SIZE, SEEK_CUR);
+ }
+ }
+
+ /* Simply in a sake of fancy printing */
+ vma_area.vma = ve;
+ pr_info_vma(&vma_area);
+ }
+
+out:
+ pr_info("----------------------------------------\n");
+}
+
+/*
+ * Print the process tree stored in the image @fd, which must already be
+ * positioned at the first entry.  @name is only used for the heading.
+ * Printing stops at the end of the image or at the first bad read.
+ */
+static void show_pstree_from_file(int fd, char *name)
+{
+	int ret;
+
+	pr_info("\n");
+	pr_info("CR_FD_PSTREE: %s\n", name);
+	pr_info("----------------------------------------\n");
+
+	while (1) {
+		struct pstree_entry e;
+		unsigned long i;
+		u32 child_pid;
+
+		ret = read(fd, &e, sizeof(e));
+		if (!ret)
+			break;
+		if (ret != sizeof(e)) {
+			pr_perror("Bad pstree entry");
+			break;
+		}
+
+		pr_info("Process %d number of children: %d\n",
+			e.pid, e.nr_children);
+
+		for (i = 0; i < e.nr_children; i++) {
+			ret = read(fd, &child_pid,
+				   sizeof(child_pid));
+			/* Do not print a pid that was never read */
+			if (ret != sizeof(child_pid)) {
+				pr_perror("Bad pstree child pid");
+				goto out;
+			}
+			pr_info(" %d", child_pid);
+		}
+		if (e.nr_children)
+			pr_info("\n");
+	}
+
+out:
+	pr_info("----------------------------------------\n");
+}
+
+/*
+ * Print every item on the list @head: the pid, the number of children
+ * and the children pids.  @name is only used for the heading.
+ */
+static void show_pstree(struct list_head *head, char *name)
+{
+	struct pstree_item *cur;
+	int child;
+
+	pr_info("\n");
+	pr_info("CR_FD_PSTREE: %s\n", name);
+	pr_info("----------------------------------------\n");
+
+	list_for_each_entry(cur, head, list) {
+		pr_info("Process %d number of children: %d\n",
+			cur->pid, cur->nr_children);
+
+		if (!cur->nr_children)
+			continue;
+
+		for (child = 0; child < cur->nr_children; child++)
+			pr_info(" %d", cur->children[child]);
+		pr_info("\n");
+	}
+
+	pr_info("----------------------------------------\n");
+}
+
+/*
+ * Read the pstree image of @cr_fdset and append one item per entry to
+ * pstree_list.  @pid is not used.
+ *
+ * Returns 0 on success and -1 on a read or allocation failure.  Items
+ * already on the list stay there for free_pstree() to release.
+ */
+static int collect_pstree(pid_t pid, struct cr_fdset *cr_fdset)
+{
+	int fd = cr_fdset->desc[CR_FD_PSTREE].fd;
+	struct pstree_item *item = NULL;
+	struct pstree_entry e;
+	int ret = -1;
+
+	for (;;) {
+		size_t size;
+
+		ret = read(fd, &e, sizeof(e));
+		if (!ret)
+			break;
+
+		if (ret != sizeof(e)) {
+			pr_perror("Wrong pstree entry\n");
+			ret = -1;
+			goto err;
+		}
+
+		ret = -1;
+
+		item = xmalloc(sizeof(*item));
+		if (!item)
+			goto err;
+
+		size = sizeof(u32) * e.nr_children;
+
+		item->pid = e.pid;
+		item->nr_children = e.nr_children;
+		item->children = xmalloc(size);
+
+		/* A zero-sized allocation may return NULL, that is no error */
+		if (size && !item->children) {
+			pr_error("No memory for children pids\n");
+			goto err;
+		}
+
+		if (size) {
+			ret = read(fd, item->children, size);
+			if (ret < 0 || (size_t)ret != size) {
+				pr_error("An error in reading children pids\n");
+				xfree(item->children);
+				ret = -1;
+				goto err;
+			}
+		}
+
+		list_add_tail(&item->list, &pstree_list);
+		/*
+		 * The list owns the item now.  Forget it, so that a failure
+		 * on a later entry does not free it here and once more in
+		 * free_pstree().
+		 */
+		item = NULL;
+	}
+
+	ret = 0;
+
+err:
+	xfree(item);
+	return ret;
+}
+
+/*
+ * Print the images saved for @pid: the process tree first, then the
+ * core, pipes and files images of every task in it -- or of the first
+ * task only when @leader_only is set.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int cr_show(unsigned long pid, bool leader_only)
+{
+	struct cr_fdset *cr_fdset;
+	struct pstree_item *item;
+	int ret = -1;
+
+	cr_fdset = alloc_cr_fdset(pid);
+	if (!cr_fdset)
+		goto out;
+
+	ret = prep_cr_fdset_for_restore(cr_fdset, CR_FD_DESC_ALL);
+	if (ret)
+		goto out;
+
+	ret = collect_pstree(pid, cr_fdset);
+	if (ret)
+		goto out;
+
+	show_pstree(&pstree_list, cr_fdset->desc[CR_FD_PSTREE].name);
+
+	close_cr_fdset(cr_fdset);
+	free_cr_fdset(&cr_fdset);
+
+	list_for_each_entry(item, &pstree_list, list) {
+
+		/* Report a failed allocation instead of the previous success */
+		ret = -1;
+		cr_fdset = alloc_cr_fdset(item->pid);
+		if (!cr_fdset)
+			goto out;
+
+		ret = prep_cr_fdset_for_restore(cr_fdset, CR_FD_DESC_NOPSTREE);
+		if (ret)
+			goto out;
+
+		show_core(cr_fdset);
+		show_pipes(cr_fdset);
+		show_files(cr_fdset);
+
+		/* Release this task's set before the next one replaces it */
+		close_cr_fdset(cr_fdset);
+		free_cr_fdset(&cr_fdset);
+		cr_fdset = NULL;
+
+		if (leader_only)
+			break;
+	}
+
+out:
+	free_pstree();
+	close_cr_fdset(cr_fdset);
+	free_cr_fdset(&cr_fdset);
+	return ret;
+}
diff --git a/crtools.c b/crtools.c
new file mode 100644
index 000000000..c76ca337e
--- /dev/null
+++ b/crtools.c
@@ -0,0 +1,280 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <dirent.h>
+
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <sys/sendfile.h>
+
+#include "types.h"
+#include "list.h"
+
+#include "compiler.h"
+#include "crtools.h"
+#include "util.h"
+
+struct page_entry zero_page_entry;
+
+/*
+ * Per-image-type file name format (takes the pid as a long) and the
+ * magic number expected at the start of the file, indexed by CR_FD_*.
+ */
+static struct cr_fd_desc_tmpl template[CR_FD_MAX] = {
+	[CR_FD_FDINFO]		= { .fmt = "fdinfo-%li.img",	  .magic = FDINFO_MAGIC, },
+	[CR_FD_PAGES]		= { .fmt = "pages-%li.img",	  .magic = PAGES_MAGIC,  },
+	[CR_FD_PAGES_SHMEM]	= { .fmt = "pages-shmem-%li.img", .magic = PAGES_MAGIC,  },
+	[CR_FD_CORE]		= { .fmt = "core-%li.img",	  .magic = CORE_MAGIC,   },
+	[CR_FD_PIPES]		= { .fmt = "pipes-%li.img",	  .magic = PIPES_MAGIC,  },
+	[CR_FD_PSTREE]		= { .fmt = "pstree-%li.img",	  .magic = PSTREE_MAGIC, },
+	[CR_FD_SHMEM]		= { .fmt = "shmem-%li.img",	  .magic = SHMEM_MAGIC,  },
+};
+
+/*
+ * Allocate a descriptor set for @pid.
+ *
+ * Every slot gets its template, its file name built from the
+ * template format and @pid, and fd = -1 (nothing is opened here).
+ *
+ * Returns the new set, or NULL if the allocation failed.
+ */
+struct cr_fdset *alloc_cr_fdset(pid_t pid)
+{
+	struct cr_fdset *set = xzalloc(sizeof(*set));
+	unsigned int idx;
+
+	if (!set)
+		return NULL;
+
+	for (idx = 0; idx < CR_FD_MAX; idx++) {
+		struct cr_fd_desc *d = &set->desc[idx];
+
+		d->tmpl = &template[idx];
+		snprintf(d->name, sizeof(d->name), d->tmpl->fmt, (long)pid);
+		d->fd = -1;
+	}
+
+	return set;
+}
+
+/*
+ * Create fresh image files for every descriptor selected by
+ * @use_mask and write the magic number into each of them.
+ *
+ * An existing file of the same name is unlinked first.  Descriptors
+ * opened before a failure are left open in the set.
+ *
+ * Returns 0 on success, -1 on failure (including a NULL @cr_fdset).
+ */
+int prep_cr_fdset_for_dump(struct cr_fdset *cr_fdset,
+			    unsigned long use_mask)
+{
+	unsigned int idx;
+	int ret = -1;
+	u32 magic;
+
+	if (!cr_fdset)
+		goto err;
+
+	cr_fdset->use_mask = use_mask;
+
+	for (idx = 0; idx < CR_FD_MAX; idx++) {
+		struct cr_fd_desc *d = &cr_fdset->desc[idx];
+
+		if (!(use_mask & CR_FD_DESC_USE(idx)))
+			continue;
+
+		/* A missing file is fine, anything else is fatal */
+		if (unlink(d->name) && errno != ENOENT) {
+			pr_perror("Unable to unlink %s (%s)\n",
+				  d->name,
+				  strerror(errno));
+			goto err;
+		}
+
+		d->fd = open(d->name, O_RDWR | O_CREAT | O_EXCL, CR_FD_PERM);
+		if (d->fd < 0) {
+			pr_perror("Unable to open %s (%s)\n",
+				  d->name,
+				  strerror(errno));
+			goto err;
+		}
+
+		pr_debug("Opened %s with %d\n", d->name, d->fd);
+
+		magic = d->tmpl->magic;
+		write_ptr_safe(d->fd, &magic, err);
+
+		/*
+		 * Make sure it's on disk since we might
+		 * need to re-open files in parasite.
+		 */
+		fsync(d->fd);
+	}
+
+	ret = 0;
+err:
+	return ret;
+}
+
+/*
+ * Open the existing image files selected by @use_mask and verify
+ * that each one starts with the magic number from its template.
+ *
+ * Descriptors opened before a failure are left open in the set.
+ *
+ * Returns 0 on success, -1 on failure (including a NULL @cr_fdset).
+ */
+int prep_cr_fdset_for_restore(struct cr_fdset *cr_fdset,
+			       unsigned long use_mask)
+{
+	unsigned int idx;
+	int ret = -1;
+	u32 magic;
+
+	if (!cr_fdset)
+		goto err;
+
+	cr_fdset->use_mask = use_mask;
+
+	for (idx = 0; idx < CR_FD_MAX; idx++) {
+		struct cr_fd_desc *d = &cr_fdset->desc[idx];
+
+		if (!(use_mask & CR_FD_DESC_USE(idx)))
+			continue;
+
+		d->fd = open(d->name, O_RDWR, CR_FD_PERM);
+		if (d->fd < 0) {
+			pr_perror("Unable to open %s (%s)\n",
+				  d->name,
+				  strerror(errno));
+			goto err;
+		}
+
+		pr_debug("Opened %s with %d\n", d->name, d->fd);
+
+		read_ptr_safe(d->fd, &magic, err);
+		if (magic != d->tmpl->magic) {
+			pr_error("Magic doesn't match for %s\n", d->name);
+			goto err;
+		}
+	}
+
+	ret = 0;
+err:
+	return ret;
+}
+
+/*
+ * Close every open descriptor of @cr_fdset that is selected by its
+ * use_mask and mark it as closed (fd = -1).  A NULL set is a no-op.
+ */
+void close_cr_fdset(struct cr_fdset *cr_fdset)
+{
+	unsigned int idx;
+
+	if (!cr_fdset)
+		return;
+
+	for (idx = 0; idx < CR_FD_MAX; idx++) {
+		struct cr_fd_desc *d = &cr_fdset->desc[idx];
+
+		if (!(cr_fdset->use_mask & CR_FD_DESC_USE(idx)))
+			continue;
+		if (d->fd < 0)
+			continue;
+
+		pr_debug("Closed %s with %d\n", d->name, d->fd);
+		close(d->fd);
+		d->fd = -1;
+	}
+}
+
+/*
+ * Free the set pointed to by *@cr_fdset and reset the caller's
+ * pointer to NULL.  NULL @cr_fdset or NULL *@cr_fdset is a no-op.
+ * Descriptors are not closed here.
+ */
+void free_cr_fdset(struct cr_fdset **cr_fdset)
+{
+	if (!cr_fdset || !*cr_fdset)
+		return;
+
+	free(*cr_fdset);
+	*cr_fdset = NULL;
+}
+
+/*
+ * Entry point: crtools (dump|show|restore) (-p|-t) pid
+ *
+ * The second character of the switch selects the mode: 'p' handles
+ * the given task only (leader_only), 't' handles the whole tree.
+ *
+ * Returns the result of the selected action, or -1 after printing
+ * the usage text when the command line is not understood.
+ */
+int main(int argc, char *argv[])
+{
+	bool leader_only;
+	pid_t pid;
+
+	BUILD_BUG_ON(PAGE_SIZE != PAGE_IMAGE_SIZE);
+
+	/* Command, switch and pid are all mandatory: argv[1..3] */
+	if (argc < 4)
+		goto usage;
+
+	/* argv[2][1] may only be inspected if the switch is not empty */
+	if (argv[2][0] == '\0')
+		goto usage;
+
+	switch (argv[2][1]) {
+	case 'p':
+		leader_only = true;
+		break;
+	case 't':
+		leader_only = false;
+		break;
+	default:
+		goto usage;
+	}
+
+	pid = atol(argv[3]);
+
+	memset(&zero_page_entry, 0, sizeof(zero_page_entry));
+
+	if (!strcmp(argv[1], "dump"))
+		return cr_dump_tasks(pid, leader_only, 1);
+
+	if (!strcmp(argv[1], "restore"))
+		return cr_restore_tasks(pid, leader_only, 1);
+
+	if (!strcmp(argv[1], "show"))
+		return cr_show(pid, leader_only);
+
+usage:
+	printk("\nUsage:\n");
+	printk("\tcrtools (dump|show|restore) (-p|-t) pid\n\n");
+	return -1;
+}
diff --git a/elf.c b/elf.c
new file mode 100644
index 000000000..91f967abc
--- /dev/null
+++ b/elf.c
@@ -0,0 +1,213 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <dirent.h>
+
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+
+#include <sys/sendfile.h>
+
+#include "types.h"
+#include "list.h"
+
+#include "compiler.h"
+#include "crtools.h"
+#include "syscall.h"
+#include "util.h"
+
+#include "image.h"
+#include "elf.h"
+
+#define ELF_MAX_PHDR ((65536U / sizeof(Elf64_Phdr)) - 1)
+#define ELF_MAX_PAGES (1 << 10)
+
+/*
+ * Convert the c/r core file into elf
+ * executable, the kernel will handle it.
+ *
+ * Layout of the file written to @elf_path:
+ *   - ELF header
+ *   - one PT_CKPT_VMA program header per VMA record in @fd_core
+ *   - one PT_CKPT_CORE program header
+ *   - one PT_CKPT_PAGES program header
+ *   - the vma_entry records (terminating zero record excluded)
+ *   - the core_entry
+ *   - the page entries, copied verbatim from @fd_core
+ *
+ * @elf_path is opened with O_CREAT | O_EXCL, so it must not exist.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int convert_to_elf(char *elf_path, int fd_core)
+{
+	Elf64_Ehdr elf_ehdr;
+	Elf64_Phdr elf_phdr;
+
+	Elf64_Half e_phnum = 0;
+
+	struct page_entry page_entry;
+	unsigned long nrpages = 0;
+	struct core_entry core;
+	struct vma_area area;
+	struct vma_entry vma;
+	u64 va;
+
+	unsigned long phoff = 0;
+
+	int fd_elf;
+	int ret = -1;
+
+	fd_elf = open(elf_path, O_RDWR | O_CREAT | O_EXCL, 0700);
+	if (fd_elf < 0) {
+		pr_perror("Can't open %s\n", elf_path);
+		goto err;
+	}
+
+	memset(&elf_ehdr, 0, sizeof(elf_ehdr));
+	memset(&area, 0, sizeof(area));
+
+	memcpy(elf_ehdr.e_ident, ELFMAG, SELFMAG);
+	elf_ehdr.e_ident[EI_CLASS]	= ELFCLASS64;
+	elf_ehdr.e_ident[EI_DATA]	= ELFDATA2LSB;
+	elf_ehdr.e_ident[EI_VERSION]	= EV_CURRENT;
+
+	elf_ehdr.e_type		= ET_CKPT;
+	elf_ehdr.e_machine	= EM_X86_64;
+	elf_ehdr.e_version	= EV_CURRENT;
+	elf_ehdr.e_phoff	= sizeof(elf_ehdr);
+	elf_ehdr.e_ehsize	= sizeof(elf_ehdr);
+	elf_ehdr.e_phentsize	= sizeof(Elf64_Phdr);
+
+	/* Get EP */
+	lseek(fd_core, MAGIC_OFFSET, SEEK_SET);
+	read_ptr_safe(fd_core, &core, err_close);
+
+	/*
+	 * Count the numbers of segments. Each segment
+	 * is the VMA record with appropriate permissions.
+	 * Then we need one big segment which would hold
+	 * all the pages dumped.
+	 */
+	lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
+	while (1) {
+		read_ptr_safe(fd_core, &vma, err_close);
+		if (vma.start == 0 && vma.end == 0)
+			break;
+		e_phnum++;
+	}
+
+	/* The terminating entry (va == 0) is counted as well */
+	while (1) {
+		read_ptr_safe(fd_core, &va, err_close);
+		nrpages++;
+		if (va == 0)
+			break;
+		lseek(fd_core, PAGE_SIZE, SEEK_CUR);
+	}
+
+	/* Figure out if we're overflowed */
+	if (e_phnum > ELF_MAX_PHDR) {
+		pr_error("Too many VMA areas (%li of %li allowed)\n",
+			 (long)e_phnum, (long)ELF_MAX_PHDR);
+		goto err_close;
+	} else if (nrpages > ELF_MAX_PAGES) {
+		pr_error("Too many pages to restore (%li of %li allowed)\n",
+			 (long)nrpages, (long)ELF_MAX_PAGES);
+		goto err_close;
+	}
+
+	/*
+	 * We can write elf header now.
+	 */
+	lseek(fd_elf, 0, SEEK_SET);
+	elf_ehdr.e_phnum = e_phnum + 2;	/* VMAs + core + pages */
+	elf_ehdr.e_entry = core.gpregs.ip;
+	write_ptr_safe(fd_elf, &elf_ehdr, err_close);
+
+	/* Offset in file (after all headers) */
+	phoff = elf_ehdr.e_phnum * sizeof(elf_phdr) + sizeof(elf_ehdr);
+
+	/* VMAs to headers */
+	e_phnum = 0;
+	lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
+	while (1) {
+		read_ptr_safe(fd_core, &vma, err_close);
+		if (vma.start == 0 && vma.end == 0)
+			break;
+
+		memset(&elf_phdr, 0, sizeof(elf_phdr));
+
+		elf_phdr.p_type		= PT_CKPT_VMA;
+		elf_phdr.p_offset	= phoff;
+		elf_phdr.p_vaddr	= vma.start;
+		elf_phdr.p_paddr	= vma.start;
+		elf_phdr.p_filesz	= sizeof(vma);
+		elf_phdr.p_memsz	= vma.end - vma.start;
+		elf_phdr.p_align	= 0x1000;
+
+		if (vma.prot & PROT_READ)
+			elf_phdr.p_flags |= PF_R;
+		if (vma.prot & PROT_WRITE)
+			elf_phdr.p_flags |= PF_W;
+		if (vma.prot & PROT_EXEC)
+			elf_phdr.p_flags |= PF_X;
+
+		write_ptr_safe(fd_elf, &elf_phdr, err_close);
+
+		phoff += sizeof(vma);
+	}
+
+	/* The binfmt header */
+	memset(&elf_phdr, 0, sizeof(elf_phdr));
+
+	elf_phdr.p_type		= PT_CKPT_CORE;
+	elf_phdr.p_flags	= PF_R;
+	elf_phdr.p_offset	= phoff;
+	elf_phdr.p_vaddr	= 0;
+	elf_phdr.p_filesz	= sizeof(core);
+	elf_phdr.p_memsz	= sizeof(core);
+	elf_phdr.p_align	= 0x1000;
+
+	write_ptr_safe(fd_elf, &elf_phdr, err_close);
+
+	phoff += sizeof(core);
+
+	/* The pages and binfmt header */
+	memset(&elf_phdr, 0, sizeof(elf_phdr));
+
+	elf_phdr.p_type		= PT_CKPT_PAGES;
+	elf_phdr.p_flags	= PF_R;
+	elf_phdr.p_offset	= phoff;
+	elf_phdr.p_vaddr	= 0;
+	elf_phdr.p_filesz	= nrpages * (sizeof(page_entry));
+	elf_phdr.p_memsz	= nrpages * (sizeof(page_entry));
+	elf_phdr.p_align	= 0x1000;
+
+	write_ptr_safe(fd_elf, &elf_phdr, err_close);
+
+	/* Now write real contents for program segments */
+	lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
+	while (1) {
+		read_ptr_safe(fd_core, &vma, err_close);
+		if (vma.start == 0 && vma.end == 0)
+			break;
+		area.vma = vma;
+		pr_info_vma(&area);
+		write_ptr_safe(fd_elf, &vma, err_close);
+	}
+
+	write_ptr_safe(fd_elf, &core, err_close);
+
+	/*
+	 * fd_core now sits right behind the terminating VMA record,
+	 * i.e. at the first page entry; copy all of them in one go.
+	 */
+	if (sendfile(fd_elf, fd_core, NULL, nrpages * (sizeof(page_entry))) !=
+	    nrpages * (sizeof(page_entry))) {
+		pr_perror("Can't send %li bytes to elf\n",
+			  (long)(nrpages * (sizeof(page_entry))));
+		/* fd_elf is open here, it must be closed */
+		goto err_close;
+	}
+
+	ret = 0;
+
+err_close:
+	close(fd_elf);
+err:
+	return ret;
+}
diff --git a/gen-offsets.sh b/gen-offsets.sh
new file mode 100644
index 000000000..0948aed1c
--- /dev/null
+++ b/gen-offsets.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+name_ifndef=$1
+name_prefix_offset=$2
+name_blob=$3
+name_objname=$4
+name_bin=$5
+
+awk_cmd="{ print \"#define $name_prefix_offset\" \$3 \" 0x\" \$1; }"
+
+echo "/* Autogenerated file, don't edit */"
+echo "#ifndef $name_ifndef"
+echo "#define $name_ifndef"
+echo ""
+nm $name_objname | grep ' [Tt] ' | awk "$awk_cmd"
+echo ""
+echo "static char $name_blob[] = {"
+hexdump -v -e '"\t"' -e '8/1 "0x%02x, "' -e '"\n"' $name_bin
+echo "};"
+echo ""
+echo "#endif /* $name_ifndef */"
+
diff --git a/include/bitops.h b/include/bitops.h
new file mode 100644
index 000000000..c391bd9dd
--- /dev/null
+++ b/include/bitops.h
@@ -0,0 +1,54 @@
+#ifndef CR_BITOPS_H_
+#define CR_BITOPS_H_
+
+#ifdef CONFIG_X86_64
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, 8 * sizeof(long))
+
+#define DECLARE_BITMAP(name, bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
+/* Technically wrong, but this avoids compilation errors on some gcc
+ versions. */
+#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
+#else
+#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
+#endif
+
+#define ADDR BITOP_ADDR(addr)
+
+/*
+ * Set bit @nr in the bitmap at @addr (x86 "bts").
+ * Declared inline so that including this header without using the
+ * helper does not produce an unused-function warning.
+ */
+static inline void set_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
+}
+
+/*
+ * Toggle bit @nr in the bitmap at @addr (x86 "btc").
+ * Declared inline so that including this header without using the
+ * helper does not produce an unused-function warning.
+ */
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
+}
+
+/*
+ * Test bit @nr in the bitmap at @addr.
+ *
+ * "bt" copies the bit into the carry flag and "sbb %0,%0" turns the
+ * carry into the result, so the return value is 0 when the bit is
+ * clear and non-zero (-1) when it is set.
+ *
+ * Declared inline so that including this header without using the
+ * helper does not produce an unused-function warning.
+ */
+static inline int test_bit(int nr, volatile const unsigned long *addr)
+{
+	int oldbit;
+
+	asm volatile("bt %2,%1\n\t"
+		     "sbb %0,%0"
+		     : "=r" (oldbit)
+		     : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+	return oldbit;
+}
+
+/*
+ * Clear bit @nr in the bitmap at @addr (x86 "btr").
+ * Declared inline so that including this header without using the
+ * helper does not produce an unused-function warning.
+ */
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+	asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
+}
+
+
+#else /* CONFIG_X86_64 */
+# error x86-32 is not implemented yet
+#endif /* CONFIG_X86_64 */
+
+#endif /* CR_BITOPS_H_ */
diff --git a/include/compiler.h b/include/compiler.h
new file mode 100644
index 000000000..872428276
--- /dev/null
+++ b/include/compiler.h
@@ -0,0 +1,57 @@
+#ifndef CR_COMPILER_H_
+#define CR_COMPILER_H_
+
+/*
+ * Various definitions needed for a successful build,
+ * picked from various places, mostly from
+ * the linux kernel.
+ */
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+#define NORETURN __attribute__((__noreturn__))
+#define __packed __attribute__((__packed__))
+#define __used __attribute__((__used__))
+
+#define __section(S) __attribute__ ((__section__(#S)))
+
+#ifndef __always_inline
+# define __always_inline inline __attribute__((always_inline))
+#endif
+
+#ifndef always_inline
+# define always_inline __always_inline
+#endif
+
+#ifndef offsetof
+# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+/*
+ * Given @ptr, a pointer to the field @member inside an object of
+ * type @type, yield a pointer to the enclosing object.  The typed
+ * temporary __mptr makes the compiler check that @ptr matches the
+ * member's type.
+ */
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * round_up()/round_down() work with the bit mask (y - 1), so they
+ * only give a multiple of @y when @y is a power of two.
+ * DIV_ROUND_UP() is plain integer division rounded towards the next
+ * integer and works for any positive @d.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
+#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+
+/*
+ * Smaller of @x and @y.  Each argument is evaluated exactly once;
+ * the otherwise useless pointer comparison makes the compiler
+ * complain when @x and @y have different types.
+ */
+#define min(x, y) ({ \
+ typeof(x) _min1 = (x); \
+ typeof(y) _min2 = (y); \
+ (void) (&_min1 == &_min2); \
+ _min1 < _min2 ? _min1 : _min2; })
+
+/*
+ * Larger of @x and @y.  Each argument is evaluated exactly once;
+ * the otherwise useless pointer comparison makes the compiler
+ * complain when @x and @y have different types.
+ */
+#define max(x, y) ({ \
+ typeof(x) _max1 = (x); \
+ typeof(y) _max2 = (y); \
+ (void) (&_max1 == &_max2); \
+ _max1 > _max2 ? _max1 : _max2; })
+
+/*
+ * True when @v has at most one bit set.  Note that this includes
+ * v == 0, and that @v is evaluated twice.
+ */
+#define is_log2(v) (((v) & ((v) - 1)) == 0)
+
+#endif /* CR_COMPILER_H_ */
diff --git a/include/crtools.h b/include/crtools.h
new file mode 100644
index 000000000..f8af18695
--- /dev/null
+++ b/include/crtools.h
@@ -0,0 +1,105 @@
+#ifndef CRTOOLS_H_
+#define CRTOOLS_H_
+
+#include <sys/types.h>
+
+#include "types.h"
+#include "list.h"
+
+#include "image.h"
+
+extern struct page_entry zero_page_entry;
+
+int cr_dump_tasks(pid_t pid, bool leader_only, int leave_stopped);
+int cr_restore_tasks(pid_t pid, bool leader_only, int leave_stopped);
+int cr_show(unsigned long pid, bool leader_only);
+int convert_to_elf(char *elf_path, int fd_core);
+
+#define CR_FD_PERM 0600
+
+/*
+ * Image file types.  The values index cr_fdset.desc[] and are turned
+ * into use_mask bits with CR_FD_DESC_USE(); CR_FD_MAX is the number
+ * of types and must stay last.
+ */
+enum {
+ CR_FD_FDINFO,
+ CR_FD_PAGES,
+ CR_FD_PAGES_SHMEM,
+ CR_FD_CORE,
+ CR_FD_PIPES,
+ CR_FD_PSTREE,
+ CR_FD_SHMEM,
+
+ CR_FD_MAX
+};
+
+/* file descriptors template */
+struct cr_fd_desc_tmpl {
+ const char *fmt; /* format for the name */
+ u32 magic; /* magic in the header */
+};
+
+/* file descriptors */
+struct cr_fd_desc {
+ struct cr_fd_desc_tmpl *tmpl; /* template we refer to */
+ char name[64]; /* the name, based on pid */
+ int fd; /* descriptor for open/close */
+};
+
+struct cr_fdset {
+ struct cr_fd_desc desc[CR_FD_MAX];
+ u32 use_mask; /*
+ * if descriptor get used,set
+ * bit here
+ */
+};
+
+/*
+ * use_mask helpers: one bit per CR_FD_* type.
+ *  CR_FD_DESC_USE(type)  - the bit for a single type
+ *  CR_FD_DESC_ALL        - every type
+ *  CR_FD_DESC_NOPSTREE   - every type except CR_FD_PSTREE
+ *  CR_FD_DESC_NONE       - no type at all
+ */
+#define CR_FD_DESC_USE(type) ((1 << (type)))
+#define CR_FD_DESC_ALL ((1 << CR_FD_MAX) - 1)
+#define CR_FD_DESC_NOPSTREE (CR_FD_DESC_ALL & ~(CR_FD_DESC_USE(CR_FD_PSTREE)))
+#define CR_FD_DESC_NONE (0)
+
+
+struct cr_fdset *alloc_cr_fdset(pid_t pid);
+int prep_cr_fdset_for_dump(struct cr_fdset *cr_fdset,
+ unsigned long use_mask);
+int prep_cr_fdset_for_restore(struct cr_fdset *cr_fdset,
+ unsigned long use_mask);
+void close_cr_fdset(struct cr_fdset *cr_fdset);
+void free_cr_fdset(struct cr_fdset **cr_fdset);
+
+/*
+ * In-memory wrapper around an on-image vma_entry that can be linked
+ * into a list.
+ * NOTE(review): the exact meaning of shmid and vm_file_fd is not
+ * visible in this header -- confirm against the users.
+ */
+struct vma_area {
+ struct list_head list;
+ struct vma_entry vma;
+ unsigned long shmid;
+ int vm_file_fd;
+};
+
+/*
+ * vma_area_has()  - true if all status bits @s are set in the entry
+ *                   wrapped by @vma_area (a struct vma_area pointer)
+ * vma_entry_len() - length in bytes of the range described by @vma
+ *
+ * The argument is parenthesized so that any pointer expression can
+ * be passed, not just a plain identifier.
+ */
+#define vma_area_has(vma_area, s) vma_entry_has(&(vma_area)->vma, s)
+#define vma_entry_len(vma) ((vma)->end - (vma)->start)
+
+struct pstree_item {
+ struct list_head list;
+ pid_t pid; /* leader pid */
+ u32 nr_children; /* number of children */
+ u32 *children; /* array of children */
+};
+
+struct pstree_item_info {
+ struct list_head list;
+
+ pid_t pid; /* leader pid */
+ u32 nr_children; /* number of children */
+ u32 *children; /* array of children */
+
+ bool launched; /* set if launched */
+};
+
+/* Length in bytes of the address range covered by @vma */
+static inline unsigned long vma_area_size(struct vma_area *vma)
+{
+	const struct vma_entry *entry = &vma->vma;
+
+	return entry->end - entry->start;
+}
+
+/* Non-zero if @addr lies inside [start, end) of @vma, 0 otherwise */
+static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
+{
+	unsigned long first = (unsigned long)vma->vma.start;
+	unsigned long limit = (unsigned long)vma->vma.end;
+
+	if (addr < first)
+		return 0;
+
+	return addr < limit;
+}
+
+#endif /* CRTOOLS_H_ */
diff --git a/include/elf.h b/include/elf.h
new file mode 100644
index 000000000..96c992d63
--- /dev/null
+++ b/include/elf.h
@@ -0,0 +1,507 @@
+#ifndef CR_ELF_H
+#define CR_ELF_H
+
+#include "types.h"
+
+/* Segment types */
+#define PT_NULL 0
+#define PT_LOAD 1
+#define PT_DYNAMIC 2
+#define PT_INTERP 3
+#define PT_NOTE 4
+#define PT_SHLIB 5
+#define PT_PHDR 6
+#define PT_TLS 7
+#define PT_LOOS 0x60000000
+#define PT_HIOS 0x6fffffff
+#define PT_LOPROC 0x70000000
+#define PT_HIPROC 0x7fffffff
+#define PT_GNU_EH_FRAME 0x6474e550
+
+#define PT_CKPT_OFFSET 0x01010101
+
+#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1)
+#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2)
+#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3)
+
+/* ELF file types */
+#define ET_NONE 0
+#define ET_REL 1
+#define ET_EXEC 2
+#define ET_DYN 3
+#define ET_CORE 4
+#define ET_CKPT 5
+#define ET_LOPROC 0xff00
+#define ET_HIPROC 0xffff
+
+/* ELF machine types */
+#define EM_NONE 0
+#define EM_M32 1
+#define EM_SPARC 2
+#define EM_386 3
+#define EM_68K 4
+#define EM_88K 5
+#define EM_486 6 /* Not used in Linux at least */
+#define EM_860 7
+#define EM_MIPS 8 /* R3k, bigendian(?) */
+#define EM_MIPS_RS4_BE 10 /* R4k BE */
+#define EM_PARISC 15
+#define EM_SPARC32PLUS 18
+#define EM_PPC 20
+#define EM_PPC64 21
+#define EM_S390 22
+#define EM_SH 42
+#define EM_SPARCV9 43 /* v9 = SPARC64 */
+#define EM_H8_300H 47
+#define EM_H8S 48
+#define EM_IA_64 50
+#define EM_X86_64 62
+#define EM_CRIS 76
+#define EM_V850 87
+#define EM_ALPHA 0x9026 /* Interim Alpha that stuck around */
+#define EM_CYGNUS_V850 0x9080 /* Old v850 ID used by Cygnus */
+#define EM_S390_OLD 0xA390 /* Obsolete interim value for S/390 */
+
+/* Dynamic type values */
+#define DT_NULL 0
+#define DT_NEEDED 1
+#define DT_PLTRELSZ 2
+#define DT_PLTGOT 3
+#define DT_HASH 4
+#define DT_STRTAB 5
+#define DT_SYMTAB 6
+#define DT_RELA 7
+#define DT_RELASZ 8
+#define DT_RELAENT 9
+#define DT_STRSZ 10
+#define DT_SYMENT 11
+#define DT_INIT 12
+#define DT_FINI 13
+#define DT_SONAME 14
+#define DT_RPATH 15
+#define DT_SYMBOLIC 16
+#define DT_REL 17
+#define DT_RELSZ 18
+#define DT_RELENT 19
+#define DT_PLTREL 20
+#define DT_DEBUG 21
+#define DT_TEXTREL 22
+#define DT_JMPREL 23
+#define DT_LOPROC 0x70000000
+#define DT_HIPROC 0x7fffffff
+
+/* Auxiliary table entries */
+#define AT_NULL 0 /* end of vector */
+#define AT_IGNORE 1 /* entry should be ignored */
+#define AT_EXECFD 2 /* file descriptor of program */
+#define AT_PHDR 3 /* program headers for program */
+#define AT_PHENT 4 /* size of program header entry */
+#define AT_PHNUM 5 /* number of program headers */
+#define AT_PAGESZ 6 /* system page size */
+#define AT_BASE 7 /* base address of interpreter */
+#define AT_FLAGS 8 /* flags */
+#define AT_ENTRY 9 /* entry point of program */
+#define AT_NOTELF 10 /* program is not ELF */
+#define AT_UID 11 /* real uid */
+#define AT_EUID 12 /* effective uid */
+#define AT_GID 13 /* real gid */
+#define AT_EGID 14 /* effective gid */
+#define AT_PLATFORM 15 /* string identifying CPU for optimizations */
+#define AT_HWCAP 16 /* arch dependent hints at CPU capabilities */
+#define AT_CLKTCK 17 /* frequency at which times() increments */
+/* 18..22 = ? */
+#define AT_SECURE 23 /* secure mode boolean */
+
+/* Program header permission flags */
+#define PF_X 0x1
+#define PF_W 0x2
+#define PF_R 0x4
+
+/* Section header types */
+#define SHT_NULL 0
+#define SHT_PROGBITS 1
+#define SHT_SYMTAB 2
+#define SHT_STRTAB 3
+#define SHT_RELA 4
+#define SHT_HASH 5
+#define SHT_DYNAMIC 6
+#define SHT_NOTE 7
+#define SHT_NOBITS 8
+#define SHT_REL 9
+#define SHT_SHLIB 10
+#define SHT_DYNSYM 11
+#define SHT_NUM 12
+#define SHT_LOPROC 0x70000000
+#define SHT_HIPROC 0x7fffffff
+#define SHT_LOUSER 0x80000000
+#define SHT_HIUSER 0xffffffff
+
+/* Section header flags */
+#define SHF_WRITE (1 << 0) /* Writable */
+#define SHF_ALLOC (1 << 1) /* Occupies memory during execution */
+#define SHF_EXECINSTR (1 << 2) /* Executable */
+#define SHF_MERGE (1 << 4) /* Might be merged */
+#define SHF_STRINGS (1 << 5) /* Contains nul-terminated strings */
+#define SHF_INFO_LINK (1 << 6) /* `sh_info' contains SHT index */
+#define SHF_LINK_ORDER (1 << 7) /* Preserve order after combining */
+#define SHF_OS_NONCONFORMING (1 << 8) /* Non-standard OS specific handling required */
+#define SHF_GROUP (1 << 9) /* Section is member of a group. */
+#define SHF_TLS (1 << 10) /* Section hold thread-local data. */
+
+/* Special section numbers */
+#define SHN_UNDEF 0
+#define SHN_LORESERVE 0xff00
+#define SHN_LOPROC 0xff00
+#define SHN_HIPROC 0xff1f
+#define SHN_ABS 0xfff1
+#define SHN_COMMON 0xfff2
+#define SHN_HIRESERVE 0xffff
+
+/* Section align flag */
+#define SHA_ANY 1 /* No alignment constraint */
+
+/* Length of magic at the start of a file */
+#define EI_NIDENT 16
+
+/* Magic number constants... */
+#define EI_MAG0 0 /* e_ident[] indexes */
+#define EI_MAG1 1
+#define EI_MAG2 2
+#define EI_MAG3 3
+#define EI_CLASS 4
+#define EI_DATA 5
+#define EI_VERSION 6
+#define EI_OSABI 7
+#define EI_PAD 8
+
+#define ELFMAG0 0x7f /* EI_MAG */
+#define ELFMAG1 'E'
+#define ELFMAG2 'L'
+#define ELFMAG3 'F'
+#define ELFMAG "\177ELF"
+#define SELFMAG 4
+
+#define ELFCLASSNONE 0 /* EI_CLASS */
+#define ELFCLASS32 1
+#define ELFCLASS64 2
+#define ELFCLASSNUM 3
+
+#define ELFDATANONE 0 /* e_ident[EI_DATA] */
+#define ELFDATA2LSB 1
+#define ELFDATA2MSB 2
+
+#define EV_NONE 0 /* e_version, EI_VERSION */
+#define EV_CURRENT 1
+#define EV_NUM 2
+
+#define ELFOSABI_NONE 0
+#define ELFOSABI_LINUX 3
+
+/* Legal values for ST_BIND subfield of st_info (symbol binding). */
+#define STB_LOCAL 0 /* Local symbol */
+#define STB_GLOBAL 1 /* Global symbol */
+#define STB_WEAK 2 /* Weak symbol */
+#define STB_NUM 3 /* Number of defined types. */
+#define STB_LOOS 10 /* Start of OS-specific */
+#define STB_HIOS 12 /* End of OS-specific */
+#define STB_LOPROC 13 /* Start of processor-specific */
+#define STB_HIPROC 15 /* End of processor-specific */
+
+/* Symbol types */
+#define STT_NOTYPE 0 /* Symbol type is unspecified */
+#define STT_OBJECT 1 /* Symbol is a data object */
+#define STT_FUNC 2 /* Symbol is a code object */
+#define STT_SECTION 3 /* Symbol associated with a section */
+#define STT_FILE 4 /* Symbol's name is file name */
+#define STT_COMMON 5 /* Symbol is a common data object */
+#define STT_TLS 6 /* Symbol is thread-local data object */
+#define STT_NUM 7 /* Number of defined types. */
+
+/* Symbol visibilities */
+#define STV_DEFAULT 0 /* Default symbol visibility rules */
+#define STV_INTERNAL 1 /* Processor specific hidden class */
+#define STV_HIDDEN 2 /* Sym unavailable in other modules */
+#define STV_PROTECTED 3 /* Not preemptible, not exported */
+
+/* Both Elf32_Sym and Elf64_Sym use the same one-byte st_info field */
+#define ELF32_ST_BIND(i) ((i) >> 4)
+#define ELF32_ST_MKBIND(i) ((i) << 4) /* just a helper */
+#define ELF32_ST_TYPE(i) ((i) & 0xf)
+/*
+ * Pack binding @b (high nibble) and type @i (low nibble) into an
+ * st_info value.  Built from the ELF32_ helpers above; this header
+ * defines no unprefixed ELF_ST_* macros.
+ */
+#define ELF32_ST_INFO(b, i) (ELF32_ST_MKBIND(b) + ELF32_ST_TYPE(i))
+
+#define ELF64_ST_BIND(i) ELF32_ST_BIND(i)
+#define ELF64_ST_MKBIND(i) ELF32_ST_MKBIND(i)
+#define ELF64_ST_TYPE(i) ELF32_ST_TYPE(i)
+#define ELF64_ST_INFO(b, i) ELF32_ST_INFO(b, i)
+
+/*
+ * ELF standard typedefs (yet more proof that <stdint.h> was way overdue)
+ */
+
+typedef u16 Elf32_Half;
+typedef s16 Elf32_SHalf;
+typedef u32 Elf32_Word;
+typedef s32 Elf32_Sword;
+typedef u64 Elf32_Xword;
+typedef s64 Elf32_Sxword;
+
+typedef u32 Elf32_Off;
+typedef u32 Elf32_Addr;
+typedef u16 Elf32_Section;
+
+typedef u16 Elf64_Half;
+typedef s16 Elf64_SHalf;
+typedef u32 Elf64_Word;
+typedef s32 Elf64_Sword;
+typedef u64 Elf64_Xword;
+typedef s64 Elf64_Sxword;
+
+typedef u64 Elf64_Off;
+typedef u64 Elf64_Addr;
+typedef u16 Elf64_Section;
+
+/*
+ * Dynamic header
+ */
+
+typedef struct elf32_dyn {
+ Elf32_Sword d_tag;
+ union {
+ Elf32_Sword d_val;
+ Elf32_Addr d_ptr;
+ } d_un;
+} Elf32_Dyn;
+
+typedef struct elf64_dyn {
+ Elf64_Sxword d_tag;
+ union {
+ Elf64_Xword d_val;
+ Elf64_Addr d_ptr;
+ } d_un;
+} Elf64_Dyn;
+
+/*
+ * Relocations
+ */
+
+#define ELF32_R_SYM(x) ((x) >> 8)
+#define ELF32_R_TYPE(x) ((x) & 0xff)
+
+typedef struct elf32_rel {
+ Elf32_Addr r_offset;
+ Elf32_Word r_info;
+} Elf32_Rel;
+
+typedef struct elf32_rela {
+ Elf32_Addr r_offset;
+ Elf32_Word r_info;
+ Elf32_Sword r_addend;
+} Elf32_Rela;
+
+enum reloc32_type {
+ R_386_32 = 1, /* ordinary absolute relocation */
+ R_386_PC32 = 2, /* PC-relative relocation */
+ R_386_GOT32 = 3, /* an offset into GOT */
+ R_386_PLT32 = 4, /* a PC-relative offset into PLT */
+ R_386_COPY = 5, /* ??? */
+ R_386_GLOB_DAT = 6, /* ??? */
+ R_386_JUMP_SLOT = 7, /* ??? */
+ R_386_RELATIVE = 8, /* ??? */
+ R_386_GOTOFF = 9, /* an offset from GOT base */
+ R_386_GOTPC = 10, /* a PC-relative offset _to_ GOT */
+ R_386_TLS_TPOFF = 14, /* Offset in static TLS block */
+ R_386_TLS_IE = 15, /* Address of GOT entry for static TLS block offset */
+
+ /* These are GNU extensions, but useful */
+ R_386_16 = 20, /* A 16-bit absolute relocation */
+ R_386_PC16 = 21, /* A 16-bit PC-relative relocation */
+ R_386_8 = 22, /* An 8-bit absolute relocation */
+ R_386_PC8 = 23 /* An 8-bit PC-relative relocation */
+};
+
+#define ELF64_R_SYM(x) ((x) >> 32)
+#define ELF64_R_TYPE(x) ((x) & 0xffffffff)
+
+typedef struct elf64_rel {
+ Elf64_Addr r_offset;
+ Elf64_Xword r_info;
+} Elf64_Rel;
+
+typedef struct elf64_rela {
+ Elf64_Addr r_offset;
+ Elf64_Xword r_info;
+ Elf64_Sxword r_addend;
+} Elf64_Rela;
+
+enum reloc64_type {
+ R_X86_64_NONE = 0, /* No reloc */
+ R_X86_64_64 = 1, /* Direct 64 bit */
+ R_X86_64_PC32 = 2, /* PC relative 32 bit signed */
+ R_X86_64_GOT32 = 3, /* 32 bit GOT entry */
+ R_X86_64_PLT32 = 4, /* 32 bit PLT address */
+ R_X86_64_COPY = 5, /* Copy symbol at runtime */
+ R_X86_64_GLOB_DAT = 6, /* Create GOT entry */
+ R_X86_64_JUMP_SLOT = 7, /* Create PLT entry */
+ R_X86_64_RELATIVE = 8, /* Adjust by program base */
+ R_X86_64_GOTPCREL = 9, /* 32 bit signed PC relative offset to GOT */
+ R_X86_64_32 = 10, /* Direct 32 bit zero extended */
+ R_X86_64_32S = 11, /* Direct 32 bit sign extended */
+ R_X86_64_16 = 12, /* Direct 16 bit zero extended */
+ R_X86_64_PC16 = 13, /* 16 bit sign extended pc relative */
+ R_X86_64_8 = 14, /* Direct 8 bit sign extended */
+ R_X86_64_PC8 = 15, /* 8 bit sign extended pc relative */
+ R_X86_64_DTPMOD64 = 16, /* ID of module containing symbol */
+ R_X86_64_DTPOFF64 = 17, /* Offset in module's TLS block */
+ R_X86_64_TPOFF64 = 18, /* Offset in initial TLS block */
+ R_X86_64_TLSGD = 19, /* 32 bit signed PC relative offset to two GOT entries for GD symbol */
+ R_X86_64_TLSLD = 20, /* 32 bit signed PC relative offset to two GOT entries for LD symbol */
+ R_X86_64_DTPOFF32 = 21, /* Offset in TLS block */
+ R_X86_64_GOTTPOFF = 22, /* 32 bit signed PC relative offset to GOT entry for IE symbol */
+ R_X86_64_TPOFF32 = 23, /* Offset in initial TLS block */
+ R_X86_64_PC64 = 24, /* word64 S + A - P */
+ R_X86_64_GOTOFF64 = 25, /* word64 S + A - GOT */
+ R_X86_64_GOTPC32 = 26, /* word32 GOT + A - P */
+ R_X86_64_GOT64 = 27, /* word64 G + A */
+ R_X86_64_GOTPCREL64 = 28,/* word64 G + GOT - P + A */
+ R_X86_64_GOTPC64 = 29, /* word64 GOT - P + A */
+ R_X86_64_GOTPLT64 = 30, /* word64 G + A */
+ R_X86_64_PLTOFF64 = 31, /* word64 L - GOT + A */
+ R_X86_64_SIZE32 = 32, /* word32 Z + A */
+ R_X86_64_SIZE64 = 33, /* word64 Z + A */
+ R_X86_64_GOTPC32_TLSDESC = 34, /* word32 */
+ R_X86_64_TLSDESC_CALL = 35, /* none */
+ R_X86_64_TLSDESC = 36 /* word64?2 */
+};
+
+/*
+ * Symbol
+ */
+
+typedef struct elf32_sym {
+ Elf32_Word st_name;
+ Elf32_Addr st_value;
+ Elf32_Word st_size;
+ unsigned char st_info;
+ unsigned char st_other;
+ Elf32_Half st_shndx;
+} Elf32_Sym;
+
+typedef struct elf64_sym {
+ Elf64_Word st_name;
+ unsigned char st_info;
+ unsigned char st_other;
+ Elf64_Half st_shndx;
+ Elf64_Addr st_value;
+ Elf64_Xword st_size;
+} Elf64_Sym;
+
+/*
+ * Main file header
+ */
+
+typedef struct elf32_hdr {
+ unsigned char e_ident[EI_NIDENT];
+ Elf32_Half e_type;
+ Elf32_Half e_machine;
+ Elf32_Word e_version;
+ Elf32_Addr e_entry;
+ Elf32_Off e_phoff;
+ Elf32_Off e_shoff;
+ Elf32_Word e_flags;
+ Elf32_Half e_ehsize;
+ Elf32_Half e_phentsize;
+ Elf32_Half e_phnum;
+ Elf32_Half e_shentsize;
+ Elf32_Half e_shnum;
+ Elf32_Half e_shstrndx;
+} Elf32_Ehdr;
+
+typedef struct elf64_hdr {
+ unsigned char e_ident[EI_NIDENT];
+ Elf64_Half e_type;
+ Elf64_Half e_machine;
+ Elf64_Word e_version;
+ Elf64_Addr e_entry;
+ Elf64_Off e_phoff;
+ Elf64_Off e_shoff;
+ Elf64_Word e_flags;
+ Elf64_Half e_ehsize;
+ Elf64_Half e_phentsize;
+ Elf64_Half e_phnum;
+ Elf64_Half e_shentsize;
+ Elf64_Half e_shnum;
+ Elf64_Half e_shstrndx;
+} Elf64_Ehdr;
+
+/*
+ * Program header
+ */
+
+typedef struct elf32_phdr {
+ Elf32_Word p_type;
+ Elf32_Off p_offset;
+ Elf32_Addr p_vaddr;
+ Elf32_Addr p_paddr;
+ Elf32_Word p_filesz;
+ Elf32_Word p_memsz;
+ Elf32_Word p_flags;
+ Elf32_Word p_align;
+} Elf32_Phdr;
+
+typedef struct elf64_phdr {
+ Elf64_Word p_type;
+ Elf64_Word p_flags;
+ Elf64_Off p_offset;
+ Elf64_Addr p_vaddr;
+ Elf64_Addr p_paddr;
+ Elf64_Xword p_filesz;
+ Elf64_Xword p_memsz;
+ Elf64_Xword p_align;
+} Elf64_Phdr;
+
+/*
+ * Section headers.
+ */
+
+typedef struct elf32_shdr {
+ Elf32_Word sh_name;
+ Elf32_Word sh_type;
+ Elf32_Word sh_flags;
+ Elf32_Addr sh_addr;
+ Elf32_Off sh_offset;
+ Elf32_Word sh_size;
+ Elf32_Word sh_link;
+ Elf32_Word sh_info;
+ Elf32_Word sh_addralign;
+ Elf32_Word sh_entsize;
+} Elf32_Shdr;
+
+typedef struct elf64_shdr {
+ Elf64_Word sh_name;
+ Elf64_Word sh_type;
+ Elf64_Xword sh_flags;
+ Elf64_Addr sh_addr;
+ Elf64_Off sh_offset;
+ Elf64_Xword sh_size;
+ Elf64_Word sh_link;
+ Elf64_Word sh_info;
+ Elf64_Xword sh_addralign;
+ Elf64_Xword sh_entsize;
+} Elf64_Shdr;
+
+/*
+ * Note header
+ */
+typedef struct elf32_note {
+ Elf32_Word n_namesz; /* Name size */
+ Elf32_Word n_descsz; /* Content size */
+ Elf32_Word n_type; /* Content type */
+} Elf32_Nhdr;
+
+typedef struct elf64_note {
+ Elf64_Word n_namesz; /* Name size */
+ Elf64_Word n_descsz; /* Content size */
+ Elf64_Word n_type; /* Content type */
+} Elf64_Nhdr;
+
+#endif /* CR_ELF_H */
diff --git a/include/image.h b/include/image.h
new file mode 100644
index 000000000..956a1f55d
--- /dev/null
+++ b/include/image.h
@@ -0,0 +1,191 @@
+#ifndef CR_IMAGE_H
+#define CR_IMAGE_H
+
+#include "types.h"
+#include "compiler.h"
+
+/*
+ * Magic numbers, one per image kind.  Each is a 32-bit value; see
+ * MAGIC_OFFSET at the end of this header for how it is accounted for
+ * in file offsets.
+ */
+#define FDINFO_MAGIC 0x01010101
+#define PAGES_MAGIC 0x20202020
+#define CORE_MAGIC 0xa75b8d43
+#define SHMEM_MAGIC 0x03300330
+#define PIPEFS_MAGIC 0x50495045 /* the bytes 'P' 'I' 'P' 'E' in ASCII */
+#define PSTREE_MAGIC 0x40044004
+#define PIPES_MAGIC 0x05055050
+
+/* presumably the values stored in fdinfo_entry.type -- TODO confirm */
+#define FDINFO_FD 1
+#define FDINFO_MAP 2
+
+/* PAGE_IMAGE_SIZE is the size of the data[] payload in struct page_entry */
+#define PAGE_IMAGE_SIZE 4096
+#define PAGE_RSS 1
+
+/*
+ * Packed fixed-size header followed by a variable-length name[]
+ * (zero-length trailing array).
+ * NOTE(review): len presumably gives the size of name[] -- confirm
+ * against the code that writes this record.
+ */
+struct fdinfo_entry {
+ u8 type;
+ u8 len;
+ u16 flags;
+ u32 pos;
+ u64 addr;
+ u8 name[0];
+} __packed;
+
+/* Packed record of three 64-bit values: an address range and a shmid. */
+struct shmem_entry {
+ u64 start;
+ u64 end;
+ u64 shmid;
+} __packed;
+
+/*
+ * Packed process-tree record: fixed header followed by a variable-length
+ * children[] array (zero-length trailing array).
+ * NOTE(review): nr_children presumably counts the entries in children[]
+ * -- confirm against the writer.
+ */
+struct pstree_entry {
+ u32 pid;
+ u32 nr_children;
+ u32 children[0];
+} __packed;
+
+/*
+ * Packed pipe record: fixed header followed by a variable-length data[]
+ * payload (zero-length trailing array).
+ * NOTE(review): bytes presumably gives the payload size -- confirm.
+ */
+struct pipe_entry {
+ u32 fd;
+ u32 pipeid;
+ u32 flags;
+ u32 bytes;
+ u8 data[0];
+} __packed;
+
+/*
+ * Single-bit flags; vma_entry_has() below tests them against the
+ * "status" member of the object it is given.
+ */
+#define VMA_AREA_REGULAR (1 << 0)
+#define VMA_AREA_STACK (1 << 1)
+#define VMA_AREA_VSYSCALL (1 << 2)
+#define VMA_AREA_VDSO (1 << 3)
+#define VMA_FORCE_READ (1 << 4)
+#define VMA_AREA_HEAP (1 << 5)
+#define VMA_FILE_PRIVATE (1 << 6)
+#define VMA_FILE_SHARED (1 << 7)
+#define VMA_ANON_SHARED (1 << 8)
+#define VMA_ANON_PRIVATE (1 << 9)
+#define VMA_FORCE_WRITE (1 << 10)
+#define VMA_DUMP_ALL (1 << 11)
+
+/* True only when every bit of @s is set in (vma)->status. */
+#define vma_entry_has(vma, s) (((vma)->status & (s)) == (s))
+
+/*
+ * Packed description of one memory area.  "status" is the member that
+ * vma_entry_has() tests the VMA_* flags against.
+ */
+struct vma_entry {
+ u64 start;
+ u64 end;
+ u64 pgoff;
+ u32 prot;
+ u32 flags;
+ u32 status;
+ u32 pid;
+ s64 fd;
+ u64 ino;
+ u32 dev_maj;
+ u32 dev_min;
+} __packed;
+
+/* Packed record: a 64-bit "va" followed by PAGE_IMAGE_SIZE bytes of data. */
+struct page_entry {
+ u64 va;
+ u8 data[PAGE_IMAGE_SIZE];
+} __packed;
+
+/* presumably the values for image_header.version and .arch -- TODO confirm */
+#define HEADER_VERSION 1
+#define HEADER_ARCH_X86_64 1
+
+/* Packed 8-byte header; embedded as the first member of struct core_entry. */
+struct image_header {
+ u16 version;
+ u16 arch;
+ u32 flags;
+} __packed;
+
+/*
+ * PTRACE_GETREGS
+ * PTRACE_GETFPREGS
+ * PTRACE_GETFPXREGS dep CONFIG_X86_32
+ * PTRACE_GET_THREAD_AREA dep CONFIG_X86_32 || CONFIG_IA32_EMULATION
+ * PTRACE_GETFDPIC dep CONFIG_BINFMT_ELF_FDPIC
+ *
+ * PTRACE_ARCH_PRCTL dep CONFIG_X86_64
+ * ARCH_SET_GS/ARCH_GET_FS
+ * ARCH_SET_FS/ARCH_GET_GS
+ */
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Packed image form of the general purpose registers: 27 u64 members in
+ * the same order as user_regs_struct_t for CONFIG_X86_64 in types.h.
+ */
+struct user_regs_entry {
+ u64 r15;
+ u64 r14;
+ u64 r13;
+ u64 r12;
+ u64 bp;
+ u64 bx;
+ u64 r11;
+ u64 r10;
+ u64 r9;
+ u64 r8;
+ u64 ax;
+ u64 cx;
+ u64 dx;
+ u64 si;
+ u64 di;
+ u64 orig_ax;
+ u64 ip;
+ u64 cs;
+ u64 flags;
+ u64 sp;
+ u64 ss;
+ u64 fs_base;
+ u64 gs_base;
+ u64 ds;
+ u64 es;
+ u64 fs;
+ u64 gs;
+} __packed;
+
+/*
+ * One descriptor, viewable two ways through the anonymous union:
+ * as two raw u32 words (a, b) or as individual bit-fields.
+ */
+struct desc_struct {
+ union {
+ struct {
+ u32 a;
+ u32 b;
+ };
+ struct {
+ u16 limit0;
+ u16 base0;
+ unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+ unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
+ };
+ };
+} __packed;
+
+/*
+ * Packed image form of the FPU state; same members, in the same order,
+ * as user_fpregs_struct_t in types.h.
+ */
+struct user_fpregs_entry {
+ u16 cwd;
+ u16 swd;
+ u16 twd; /* Note this is not the same as
+ the 32bit/x87/FSAVE twd */
+ u16 fop;
+ u64 rip;
+ u64 rdp;
+ u32 mxcsr;
+ u32 mxcsr_mask;
+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+ u32 padding[24];
+} __packed;
+
+/* Number of descriptors kept in core_entry.tls_array. */
+#define GDT_ENTRY_TLS_ENTRIES 3
+
+/*
+ * Packed core image record: image header, general purpose registers,
+ * FPU registers, TLS descriptors and the personality value, in that order.
+ */
+struct core_entry {
+ struct image_header hdr;
+ struct user_regs_entry gpregs;
+ struct user_fpregs_entry fpregs;
+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+ u32 personality;
+} __packed;
+
+#endif /* CONFIG_X86_64 */
+
+#ifndef offsetof
+/*
+ * Fallback for translation units that did not pull in <stddef.h>.
+ * Uses the compiler builtin rather than dereferencing a null pointer
+ * (formally undefined behaviour); the result is still cast to long so
+ * existing users see the same type as before.
+ */
+# define offsetof(TYPE, MEMBER)	((long) __builtin_offsetof(TYPE, MEMBER))
+#endif
+
+/*
+ * There are always 4 magic bytes at the
+ * beginning of every file.
+ */
+/* Size of the leading magic word. */
+#define MAGIC_OFFSET (sizeof(u32))
+/* File offset of member @m of record type @s, skipping the magic. */
+#define GET_FILE_OFF(s, m) (offsetof(s,m) + MAGIC_OFFSET)
+/* File offset just past one record of type @s, skipping the magic. */
+#define GET_FILE_OFF_AFTER(s) (sizeof(s) + MAGIC_OFFSET)
+
+#endif /* CR_IMAGE_H */
diff --git a/include/list.h b/include/list.h
new file mode 100644
index 000000000..8a6931643
--- /dev/null
+++ b/include/list.h
@@ -0,0 +1,286 @@
+#ifndef CR_LIST_H_
+#define CR_LIST_H_
+
+/*
+ * Doubly linked lists.
+ */
+
+#include "compiler.h"
+
+/*
+ * Recognisable non-NULL pointer values; list_del() stores them into the
+ * links of a removed entry.  The void-pointer arithmetic relies on a
+ * GNU C extension.
+ */
+#define POISON_POINTER_DELTA 0
+#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA)
+#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA)
+
+/*
+ * List node/head.  Note the member order is prev, next; LIST_HEAD_INIT
+ * initialises both members to the address of the head itself, so it is
+ * independent of that order.
+ */
+struct list_head {
+ struct list_head *prev, *next;
+};
+
+/* Static initialiser for an empty list, and a definition using it. */
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name)
+
+/* Make @list an empty list: both of its links refer back to @list. */
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->prev = list->next = list;
+}
+
+/*
+ * Link @new between two entries that are currently adjacent
+ * (@prev immediately before @next).  All four links are rewritten.
+ */
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+/* Insert @new right after @head, i.e. at the front of the list. */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	struct list_head *first = head->next;
+
+	__list_add(new, head, first);
+}
+
+/* Insert @new right before @head, i.e. at the back of the list. */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	struct list_head *last = head->prev;
+
+	__list_add(new, last, head);
+}
+
+/*
+ * Join @prev and @next directly, dropping whatever used to sit between
+ * them from the chain.
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	prev->next = next;
+	next->prev = prev;
+}
+
+/* Unlink @entry from its neighbours; @entry's own links are left as is. */
+static inline void __list_del_entry(struct list_head *entry)
+{
+	struct list_head *before = entry->prev;
+	struct list_head *after = entry->next;
+
+	__list_del(before, after);
+}
+
+/*
+ * Unlink @entry and overwrite its links with the poison values, so a
+ * later use of the stale entry is easy to recognise.
+ */
+static inline void list_del(struct list_head *entry)
+{
+	__list_del_entry(entry);
+
+	entry->prev = LIST_POISON2;
+	entry->next = LIST_POISON1;
+}
+
+/*
+ * Put @new in the place @old occupies.  @old's own links are not
+ * touched here.
+ *
+ * The statement order matters: old->prev is read only after
+ * old->next->prev has been written, which changes the outcome when
+ * @old is the sole element (old->next == old).
+ */
+static inline void list_replace(struct list_head *old,
+ struct list_head *new)
+{
+ new->next = old->next;
+ new->next->prev = new;
+ new->prev = old->prev;
+ new->prev->next = new;
+}
+
+/* Replace @old with @new, then reset @old to an empty list. */
+static inline void list_replace_init(struct list_head *old,
+ struct list_head *new)
+{
+ list_replace(old, new);
+ INIT_LIST_HEAD(old);
+}
+
+/*
+ * Unlink @entry and re-initialise it as an empty list (no poisoning,
+ * unlike list_del()).
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+ __list_del_entry(entry);
+ INIT_LIST_HEAD(entry);
+}
+
+/* Unlink @list from wherever it is and re-insert it right after @head. */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+ __list_del_entry(list);
+ list_add(list, head);
+}
+
+/* Unlink @list from wherever it is and re-insert it right before @head. */
+static inline void list_move_tail(struct list_head *list,
+ struct list_head *head)
+{
+ __list_del_entry(list);
+ list_add_tail(list, head);
+}
+
+/* Non-zero when the entry following @list is @head. */
+static inline int list_is_last(const struct list_head *list,
+				const struct list_head *head)
+{
+	const struct list_head *succ = list->next;
+
+	return succ == head;
+}
+
+/* Non-zero when the entry preceding @list is @head. */
+static inline int list_is_first(const struct list_head *list,
+				 const struct list_head *head)
+{
+	const struct list_head *pred = list->prev;
+
+	return pred == head;
+}
+
+/* Non-zero when @head's next link points back at @head itself. */
+static inline int list_empty(const struct list_head *head)
+{
+	const struct list_head *first = head->next;
+
+	return first == head;
+}
+
+/*
+ * Stricter emptiness test: both the next and the prev link of @head
+ * must refer to @head.  The next link is sampled once.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+	struct list_head *succ = head->next;
+
+	if (succ != head)
+		return 0;
+	return succ == head->prev;
+}
+/* Move the first entry of the list to its tail; no-op on an empty list. */
+static inline void list_rotate_left(struct list_head *head)
+{
+	if (list_empty(head))
+		return;
+
+	list_move_tail(head->next, head);
+}
+
+/*
+ * Non-zero when the list is not empty and @head's two links refer to
+ * the same entry.
+ */
+static inline int list_is_singular(const struct list_head *head)
+{
+	if (list_empty(head))
+		return 0;
+
+	return head->next == head->prev;
+}
+
+/*
+ * Move the run of entries from head->next up to and including @entry
+ * onto @list; whatever followed @entry becomes the new first entry of
+ * @head.  @list's previous links are overwritten.
+ */
+static inline void __list_cut_position(struct list_head *list,
+ struct list_head *head, struct list_head *entry)
+{
+ struct list_head *new_first = entry->next;
+ list->next = head->next;
+ list->next->prev = list;
+ list->prev = entry;
+ entry->next = list;
+ head->next = new_first;
+ new_first->prev = head;
+}
+
+/*
+ * Cut @head in two at @entry: everything up to and including @entry
+ * ends up on @list.  Does nothing when @head is empty, or when @head
+ * holds a single element and @entry is neither that element nor @head.
+ * Passing @head itself as @entry just empties @list.
+ */
+static inline void list_cut_position(struct list_head *list,
+		struct list_head *head, struct list_head *entry)
+{
+	if (list_empty(head))
+		return;
+
+	if (list_is_singular(head) &&
+	    !(head->next == entry || head == entry))
+		return;
+
+	if (entry != head) {
+		__list_cut_position(list, head, entry);
+		return;
+	}
+
+	INIT_LIST_HEAD(list);
+}
+
+/*
+ * Link the entries of @list (first..last) between @prev and @next.
+ * @list itself is not modified and is not checked for emptiness here.
+ */
+static inline void __list_splice(const struct list_head *list,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+
+ first->prev = prev;
+ prev->next = first;
+
+ last->next = next;
+ next->prev = last;
+}
+
+/*
+ * Insert the entries of @list at the front of @head.  @list itself is
+ * left untouched; nothing happens when it is empty.
+ */
+static inline void list_splice(const struct list_head *list,
+				struct list_head *head)
+{
+	if (list_empty(list))
+		return;
+
+	__list_splice(list, head, head->next);
+}
+
+/*
+ * Insert the entries of @list at the back of @head.  @list itself is
+ * left untouched; nothing happens when it is empty.
+ */
+static inline void list_splice_tail(struct list_head *list,
+				struct list_head *head)
+{
+	if (list_empty(list))
+		return;
+
+	__list_splice(list, head->prev, head);
+}
+
+/*
+ * Insert the entries of @list at the front of @head and then reset
+ * @list to an empty list.  Nothing happens when @list is empty.
+ */
+static inline void list_splice_init(struct list_head *list,
+				    struct list_head *head)
+{
+	if (list_empty(list))
+		return;
+
+	__list_splice(list, head, head->next);
+	INIT_LIST_HEAD(list);
+}
+
+/*
+ * Insert the entries of @list at the back of @head and then reset
+ * @list to an empty list.  Nothing happens when @list is empty.
+ */
+static inline void list_splice_tail_init(struct list_head *list,
+					 struct list_head *head)
+{
+	if (list_empty(list))
+		return;
+
+	__list_splice(list, head->prev, head);
+	INIT_LIST_HEAD(list);
+}
+
+/*
+ * list_entry: map a list_head pointer to its containing @type object
+ * via container_of() (not defined in this header; presumably provided
+ * by compiler.h -- confirm).
+ */
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+/* Container of the entry that follows the head @ptr. */
+#define list_first_entry(ptr, type, member) \
+ list_entry((ptr)->next, type, member)
+
+/*
+ * Iterators over raw list_head pointers.  @pos is the cursor; the
+ * *_safe forms keep the following (or preceding) entry in @n before the
+ * loop body runs, so the body may unlink @pos.
+ */
+#define list_for_each(pos, head) \
+ for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/* Same expansion as list_for_each(). */
+#define __list_for_each(pos, head) \
+ for (pos = (head)->next; pos != (head); pos = pos->next)
+
+#define list_for_each_prev(pos, head) \
+ for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+#define list_for_each_safe(pos, n, head) \
+ for (pos = (head)->next, n = pos->next; pos != (head); \
+ pos = n, n = pos->next)
+
+#define list_for_each_prev_safe(pos, n, head) \
+ for (pos = (head)->prev, n = pos->prev; \
+ pos != (head); \
+ pos = n, n = pos->prev)
+
+/*
+ * Iterators over the containing objects.  @pos is a pointer to the
+ * container type and @member names the list_head inside it; the loops
+ * stop when &pos->member reaches @head.  The element type is taken
+ * from typeof(*pos) (GNU C).
+ *
+ *  - *_reverse walk via the prev links;
+ *  - *_continue start at the entry after (or before) the current @pos;
+ *  - *_from start at the current @pos itself;
+ *  - *_safe* keep the next candidate in @n so the body may unlink @pos.
+ */
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#define list_for_each_entry_reverse(pos, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/* Yields @pos when it is non-NULL, otherwise the container of @head. */
+#define list_prepare_entry(pos, head, member) \
+ ((pos) ? : list_entry(head, typeof(*pos), member))
+
+#define list_for_each_entry_continue(pos, head, member) \
+ for (pos = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#define list_for_each_entry_continue_reverse(pos, head, member) \
+ for (pos = list_entry(pos->member.prev, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+#define list_for_each_entry_from(pos, head, member) \
+ for (; &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#define list_for_each_entry_safe(pos, n, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member), \
+ n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#define list_for_each_entry_safe_continue(pos, n, head, member) \
+ for (pos = list_entry(pos->member.next, typeof(*pos), member), \
+ n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#define list_for_each_entry_safe_from(pos, n, head, member) \
+ for (n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#define list_for_each_entry_safe_reverse(pos, n, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*pos), member), \
+ n = list_entry(pos->member.prev, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+/* Re-derive @n from the current @pos. */
+#define list_safe_reset_next(pos, n, member) \
+ n = list_entry(pos->member.next, typeof(*pos), member)
+
+#endif /* CR_LIST_H_ */
diff --git a/include/parasite-syscall.h b/include/parasite-syscall.h
new file mode 100644
index 000000000..85b2ad55c
--- /dev/null
+++ b/include/parasite-syscall.h
@@ -0,0 +1,46 @@
+#ifndef PARASITE_SYSCALL_H_
+#define PARASITE_SYSCALL_H_
+
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#include "compiler.h"
+#include "types.h"
+#include "list.h"
+#include "crtools.h"
+
+/* NOTE(review): presumably a size in bytes; its users are not in this header. */
+#define BUILTIN_SYSCALL_SIZE 8
+
+/* parasite control block */
+struct parasite_ctl {
+ pid_t pid; /* process where we live */
+ struct vma_area *vma_area; /* our space */
+ unsigned long parasite_ip; /* service routine start ip */
+ unsigned long parasite_complete_ip; /* where we end execution */
+ unsigned long addr_cmd; /* addr for command */
+ unsigned long addr_args; /* address for arguments */
+};
+
+/*
+ * Prototypes only; the implementations are not part of this header.
+ * The *_seized helpers take the target @pid plus register sets
+ * (user_regs_struct_t) for that task.
+ * NOTE(review): return-value conventions are not visible from here --
+ * check the definitions before relying on them.
+ */
+int can_run_syscall(unsigned long ip, unsigned long start, unsigned long end);
+
+void *mmap_seized(pid_t pid, user_regs_struct_t *regs,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset);
+
+int munmap_seized(pid_t pid, user_regs_struct_t *regs,
+ void *addr, size_t length);
+int kill_seized(pid_t pid, user_regs_struct_t *where);
+
+
+int syscall_seized(pid_t pid,
+ user_regs_struct_t *where,
+ user_regs_struct_t *params,
+ user_regs_struct_t *result);
+
+int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_area_list,
+ struct cr_fdset *cr_fdset, int fd_type);
+
+struct parasite_ctl *parasite_infect_seized(pid_t pid, void *addr_hint, struct list_head *vma_area_list);
+int parasite_cure_seized(struct parasite_ctl **p_ctrl, struct list_head *vma_area_list);
+
+#endif /* PARASITE_SYSCALL_H_ */
diff --git a/include/parasite.h b/include/parasite.h
new file mode 100644
index 000000000..97d4dfc05
--- /dev/null
+++ b/include/parasite.h
@@ -0,0 +1,68 @@
+#ifndef CR_PARASITE_H_
+#define CR_PARASITE_H_
+
+#include "compiler.h"
+#include "syscall.h"
+#include "image.h"
+
+/*
+ * Attribute shorthands that place a symbol into one of the parasite
+ * sections and mark it __used (__used/__section are not defined in this
+ * header; presumably they come from compiler.h -- confirm).
+ */
+#define __parasite_head __used __section(.parasite.head.text)
+#define __parasite_text __used __section(.parasite.text)
+#define __parasite_stack __used __section(.parasite.stack)
+
+/* Sizing constants.  NOTE(review): units are presumably bytes -- confirm. */
+#define PARASITE_STACK_SIZE 2048
+#define PARASITE_ARG_SIZE 256
+#define PARASITE_BRK_SIZE 32768
+
+#define PARASITE_MAX_SIZE (64 << 10)
+
+/*
+ * Parasite-specific error codes, used for diagnostics.  The values are
+ * parenthesised so the leading minus sign cannot combine with
+ * whatever precedes the macro at the point of use.
+ */
+#define PARASITE_ERR_FAIL		(-1024)
+#define PARASITE_ERR_OPEN		(-1025)
+#define PARASITE_ERR_MMAP		(-1026)
+#define PARASITE_ERR_MINCORE		(-1027)
+#define PARASITE_ERR_MUNMAP		(-1028)
+#define PARASITE_ERR_CLOSE		(-1029)
+#define PARASITE_ERR_WRITE		(-1030)
+#define PARASITE_ERR_MPROTECT		(-1031)
+#define PARASITE_ERR_CORE_VMA		(-1032)
+#define PARASITE_ERR_CORE_PAGE		(-1033)
+
+/*
+ * Parasite command codes, numbered from 0 (PARASITE_CMD_NONE).
+ * PARASITE_CMD_MAX is the last enumerator and therefore equals the
+ * number of commands defined above it.
+ */
+enum {
+ PARASITE_CMD_NONE,
+ PARASITE_CMD_KILLME,
+ PARASITE_CMD_PINGME,
+ PARASITE_CMD_DUMPPAGES,
+ PARASITE_CMD_RESTORECORE,
+
+ PARASITE_CMD_MAX,
+};
+
+/*
+ * Generic command descriptor: a command code plus an opaque argument
+ * buffer and its size.
+ */
+typedef struct {
+ unsigned long command;
+ unsigned long args_size;
+ void *args;
+} parasite_args_t;
+
+/*
+ * Arguments for the page-dumping command: the area to dump, a
+ * dumped-pages counter, and the fd/open parameters.
+ * NOTE(review): which side fills in fd vs. open_path is not visible
+ * from this header -- confirm against parasite.c.
+ */
+typedef struct {
+ struct vma_entry vma_entry;
+ unsigned long nrpages_dumped; /* how many pages are dumped */
+ unsigned long fd;
+ unsigned long open_mode;
+ unsigned long open_flags;
+ char open_path[64];
+} parasite_args_cmd_dumppages_t;
+
+/*
+ * Some useful offsets
+ */
+
+/*
+ * Each macro adds a parasite_blob_offset__* constant to @start.
+ * Those constants are not defined in this header; presumably they are
+ * generated at build time (see gen-offsets.sh) -- confirm.
+ */
+#define PARASITE_ARGS_ADDR(start) \
+ ((start) + parasite_blob_offset__parasite_args)
+#define PARASITE_CMD_ADDR(start) \
+ ((start) + parasite_blob_offset__parasite_cmd)
+#define PARASITE_HEAD_ADDR(start) \
+ ((start) + parasite_blob_offset__parasite_head_start)
+#define PARASITE_COMPLETE_ADDR(start) \
+ ((start) + parasite_blob_offset__parasite_service_complete)
+
+#endif /* CR_PARASITE_H_ */
diff --git a/include/rbtree.h b/include/rbtree.h
new file mode 100644
index 000000000..af8e51cc0
--- /dev/null
+++ b/include/rbtree.h
@@ -0,0 +1,79 @@
+/*
+ * RBtree implementation adopted from the Linux
+ * kernel sources.
+ */
+
+#ifndef _LINUX_RBTREE_H
+#define _LINUX_RBTREE_H
+
+#include <stddef.h>
+
+/*
+ * Colour values and the mask of the low bits of rb_parent_color that do
+ * not belong to the parent pointer.
+ */
+#define RB_RED 0
+#define RB_BLACK 1
+#define RB_COLOR_MASK 3
+
+/*
+ * Tree node.  rb_parent_color packs the parent pointer together with
+ * the colour in its low bits (see rb_parent()/rb_color()); the
+ * aligned(sizeof(long)) attribute keeps those low pointer bits free.
+ */
+struct rb_node {
+ unsigned long rb_parent_color;
+ struct rb_node *rb_right;
+ struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+
+/* Tree root: just a pointer to the top node (NULL when empty). */
+struct rb_root {
+ struct rb_node *rb_node;
+};
+
+
+/* Parent pointer: rb_parent_color with the RB_COLOR_MASK bits cleared. */
+#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_COLOR_MASK))
+/* Colour: bit 0 of rb_parent_color (RB_BLACK set, RB_RED clear). */
+#define rb_color(r) ((r)->rb_parent_color & RB_BLACK)
+#define rb_is_red(r) (!rb_color(r))
+#define rb_is_black(r) rb_color(r)
+#define rb_set_red(r) do { (r)->rb_parent_color &= ~RB_BLACK; } while (0)
+#define rb_set_black(r) do { (r)->rb_parent_color |= RB_BLACK; } while (0)
+
+/* Store @p as the parent of @rb, keeping the RB_COLOR_MASK bits as they are. */
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+	unsigned long low_bits = rb->rb_parent_color & RB_COLOR_MASK;
+
+	rb->rb_parent_color = low_bits | (unsigned long)p;
+}
+
+/* Clear the colour bit of @rb and OR @color into rb_parent_color. */
+static inline void rb_set_color(struct rb_node *rb, int color)
+{
+	unsigned long cleared = rb->rb_parent_color & ~RB_BLACK;
+
+	rb->rb_parent_color = cleared | color;
+}
+
+/* Compound literal for an empty tree. */
+#define RB_ROOT (struct rb_root) { NULL, }
+/*
+ * Map an rb_node pointer to its containing object.  container_of() is
+ * not defined in this header; the includer has to provide it.
+ */
+#define rb_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
+/* A node whose parent pointer refers to itself counts as "empty". */
+#define RB_EMPTY_NODE(node) (rb_parent(node) == node)
+#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))
+
+/*
+ * Reset @rb: no children, colour bits zeroed, and the parent pointer
+ * aimed at the node itself (the state RB_EMPTY_NODE() tests for).
+ */
+static inline void rb_init_node(struct rb_node *rb)
+{
+	rb->rb_parent_color = 0;
+	rb->rb_left = rb->rb_right = NULL;
+
+	rb_set_parent(rb, rb);
+}
+
+/* Prototypes only; the definitions are outside this header (rbtree.c is part of the same commit). */
+void rb_insert_color(struct rb_node *, struct rb_root *);
+void rb_erase(struct rb_node *, struct rb_root *);
+
+struct rb_node *rb_next(const struct rb_node *node);
+struct rb_node *rb_prev(const struct rb_node *node);
+struct rb_node *rb_first(const struct rb_root *node);
+struct rb_node *rb_last(const struct rb_root *node);
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root);
+
+/*
+ * Attach @node below @parent through the slot @rb_link: the node gets
+ * @parent as its parent (colour bits zero), no children, and the slot
+ * is pointed at it.
+ */
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+				struct rb_node **rb_link)
+{
+	node->rb_parent_color = (unsigned long)parent;
+	node->rb_right = NULL;
+	node->rb_left = NULL;
+
+	*rb_link = node;
+}
+
+#endif /* _LINUX_RBTREE_H */
diff --git a/include/syscall.h b/include/syscall.h
new file mode 100644
index 000000000..6691171fd
--- /dev/null
+++ b/include/syscall.h
@@ -0,0 +1,181 @@
+#ifndef CR_SYSCALL_H_
+#define CR_SYSCALL_H_
+
+#include <sys/types.h>
+
+#include "compiler.h"
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Raw x86-64 system call with no arguments.
+ *
+ * @nr goes in rax and the kernel's result comes back in rax (negative
+ * errno on failure).  The "syscall" instruction itself overwrites rcx
+ * and r11, so both are declared as clobbered.
+ */
+static long syscall0(int nr)
+{
+	long ret;
+
+	asm volatile("syscall"
+		     : "=a" (ret)
+		     : "a" ((long)nr)
+		     : "rcx", "r11", "memory");
+	return ret;
+}
+
+/*
+ * Raw x86-64 system call with one argument (rdi).
+ *
+ * Returns the kernel's result from rax (negative errno on failure).
+ * rcx and r11 are destroyed by the "syscall" instruction and are
+ * therefore listed as clobbers.
+ */
+static long syscall1(int nr, unsigned long arg0)
+{
+	long ret;
+
+	asm volatile("syscall"
+		     : "=a" (ret)
+		     : "a" ((long)nr), "D" (arg0)
+		     : "rcx", "r11", "memory");
+	return ret;
+}
+
+/*
+ * Raw x86-64 system call with two arguments (rdi, rsi).
+ *
+ * Returns the kernel's result from rax (negative errno on failure).
+ * rcx and r11 are destroyed by the "syscall" instruction and are
+ * therefore listed as clobbers.
+ */
+static long syscall2(int nr, unsigned long arg0, unsigned long arg1)
+{
+	long ret;
+
+	asm volatile("syscall"
+		     : "=a" (ret)
+		     : "a" ((long)nr), "D" (arg0), "S" (arg1)
+		     : "rcx", "r11", "memory");
+	return ret;
+}
+
+/*
+ * Raw x86-64 system call with three arguments (rdi, rsi, rdx).
+ *
+ * Returns the kernel's result from rax (negative errno on failure).
+ * rcx and r11 are destroyed by the "syscall" instruction and are
+ * therefore listed as clobbers.
+ */
+static long syscall3(int nr, unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2)
+{
+	long ret;
+
+	asm volatile("syscall"
+		     : "=a" (ret)
+		     : "a" ((long)nr), "D" (arg0), "S" (arg1), "d" (arg2)
+		     : "rcx", "r11", "memory");
+	return ret;
+}
+
+/*
+ * Raw x86-64 system call with four arguments (rdi, rsi, rdx, r10).
+ *
+ * The fourth argument has no dedicated constraint letter, so it is bound
+ * to r10 through a local register variable.  That variable must appear
+ * as an asm operand: without it the compiler is free to drop the
+ * assignment or reuse r10 before the instruction executes.
+ * rcx and r11 are destroyed by "syscall" and are listed as clobbers.
+ */
+static long syscall4(int nr, unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2, unsigned long arg3)
+{
+	register unsigned long r10 asm("r10") = arg3;
+	long ret;
+
+	asm volatile("syscall"
+		     : "=a" (ret)
+		     : "a" ((long)nr), "D" (arg0), "S" (arg1), "d" (arg2),
+		       "r" (r10)
+		     : "rcx", "r11", "memory");
+	return ret;
+}
+
+/*
+ * Raw x86-64 system call with five arguments (rdi, rsi, rdx, r10, r8).
+ *
+ * r10 and r8 are bound through local register variables and passed to
+ * the asm as explicit "r" operands so the compiler keeps them live up
+ * to the instruction.  rcx and r11 are destroyed by "syscall" and are
+ * listed as clobbers.
+ */
+static long syscall5(int nr, unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2, unsigned long arg3,
+		     unsigned long arg4)
+{
+	register unsigned long r10 asm("r10") = arg3;
+	register unsigned long r8 asm("r8") = arg4;
+	long ret;
+
+	asm volatile("syscall"
+		     : "=a" (ret)
+		     : "a" ((long)nr), "D" (arg0), "S" (arg1), "d" (arg2),
+		       "r" (r10), "r" (r8)
+		     : "rcx", "r11", "memory");
+	return ret;
+}
+
+/*
+ * Raw x86-64 system call with six arguments
+ * (rdi, rsi, rdx, r10, r8, r9).
+ *
+ * r10, r8 and r9 are bound through local register variables and passed
+ * to the asm as explicit "r" operands so the compiler keeps them live
+ * up to the instruction.  rcx and r11 are destroyed by "syscall" and
+ * are listed as clobbers.
+ */
+static long syscall6(int nr, unsigned long arg0, unsigned long arg1,
+		     unsigned long arg2, unsigned long arg3,
+		     unsigned long arg4, unsigned long arg5)
+{
+	register unsigned long r10 asm("r10") = arg3;
+	register unsigned long r8 asm("r8") = arg4;
+	register unsigned long r9 asm("r9") = arg5;
+	long ret;
+
+	asm volatile("syscall"
+		     : "=a" (ret)
+		     : "a" ((long)nr), "D" (arg0), "S" (arg1), "d" (arg2),
+		       "r" (r10), "r" (r8), "r" (r9)
+		     : "rcx", "r11", "memory");
+	return ret;
+}
+
+/*
+ * syscall codes
+ *
+ * Defined inside the CONFIG_X86_64 branch of this header, i.e. these
+ * are the numbers passed in rax to the syscallN() helpers above.
+ */
+#define __NR_read 0
+#define __NR_write 1
+#define __NR_open 2
+#define __NR_close 3
+#define __NR_lseek 8
+#define __NR_mmap 9
+#define __NR_mprotect 10
+#define __NR_munmap 11
+#define __NR_mincore 27
+#define __NR_dup 32
+#define __NR_dup2 33
+#define __NR_pause 34
+#define __NR_nanosleep 35
+#define __NR_getpid 39
+#define __NR_exit 60
+
+/* Issue __NR_pause through syscall0() and hand back its raw result. */
+static unsigned long sys_pause(void)
+{
+	long rc = syscall0(__NR_pause);
+
+	return rc;
+}
+
+/*
+ * Issue __NR_mmap through syscall6().  The raw result is returned as an
+ * unsigned long, so a failure shows up as a very large value rather
+ * than a negative one.
+ */
+static unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot,
+			      unsigned long flags, unsigned long fd, unsigned long offset)
+{
+	unsigned long hint = (unsigned long)addr;
+
+	return syscall6(__NR_mmap, hint, len, prot, flags, fd, offset);
+}
+
+/* Issue __NR_munmap through syscall2() and hand back its raw result. */
+static unsigned long sys_munmap(void *addr,unsigned long len)
+{
+	unsigned long start = (unsigned long)addr;
+
+	return syscall2(__NR_munmap, start, len);
+}
+
+/* Issue __NR_open through syscall3() and hand back its raw result. */
+static long sys_open(const char *filename, unsigned long flags, unsigned long mode)
+{
+	unsigned long path = (unsigned long)filename;
+
+	return syscall3(__NR_open, path, flags, mode);
+}
+
+/* Issue __NR_close through syscall1() and hand back its raw result. */
+static long sys_close(int fd)
+{
+	long rc = syscall1(__NR_close, fd);
+
+	return rc;
+}
+
+/* Issue __NR_write through syscall3() and hand back its raw result. */
+static long sys_write(unsigned long fd, const void *buf, unsigned long count)
+{
+	unsigned long src = (unsigned long)buf;
+
+	return syscall3(__NR_write, fd, src, count);
+}
+
+/* Issue __NR_mincore through syscall3() and hand back its raw result. */
+static long sys_mincore(unsigned long addr, unsigned long size, void *vec)
+{
+	unsigned long out = (unsigned long)vec;
+
+	return syscall3(__NR_mincore, addr, size, out);
+}
+
+/* Issue __NR_lseek through syscall3() and hand back its raw result. */
+static long sys_lseek(unsigned long fd, unsigned long offset, unsigned long origin)
+{
+	long rc = syscall3(__NR_lseek, fd, offset, origin);
+
+	return rc;
+}
+
+/* Issue __NR_mprotect through syscall3() and hand back its raw result. */
+static long sys_mprotect(unsigned long start, unsigned long len, unsigned long prot)
+{
+	long rc = syscall3(__NR_mprotect, start, len, prot);
+
+	return rc;
+}
+
+/* Issue __NR_nanosleep through syscall2() and hand back its raw result. */
+static long sys_nanosleep(struct timespec *req, struct timespec *rem)
+{
+	unsigned long request = (unsigned long)req;
+	unsigned long remain = (unsigned long)rem;
+
+	return syscall2(__NR_nanosleep, request, remain);
+}
+
+/* Issue __NR_read through syscall3() and hand back its raw result. */
+static long sys_read(unsigned long fd, void *buf, unsigned long count)
+{
+	unsigned long dst = (unsigned long)buf;
+
+	return syscall3(__NR_read, fd, dst, count);
+}
+
+#else /* CONFIG_X86_64 */
+# error x86-32 bit mode not yet implemented
+#endif /* CONFIG_X86_64 */
+
+#endif /* CR_SYSCALL_H_ */
diff --git a/include/types.h b/include/types.h
new file mode 100644
index 000000000..5e636df30
--- /dev/null
+++ b/include/types.h
@@ -0,0 +1,132 @@
+#ifndef CR_TYPES_H_
+#define CR_TYPES_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "bitops.h"
+
+/*
+ * Some constants for ptrace, defined here unconditionally (no #ifndef
+ * guards), so they will clash with any system header that defines the
+ * same names as macros with different values.
+ */
+#define PTRACE_SEIZE 0x4206
+#define PTRACE_INTERRUPT 0x4207
+#define PTRACE_LISTEN 0x4208
+
+#define PTRACE_SEIZE_DEVEL 0x80000000
+
+/* PTRACE_EVENT_* codes, numbered 1..7 */
+#define PTRACE_EVENT_FORK 1
+#define PTRACE_EVENT_VFORK 2
+#define PTRACE_EVENT_CLONE 3
+#define PTRACE_EVENT_EXEC 4
+#define PTRACE_EVENT_VFORK_DONE 5
+#define PTRACE_EVENT_EXIT 6
+#define PTRACE_EVENT_STOP 7
+
+/* PTRACE_O_* option bits, one bit each */
+#define PTRACE_O_TRACESYSGOOD 0x00000001
+#define PTRACE_O_TRACEFORK 0x00000002
+#define PTRACE_O_TRACEVFORK 0x00000004
+#define PTRACE_O_TRACECLONE 0x00000008
+#define PTRACE_O_TRACEEXEC 0x00000010
+#define PTRACE_O_TRACEVFORKDONE 0x00000020
+#define PTRACE_O_TRACEEXIT 0x00000040
+
+/* fcntl: fallbacks, used only when the system headers lack the names */
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#ifndef F_SETPIPE_SZ
+# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
+#endif
+#ifndef F_GETPIPE_SZ
+# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
+#endif
+
+/*
+ * clone flags, defined unconditionally.
+ * NOTE(review): CLONE_CHILD_USEPID presumably belongs to the kernel
+ * patch kernel/cr-clone-with-pid-support shipped in this commit --
+ * confirm.
+ */
+#define CLONE_CHILD_USEPID 0x02000000
+#define CLONE_VFORK 0x00004000
+
+/*
+ * Short fixed-width integer aliases.  All of them are derived from the
+ * <stdint.h> exact-width types so the sizes the packed image structures
+ * rely on hold by definition rather than by assumption about the
+ * native int/short/char widths.
+ */
+typedef uint64_t	u64;
+typedef int64_t		s64;
+typedef uint32_t	u32;
+typedef int32_t		s32;
+typedef uint16_t	u16;
+typedef int16_t		s16;
+typedef uint8_t		u8;
+typedef int8_t		s8;
+
+/*
+ * Everything above the low 8 bits of @dev.
+ * NOTE(review): no mask is applied, so any bits above the major field
+ * are included in the result -- confirm the encoding of the values
+ * callers pass in.
+ */
+#define MAJOR(dev) ((dev)>>8)
+
+#ifdef CONFIG_X86_64
+
+/*
+ * General purpose register set for CONFIG_X86_64: 27 unsigned long
+ * members, in the same order as struct user_regs_entry in image.h.
+ */
+typedef struct {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+ unsigned long fs_base;
+ unsigned long gs_base;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+} user_regs_struct_t;
+
+/*
+ * FPU register set for CONFIG_X86_64; same members, in the same order,
+ * as struct user_fpregs_entry in image.h (this one is not __packed).
+ */
+typedef struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* Note this is not the same as
+ the 32bit/x87/FSAVE twd */
+ unsigned short fop;
+ u64 rip;
+ u64 rdp;
+ u32 mxcsr;
+ u32 mxcsr_mask;
+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+ u32 padding[24];
+} user_fpregs_struct_t;
+
+#else /* CONFIG_X86_64 */
+
+/*
+ * General purpose register set used when CONFIG_X86_64 is not defined:
+ * 17 unsigned long members.
+ */
+typedef struct {
+ unsigned long bx;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long bp;
+ unsigned long ax;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+} user_regs_struct_t;
+
+#endif /* CONFIG_X86_64 */
+
+/* Fallback, used only when nothing else has defined PAGE_SIZE. */
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#endif /* CR_TYPES_H_ */
diff --git a/include/util.h b/include/util.h
new file mode 100644
index 000000000..86d71e06f
--- /dev/null
+++ b/include/util.h
@@ -0,0 +1,178 @@
+#ifndef UTIL_H_
+#define UTIL_H_
+
+/*
+ * Some bits are stolen from perf and kvm tools
+ */
+#include <string.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include <sys/types.h>
+
+#include "compiler.h"
+#include "types.h"
+
+/* printf-style output routine; defined outside this header. */
+extern void printk(const char *format, ...);
+
+/*
+ * Logging front ends over printk().  pr_error/pr_panic prefix the
+ * message with the source file and line of the call site; pr_warning
+ * only adds a "Warning: " prefix; pr_info passes the format through.
+ * @fmt must be a string literal for the prefixed variants (it is
+ * concatenated with the prefix).
+ */
+#define pr_info(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#define pr_error(fmt, ...) printk("Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
+#define pr_panic(fmt, ...) printk("PANIC (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
+#define pr_warning(fmt, ...) printk("Warning: " fmt, ##__VA_ARGS__)
+
+/*
+ * Jump-on-error helpers.  All of them contain a hidden "goto": the
+ * @label argument must name a label in the calling function.
+ */
+
+/* Log the call site, then jump to @label. */
+#define pr_error_jmp(label) \
+ do { \
+ printk("EJMP: %s:%d\n", __FILE__, __LINE__); \
+ goto label; \
+ } while (0)
+
+/* Jump to @label when @code evaluates to non-zero. */
+#define jerr(code, label) \
+ do { \
+ if ((code)) \
+ pr_error_jmp(label); \
+ } while (0)
+
+/*
+ * Jump to @label when "(code) cond" is true; @cond is pasted textually
+ * after the parenthesised @code, e.g. an operator plus operand.
+ */
+#define jerr_cond(code, cond, label) \
+ do { \
+ if ((code) cond) \
+ pr_error_jmp(label); \
+ } while (0)
+
+/* Store @code into @rc, then jump to @label when @rc is non-zero. */
+#define jerr_rc(code, rc, label) \
+ do { \
+ rc = (code); \
+ if (rc) \
+ pr_error_jmp(label); \
+ } while (0)
+
+/*
+ * Debug output is compiled out: the "#if 0" branch holds the real
+ * implementation, the active branch expands pr_debug() to nothing, so
+ * its arguments are neither evaluated nor type-checked.
+ */
+#if 0
+#define pr_debug(fmt, ...) \
+ do { \
+ printk("%s (%s:%d): " fmt, \
+ __func__, __FILE__, __LINE__, \
+ ##__VA_ARGS__); \
+ } while (0)
+#else
+#define pr_debug(fmt, ...)
+#endif
+
+/* Print the message with the call site's file/line, then exit(1). */
+#define die(fmt, ...) \
+ do { \
+ printk("die (%s:%d): " fmt, __FILE__, \
+ __LINE__, ##__VA_ARGS__); \
+ exit(1); \
+ } while (0)
+
+/*
+ * pr_error() with strerror(errno) printed in front of the message.
+ * NOTE(review): this header does not include <errno.h> itself, so the
+ * including file has to make errno visible.
+ */
+#define pr_perror(fmt, ...) \
+ do { \
+ pr_error("%s: " fmt, strerror(errno), \
+ ##__VA_ARGS__); \
+ } while (0)
+
+/* Send SIGSTOP / SIGCONT to @pid; both evaluate to kill()'s return value. */
+#define stop_task(pid) kill(pid, SIGSTOP)
+#define continue_task(pid) kill(pid, SIGCONT)
+
+/*
+ * read()/write() helpers.  The *_safe forms go through jerr() and so
+ * contain a hidden "goto @err" that fires whenever the transferred
+ * byte count differs from the requested one (short transfers count as
+ * errors).  This header does not include <unistd.h>; the includer has
+ * to provide read()/write().
+ */
+
+/* Write the object @ptr points to; evaluates to write()'s return value. */
+#define write_ptr(fd, ptr) \
+ write(fd, (ptr), sizeof(*(ptr)))
+
+#define write_ptr_safe(fd, ptr, err) \
+ jerr(write_ptr(fd, ptr) != sizeof(*(ptr)), err)
+
+#define write_safe(fd, ptr, size, err) \
+ jerr(write(fd, (ptr), (size)) != (size), err)
+
+/* Write an immediate value by copying it into a temporary first. */
+#define write_safe_imm(fd, imm, err) \
+ do { \
+ typeof(imm) x__ = imm; \
+ write_ptr_safe(fd, &x__, err); \
+ } while (0)
+
+#define read_safe(fd, ptr, size, err) \
+ jerr(read(fd, ptr, (size)) != (size), err)
+
+#define read_ptr_safe(fd, ptr, err) \
+ jerr(read(fd, ptr, sizeof(*(ptr))) != sizeof(*(ptr)), err)
+
+/*
+ * Like read_safe() but tells end-of-file apart: a return of 0 jumps to
+ * @eof, any other count different from @size jumps to @err (without
+ * the EJMP log line).  The raw return value is left in @rc.
+ */
+#define read_safe_eof(fd, ptr, size, rc, err, eof) \
+ do { \
+ rc = read(fd, ptr, (size)); \
+ if (!rc) \
+ goto eof; \
+ if (rc != (size)) \
+ goto err; \
+ } while (0)
+
+#define read_ptr_safe_eof(fd, ptr, rc, err, eof) \
+ read_safe_eof(fd, ptr, sizeof(*(ptr)), rc, err, eof)
+
+/*
+ * Prototypes only; the definitions are outside this header.
+ * NOTE(review): return-value conventions and any alignment requirement
+ * on @bytes are not visible from here -- check the definitions.
+ */
+int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
+int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes);
+int ptrace_show_area(pid_t pid, void *addr, long bytes);
+int ptrace_show_area_r(pid_t pid, void *addr, long bytes);
+
+int seize_task(pid_t pid);
+int unseize_task(pid_t pid);
+
+void printk_registers(user_regs_struct_t *regs);
+void printk_siginfo(siginfo_t *siginfo);
+
+/* Forward declarations; the full types are defined elsewhere. */
+struct vma_area;
+struct list_head;
+
+void printk_vma(struct vma_area *vma_area);
+
+/* A special marker: an area whose start and end are both 0 */
+#define is_ending_vma(vma) ((vma)->start == 0 && (vma)->end == 0)
+
+/*
+ * Print every vma_area linked on @head through its "list" member.
+ * Declares a local named "vma", and needs list.h and the full
+ * struct vma_area definition at the point of use.
+ */
+#define pr_info_vma_list(head) \
+ do { \
+ struct vma_area *vma; \
+ list_for_each_entry(vma, head, list) \
+ pr_info_vma(vma); \
+ } while (0)
+
+/*
+ * Allocate a zero-filled struct vma_area via xzalloc() and set its
+ * shmid, vm_file_fd and vma.fd members to -1.  Evaluates to the new
+ * pointer, or NULL when the allocation failed (GNU statement
+ * expression).
+ */
+#define alloc_vma_area() \
+ ({ \
+ struct vma_area *p__ = xzalloc(sizeof(*p__)); \
+ if (p__) { \
+ p__->shmid = -1; \
+ p__->vm_file_fd = -1; \
+ p__->vma.fd = -1; \
+ } \
+ p__; \
+ })
+
+/* pr_info-style aliases for the printk_* dump routines declared above. */
+#define pr_info_vma(vma_area) printk_vma(vma_area)
+#define pr_info_registers(regs) printk_registers(regs)
+#define pr_info_siginfo(siginfo) printk_siginfo(siginfo)
+
+/* Prototypes only; the definitions are outside this header. */
+int reopen_fd_as(int new_fd, int old_fd);
+int parse_maps(pid_t pid, struct list_head *vma_list);
+
+/*
+ * Common body of the x*alloc wrappers: call @op with the remaining
+ * arguments and, when it returns NULL, log the calling function and
+ * the requested @size.  Evaluates to the pointer @op returned, so
+ * callers still have to check for NULL.
+ */
+#define __xalloc(op, size, ...) \
+ ({ \
+ void *___p = op( __VA_ARGS__ ); \
+ if (!___p) \
+ pr_error("%s: Can't allocate %li bytes\n", \
+ __func__, (long)(size)); \
+ ___p; \
+ })
+
+/* malloc / calloc(1, size) / realloc with the failure message above. */
+#define xmalloc(size) __xalloc(malloc, size, size)
+#define xzalloc(size) __xalloc(calloc, size, 1, size)
+#define xrealloc(p, size) __xalloc(realloc, size, p, size)
+
+/*
+ * Release @p.  free() already accepts NULL, so no guard is needed; a
+ * plain call expression also avoids the dangling-else trap of a bare
+ * "if" inside a macro and evaluates @p exactly once.
+ */
+#define xfree(p)		free(p)
+
+/*
+ * Resize the buffer *@pptr to @size bytes via xrealloc().
+ *
+ * Evaluates to 0 on success, with *@pptr updated to the new block, or
+ * to -1 on failure, in which case *@pptr is left untouched and still
+ * owns the old block.  @pptr is evaluated once and may be any pointer
+ * expression (e.g. "bufs + 1").
+ */
+#define xrealloc_safe(pptr, size)					\
+	({								\
+		int __ret = -1;						\
+		typeof(pptr) __pp = (pptr);				\
+		void *__new = xrealloc(*__pp, size);			\
+		if (__new) {						\
+			*__pp = __new;					\
+			__ret = 0;					\
+		}							\
+		__ret;							\
+	})
+
+#endif /* UTIL_H_ */
diff --git a/kernel/binfmt-elf-for-cr-4 b/kernel/binfmt-elf-for-cr-4
new file mode 100644
index 000000000..b7218fbf6
--- /dev/null
+++ b/kernel/binfmt-elf-for-cr-4
@@ -0,0 +1,636 @@
+elf: Add support for loading files
+
+This patch adds the ability to run checkpoint files by enhancing
+the ELF file format.
+
+Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
+---
+ arch/x86/include/asm/elf.h | 3
+ arch/x86/vdso/vma.c | 22 ++
+ fs/binfmt_elf.c | 404 ++++++++++++++++++++++++++++++++++++++++++++-
+ include/linux/elf_ckpt.h | 135 +++++++++++++++
+ 4 files changed, 562 insertions(+), 2 deletions(-)
+
+Index: linux-2.6.git/arch/x86/include/asm/elf.h
+===================================================================
+--- linux-2.6.git.orig/arch/x86/include/asm/elf.h
++++ linux-2.6.git/arch/x86/include/asm/elf.h
+@@ -314,7 +314,8 @@ struct linux_binprm;
+ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp);
+-
++extern int arch_setup_additional_pages_at(struct linux_binprm *bprm,
++ void *addr, int uses_interp);
+ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
+ #define compat_arch_setup_additional_pages syscall32_setup_pages
+
+Index: linux-2.6.git/arch/x86/vdso/vma.c
+===================================================================
+--- linux-2.6.git.orig/arch/x86/vdso/vma.c
++++ linux-2.6.git/arch/x86/vdso/vma.c
+@@ -137,6 +137,28 @@ up_fail:
+ return ret;
+ }
+
++/*
++ * Install the vDSO pages into the current mm at the caller-supplied
++ * address @addr. Does nothing and returns 0 when the vDSO is disabled.
++ * On failure the recorded vDSO address is reset to NULL and the error
++ * from install_special_mapping() is returned.
++ */
++int arch_setup_additional_pages_at(struct linux_binprm *bprm, void *addr, int uses_interp)
++{
++	const unsigned long vdso_vm_flags = VM_READ | VM_EXEC |
++					    VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
++					    VM_ALWAYSDUMP;
++	struct mm_struct *mm;
++	int err = 0;
++
++	if (vdso_enabled) {
++		mm = current->mm;
++
++		down_write(&mm->mmap_sem);
++		/* Publish the address first, roll it back if mapping fails */
++		mm->context.vdso = addr;
++		err = install_special_mapping(mm, (unsigned long)addr,
++					      vdso_size, vdso_vm_flags,
++					      vdso_pages);
++		if (err)
++			mm->context.vdso = NULL;
++		up_write(&mm->mmap_sem);
++	}
++
++	return err;
++}
++
+ static __init int vdso_setup(char *s)
+ {
+ vdso_enabled = simple_strtoul(s, NULL, 0);
+Index: linux-2.6.git/fs/binfmt_elf.c
+===================================================================
+--- linux-2.6.git.orig/fs/binfmt_elf.c
++++ linux-2.6.git/fs/binfmt_elf.c
+@@ -36,6 +36,11 @@
+ #include <asm/param.h>
+ #include <asm/page.h>
+
++#include <linux/elf_ckpt.h>
++#include <linux/flex_array.h>
++#include <asm/tlbflush.h>
++#include <asm/desc.h>
++
+ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+ static int load_elf_library(struct file *);
+ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
+@@ -556,6 +561,395 @@ static unsigned long randomize_stack_top
+ #endif
+ }
+
++#ifdef CONFIG_X86_64
++
++/*
++ * load_elf_ckpt - replace the current task image with the one described
++ * by an ET_CKPT checkpoint file.
++ *
++ * @bprm:     binary being executed; checkpoint data is read from bprm->file
++ * @regs:     user register frame, filled in from the saved core entry
++ * @elf_ex:   ELF header of the checkpoint file
++ * @elf_phdr: program header table holding elf_ex->e_phnum entries
++ *
++ * The program headers must carry at least one PT_CKPT_VMA entry plus a
++ * PT_CKPT_CORE and a PT_CKPT_PAGES entry. Every area flagged
++ * VMA_AREA_REGULAR is mapped at its recorded address, the vDSO is
++ * installed at its recorded address, the page dump is copied in and
++ * finally registers and TLS descriptors are loaded from the core entry.
++ *
++ * Returns 0 on success or a negative errno.
++ */
++static int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
++			 struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
++{
++	struct thread_struct *thread = &current->thread;
++	struct elf_phdr *elf_phdr_pages;
++	struct elf_phdr *elf_phdr_core;
++	struct flex_array *fa = NULL;
++	struct vma_entry *vma_entry_ptr;
++	int nr_vma_found, nr_vma_mapped;
++	struct vma_entry vma_entry;
++	struct file *file = NULL;
++	unsigned long map_addr;
++
++	unsigned long start_code, end_code, start_data, end_data;
++	unsigned long start_brk, brk, start_stack;
++	unsigned long vdso;
++
++	struct core_entry core_entry;
++	int i, ret = -ENOEXEC;
++	loff_t off;
++
++	int cpu, seg;
++
++	BUILD_BUG_ON(CKPT_GDT_ENTRY_TLS_ENTRIES != GDT_ENTRY_TLS_ENTRIES);
++	BUILD_BUG_ON(CKPT_PAGE_SIZE != PAGE_SIZE);
++
++	elf_phdr_core	= NULL;
++	elf_phdr_pages	= NULL;
++	nr_vma_found	= 0;
++	nr_vma_mapped	= 0;
++
++	start_code	= -1UL;
++	end_code	= 0;
++
++	start_data	= -1UL;
++	end_data	= 0;
++
++	start_stack	= -1UL;
++	start_brk	= -1UL;
++	brk		= -1UL;
++
++	vdso		= -1UL;
++
++	/* One slot per program header is enough for all VMA entries */
++	fa = flex_array_alloc(sizeof(vma_entry), elf_ex->e_phnum, GFP_KERNEL);
++	if (!fa || flex_array_prealloc(fa, 0, elf_ex->e_phnum, GFP_KERNEL)) {
++		ret = -ENOMEM;
++		if (fa) {
++			flex_array_free(fa);
++			fa = NULL;
++		}
++		/* Bail out on either failure, not only when fa was allocated */
++		goto out;
++	}
++
++	/* Flush all traces of the currently running executable */
++	ret = flush_old_exec(bprm);
++	if (ret)
++		goto out;
++
++	/* No return point */
++	current->flags &= ~PF_FORKNOEXEC;
++	current->mm->def_flags = 0;
++
++	/*
++	 * We don't care about parameters passed (such as argc, argv, env)
++	 * when executing a checkpoint file because we're going to
++	 * substitute everything anyway -- so drop any previous memory
++	 * mappings.
++	 */
++	down_write(&current->mm->mmap_sem);
++	do_munmap(current->mm, 0, TASK_SIZE);
++	up_write(&current->mm->mmap_sem);
++
++	SET_PERSONALITY(*elf_ex);
++
++	for (i = 0; i < elf_ex->e_phnum; i++) {
++
++		switch (elf_phdr[i].p_type) {
++		case PT_CKPT_VMA:
++			ret = kernel_read(bprm->file, elf_phdr[i].p_offset,
++					  (char *)&vma_entry, sizeof(vma_entry));
++			if (ret != sizeof(vma_entry)) {
++				pr_err("elf-ckpt: Can't read vma_entry\n");
++				ret = -EIO;
++				goto out;
++			}
++			/*
++			 * Store entries densely: the loops below walk
++			 * slots 0..nr_vma_found-1, not header indices.
++			 */
++			if (flex_array_put(fa, nr_vma_found, &vma_entry, GFP_KERNEL))
++				BUG();
++
++			/* We need to know if there is executable stack */
++			if (vma_entry.status & VMA_AREA_STACK) {
++				if (vma_entry.flags & PROT_EXEC)
++					current->personality |= READ_IMPLIES_EXEC;
++			}
++
++			nr_vma_found++;
++			continue;
++		case PT_CKPT_CORE:
++			elf_phdr_core = &elf_phdr[i];
++			continue;
++		case PT_CKPT_PAGES:
++			elf_phdr_pages = &elf_phdr[i];
++			continue;
++		default:
++			continue;
++		}
++	}
++
++	/* Be sure it has the file structure we expect to see. */
++	if (!elf_phdr_pages || !elf_phdr_core || !nr_vma_found) {
++		send_sig(SIGKILL, current, 0);
++		ret = -ENOEXEC;
++		goto out;
++	}
++
++	/*
++	 * VMA randomization still needs to be set (just in case if
++	 * the program we restore will exec something else later).
++	 */
++	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
++		current->flags |= PF_RANDOMIZE;
++
++	setup_new_exec(bprm);
++
++	current->mm->free_area_cache = current->mm->mmap_base;
++	current->mm->cached_hole_size = 0;
++
++	for (i = 0; i < nr_vma_found; i++) {
++		vma_entry_ptr = flex_array_get(fa, i);
++
++		if (vma_entry_ptr->status & VMA_AREA_HEAP) {
++			start_brk = vma_entry_ptr->start;
++			/* Best guess: the break sits at the end of the heap area */
++			brk = vma_entry_ptr->end;
++		}
++
++		if (vma_entry_ptr->status & VMA_AREA_VDSO) {
++			vdso = vma_entry_ptr->start;
++		}
++
++		if (!(vma_entry_ptr->status & VMA_AREA_REGULAR))
++			continue;
++
++		if (vma_entry_ptr->fd != -1) {
++			file = fget((unsigned int)vma_entry_ptr->fd);
++			if (!file) {
++				send_sig(SIGKILL, current, 0);
++				ret = -EBADF;
++				goto out_unmap;
++			}
++		} else
++			file = NULL;
++
++		down_write(&current->mm->mmap_sem);
++		map_addr = do_mmap(file,
++				   vma_entry_ptr->start,
++				   vma_entry_ptr->end - vma_entry_ptr->start,
++				   vma_entry_ptr->prot,
++				   vma_entry_ptr->flags | MAP_FIXED,
++				   vma_entry_ptr->pgoff);
++		up_write(&current->mm->mmap_sem);
++
++		if (file) {
++			fput(file);
++			/* ->fd still holds the descriptor number here */
++			do_close((unsigned int)vma_entry_ptr->fd);
++		}
++
++		if (BAD_ADDR(map_addr)) {
++			send_sig(SIGKILL, current, 0);
++			ret = IS_ERR((void *)map_addr) ? PTR_ERR((void*)map_addr) : -EINVAL;
++			goto out_unmap;
++		}
++
++		/*
++		 * FIXME
++		 * Some heuristics to guess the previously loaded real
++		 * elf file structure. Probably these things should
++		 * be exported via /proc somewhere instead.
++		 */
++
++		if (vma_entry_ptr->status & VMA_AREA_STACK) {
++			/* Note if stack is VM_GROWSUP -- it should be reversed */
++			start_stack = vma_entry_ptr->start;
++		}
++
++		if (vma_entry_ptr->prot & PROT_EXEC) {
++			if (start_code > vma_entry_ptr->start)
++				start_code = vma_entry_ptr->start;
++			if (end_code < vma_entry_ptr->end)
++				end_code = vma_entry_ptr->end;
++		} else {
++			/*
++			 * Neither .bss nor .data was being file mapped.
++			 * FIXME: .rodata are loaded by interp.
++			 */
++			if (!file) {
++				if (vma_entry_ptr->prot & (PROT_WRITE)) {
++					if (start_data > vma_entry_ptr->start)
++						start_data = vma_entry_ptr->start;
++					if (end_data < vma_entry_ptr->end)
++						end_data = vma_entry_ptr->end;
++				}
++			}
++		}
++
++		nr_vma_mapped++;
++	}
++
++#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
++	if (vdso == -1UL) {
++		pr_err("elf-ckpt: Can't find VDSO address\n");
++		ret = -ENOEXEC;
++		goto out_unmap;
++	}
++#endif
++
++	/* Restore core data */
++	ret = kernel_read(bprm->file, elf_phdr_core->p_offset,
++			  (char *)&core_entry, sizeof(core_entry));
++	if (ret != sizeof(core_entry)) {
++		pr_err("elf-ckpt: Can't read core_entry\n");
++		ret = -EIO;
++		goto out_unmap;
++	}
++
++	bprm->p = start_stack;
++
++	current->mm->start_code		= start_code;
++	current->mm->end_code		= end_code;
++	current->mm->start_data		= start_data;
++	current->mm->end_data		= end_data;
++	current->mm->start_stack	= start_stack;
++	current->mm->start_brk		= start_brk;
++	current->mm->brk		= brk;
++
++#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
++	ret = arch_setup_additional_pages_at(bprm, (void *)vdso, 0);
++	if (ret) {
++		pr_err("elf-ckpt: Can't setup additional pages at %lx with %d\n",
++		       vdso, ret);
++		goto out_unmap;
++	}
++#endif
++
++	/*
++	 * Restore pages: the dump is a sequence of (address, page data)
++	 * records terminated by a zero address.
++	 */
++	off = elf_phdr_pages->p_offset;
++	while (1) {
++		struct vm_area_struct *vma;
++		struct page *page;
++		void *page_data;
++		__u64 va;
++
++		ret = kernel_read(bprm->file, off, (char *)&va, sizeof(va));
++		if (ret != sizeof(va)) {
++			pr_err("elf-ckpt: Can't read page virtual address: "
++			       "ret = %d off = %lx\n", ret, (unsigned long)off);
++			ret = -EIO;
++			goto out_unmap;
++		}
++
++		/* End of pages reached */
++		if (!va)
++			break;
++
++		/* VMA lookup and page pinning need mmap_sem held */
++		down_read(&current->mm->mmap_sem);
++		vma = find_vma(current->mm, (unsigned long)va);
++		if (!vma) {
++			up_read(&current->mm->mmap_sem);
++			pr_err("elf-ckpt: No VMA for page: %16lx\n", (unsigned long)va);
++			ret = -ESRCH;
++			goto out_unmap;
++		}
++
++		ret = get_user_pages(current, current->mm, (unsigned long)va,
++				     1, 1, 1, &page, NULL);
++		up_read(&current->mm->mmap_sem);
++		if (ret != 1) {
++			pr_err("elf-ckpt: Can't get user page: %16lx\n", (unsigned long)va);
++			ret = -EFAULT;
++			goto out_unmap;
++		}
++
++		page_data = kmap(page);
++		ret = kernel_read(bprm->file, off + sizeof(va), page_data, PAGE_SIZE);
++		kunmap(page);
++		put_page(page);
++
++		if (ret != PAGE_SIZE) {
++			pr_err("elf-ckpt: Can't read data on page: %16lx\n", (unsigned long)va);
++			ret = -EFAULT;
++			goto out_unmap;
++		}
++
++		off += sizeof(va) + PAGE_SIZE;
++	}
++
++	set_binfmt(&elf_format);
++
++	/*
++	 * Registers setup.
++	 *
++	 * Since we might be modifying MSRs we need
++	 * to be sure the task won't be preempted
++	 * until modification is complete.
++	 */
++	cpu = get_cpu();
++
++	regs->ip	= core_entry.gpregs.ip;
++	regs->sp	= core_entry.gpregs.sp;
++	regs->cs	= core_entry.gpregs.cs;
++	regs->ss	= core_entry.gpregs.ss;
++	regs->flags	= core_entry.gpregs.flags;
++	regs->r15	= core_entry.gpregs.r15;
++	regs->r14	= core_entry.gpregs.r14;
++	regs->r13	= core_entry.gpregs.r13;
++	regs->r12	= core_entry.gpregs.r12;
++	regs->bp	= core_entry.gpregs.bp;
++	regs->bx	= core_entry.gpregs.bx;
++	regs->r11	= core_entry.gpregs.r11;
++	regs->r10	= core_entry.gpregs.r10;
++	regs->r9	= core_entry.gpregs.r9;
++	regs->r8	= core_entry.gpregs.r8;
++	regs->ax	= core_entry.gpregs.ax;
++	regs->cx	= core_entry.gpregs.cx;
++	regs->dx	= core_entry.gpregs.dx;
++	regs->si	= core_entry.gpregs.si;
++	regs->di	= core_entry.gpregs.di;
++	regs->orig_ax	= core_entry.gpregs.orig_ax;
++
++	thread->usersp	= core_entry.gpregs.sp;
++	thread->ds	= core_entry.gpregs.ds;
++	thread->es	= core_entry.gpregs.es;
++	thread->fs	= core_entry.gpregs.fs;
++	thread->gs	= core_entry.gpregs.gs;
++
++	thread->fsindex	= thread->fs;
++	thread->gsindex	= thread->gs;
++
++	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
++		thread->tls_array[i].a = core_entry.tls_array[i].a;
++		thread->tls_array[i].b = core_entry.tls_array[i].b;
++	}
++
++	load_TLS(thread, cpu);
++
++	/* Load the selector and read it back to see whether it stuck */
++	seg = thread->fsindex;
++	loadsegment(fs, seg);
++	savesegment(fs, seg);
++
++	if (seg != thread->fsindex) {
++		pr_err("Fixup on FS loading exception: %i %i\n",
++		       thread->fsindex, seg);
++	}
++
++	if (core_entry.gpregs.fs_base)
++		wrmsrl(MSR_FS_BASE, core_entry.gpregs.fs_base);
++
++	if (core_entry.gpregs.gs_base)
++		wrmsrl(MSR_GS_BASE, core_entry.gpregs.gs_base);
++
++	put_cpu();
++
++	ret = 0;
++out:
++	if (fa)
++		flex_array_free(fa);
++	return ret;
++
++out_unmap:
++	/*
++	 * Areas are mapped in array order and only VMA_AREA_REGULAR ones
++	 * are mapped at all, so the first nr_vma_mapped regular entries
++	 * are exactly the ones that have to be undone.
++	 */
++	for (i = 0; i < nr_vma_found && nr_vma_mapped > 0; i++) {
++		vma_entry_ptr = flex_array_get(fa, i);
++		if (!(vma_entry_ptr->status & VMA_AREA_REGULAR))
++			continue;
++		down_write(&current->mm->mmap_sem);
++		do_munmap(current->mm, vma_entry_ptr->start,
++			  vma_entry_ptr->end - vma_entry_ptr->start);
++		up_write(&current->mm->mmap_sem);
++		nr_vma_mapped--;
++	}
++	goto out;
++}
++#else
++/*
++ * Fallback used when CONFIG_X86_64 is not set: checkpoint images
++ * cannot be loaded, so the file is rejected with -ENOEXEC.
++ */
++static int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
++ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
++{
++ return -ENOEXEC;
++}
++#endif
++
+ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+ {
+ struct file *interpreter = NULL; /* to shut gcc up */
+@@ -592,7 +986,9 @@ static int load_elf_binary(struct linux_
+ if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
+ goto out;
+
+- if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
++ if (loc->elf_ex.e_type != ET_EXEC &&
++ loc->elf_ex.e_type != ET_DYN &&
++ loc->elf_ex.e_type != ET_CKPT)
+ goto out;
+ if (!elf_check_arch(&loc->elf_ex))
+ goto out;
+@@ -619,6 +1015,12 @@ static int load_elf_binary(struct linux_
+ goto out_free_ph;
+ }
+
++ if (loc->elf_ex.e_type == ET_CKPT) {
++ retval = load_elf_ckpt(bprm, regs, &loc->elf_ex,
++ (struct elf_phdr *)elf_phdata);
++ goto out_free_ph;
++ }
++
+ elf_ppnt = elf_phdata;
+ elf_bss = 0;
+ elf_brk = 0;
+Index: linux-2.6.git/include/linux/elf_ckpt.h
+===================================================================
+--- /dev/null
++++ linux-2.6.git/include/linux/elf_ckpt.h
+@@ -0,0 +1,135 @@
++#ifndef _LINUX_ELF_CHECKPOINT_H
++#define _LINUX_ELF_CHECKPOINT_H
++
++#include <linux/types.h>
++#include <linux/elf-em.h>
++
++#ifdef __KERNEL__
++
++#include <asm/elf.h>
++
++/*
++ * Elf extension includes new Elf file type
++ * and program header types as well.
++ */
++#define ET_CKPT 5
++
++#define PT_CKPT_OFFSET 0x01010101
++
++#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1)
++#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2)
++#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3)
++
++#define CKPT_PAGE_SIZE 4096
++#define CKPT_GDT_ENTRY_TLS_ENTRIES 3
++
++#define HEADER_VERSION 1
++#define HEADER_ARCH_X86_64 1
++
++#define VMA_AREA_REGULAR (1 << 0)
++#define VMA_AREA_STACK (1 << 1)
++#define VMA_AREA_VSYSCALL (1 << 2)
++#define VMA_AREA_VDSO (1 << 3)
++#define VMA_FORCE_READ (1 << 4)
++#define VMA_AREA_HEAP (1 << 5)
++#define VMA_FILE_PRIVATE (1 << 6)
++#define VMA_FILE_SHARED (1 << 7)
++#define VMA_ANON_SHARED (1 << 8)
++#define VMA_ANON_PRIVATE (1 << 9)
++#define VMA_FORCE_WRITE (1 << 10)
++
++struct vma_entry {
++ __u64 start;
++ __u64 end;
++ __u64 pgoff;
++ __u32 prot;
++ __u32 flags;
++ __u32 status;
++ __u32 pid;
++ __s64 fd;
++ __u64 ino;
++ __u32 dev_maj;
++ __u32 dev_min;
++} __packed;
++
++struct page_entry {
++ __u64 va;
++ __u8 data[CKPT_PAGE_SIZE];
++} __packed;
++
++struct image_header {
++ __u16 version;
++ __u16 arch;
++ __u32 flags;
++} __packed;
++
++struct user_regs_entry {
++ __u64 r15;
++ __u64 r14;
++ __u64 r13;
++ __u64 r12;
++ __u64 bp;
++ __u64 bx;
++ __u64 r11;
++ __u64 r10;
++ __u64 r9;
++ __u64 r8;
++ __u64 ax;
++ __u64 cx;
++ __u64 dx;
++ __u64 si;
++ __u64 di;
++ __u64 orig_ax;
++ __u64 ip;
++ __u64 cs;
++ __u64 flags;
++ __u64 sp;
++ __u64 ss;
++ __u64 fs_base;
++ __u64 gs_base;
++ __u64 ds;
++ __u64 es;
++ __u64 fs;
++ __u64 gs;
++} __packed;
++
++struct desc_struct_entry {
++ union {
++ struct {
++ __u32 a;
++ __u32 b;
++ };
++ struct {
++ __u16 limit0;
++ __u16 base0;
++ unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
++ unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
++ };
++ };
++} __packed;
++
++struct user_fpregs_entry {
++ __u16 cwd;
++ __u16 swd;
++ __u16 twd;
++ __u16 fop;
++ __u64 rip;
++ __u64 rdp;
++ __u32 mxcsr;
++ __u32 mxcsr_mask;
++ __u32 st_space[32];
++ __u32 xmm_space[64];
++ __u32 padding[24];
++} __packed;
++
++/*
++ * Per-task core record of a checkpoint image: image header, general
++ * purpose and FPU register dumps, the TLS descriptors and the task
++ * personality. The TLS slots use the image-format descriptor type
++ * declared in this header rather than the kernel-internal one, so the
++ * record layout does not depend on other headers.
++ */
++struct core_entry {
++	struct image_header		header;
++	struct user_regs_entry		gpregs;
++	struct user_fpregs_entry	fpregs;
++	struct desc_struct_entry	tls_array[CKPT_GDT_ENTRY_TLS_ENTRIES];
++	__u32				personality;
++} __packed;
++
++#endif /* __KERNEL__ */
++
++#endif /* _LINUX_ELF_CHECKPOINT_H */
diff --git a/kernel/cr-clone-with-pid-support b/kernel/cr-clone-with-pid-support
new file mode 100644
index 000000000..cc4caf407
--- /dev/null
+++ b/kernel/cr-clone-with-pid-support
@@ -0,0 +1,172 @@
+Allow processes to be created with specified pid
+
+We will need it to restore processes so they would not
+even notice that they were being checkpointed.
+
+Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
+---
+ include/linux/pid.h | 2 -
+ include/linux/sched.h | 1
+ kernel/fork.c | 10 ++++++-
+ kernel/pid.c | 70 ++++++++++++++++++++++++++++++++++++--------------
+ 4 files changed, 62 insertions(+), 21 deletions(-)
+
+Index: linux-2.6.git/include/linux/pid.h
+===================================================================
+--- linux-2.6.git.orig/include/linux/pid.h
++++ linux-2.6.git/include/linux/pid.h
+@@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr);
+ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
+ int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
+
+-extern struct pid *alloc_pid(struct pid_namespace *ns);
++extern struct pid *alloc_pid(struct pid_namespace *ns, int pid);
+ extern void free_pid(struct pid *pid);
+
+ /*
+Index: linux-2.6.git/include/linux/sched.h
+===================================================================
+--- linux-2.6.git.orig/include/linux/sched.h
++++ linux-2.6.git/include/linux/sched.h
+@@ -23,6 +23,7 @@
+ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
+ /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
+ and is now available for re-use. */
++#define CLONE_CHILD_USEPID 0x02000000 /* use the given pid */
+ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */
+ #define CLONE_NEWIPC 0x08000000 /* New ipcs */
+ #define CLONE_NEWUSER 0x10000000 /* New user namespace */
+Index: linux-2.6.git/kernel/fork.c
+===================================================================
+--- linux-2.6.git.orig/kernel/fork.c
++++ linux-2.6.git/kernel/fork.c
+@@ -1239,8 +1239,16 @@ static struct task_struct *copy_process(
+ goto bad_fork_cleanup_io;
+
+ if (pid != &init_struct_pid) {
++ int want_pid = 0;
++
++ if (clone_flags & CLONE_CHILD_USEPID) {
++ retval = get_user(want_pid, child_tidptr);
++ if (retval)
++ goto bad_fork_cleanup_io;
++ }
++
+ retval = -ENOMEM;
+- pid = alloc_pid(p->nsproxy->pid_ns);
++ pid = alloc_pid(p->nsproxy->pid_ns, want_pid);
+ if (!pid)
+ goto bad_fork_cleanup_io;
+ }
+Index: linux-2.6.git/kernel/pid.c
+===================================================================
+--- linux-2.6.git.orig/kernel/pid.c
++++ linux-2.6.git/kernel/pid.c
+@@ -159,11 +159,55 @@ static void set_last_pid(struct pid_name
+ } while ((prev != last_write) && (pid_before(base, last_write, pid)));
+ }
+
+-static int alloc_pidmap(struct pid_namespace *pid_ns)
++/*
++ * Make sure @map has a bitmap page attached, allocating one on demand.
++ * Returns 0 when map->page is set on exit, -ENOMEM otherwise.
++ */
++static int alloc_pidmap_page(struct pidmap *map)
++{
++	void *fresh;
++
++	if (likely(map->page))
++		return 0;
++
++	fresh = kzalloc(PAGE_SIZE, GFP_KERNEL);
++
++	/* Install our page only if nobody raced with us meanwhile */
++	spin_lock_irq(&pidmap_lock);
++	if (!map->page) {
++		map->page = fresh;
++		fresh = NULL;
++	}
++	spin_unlock_irq(&pidmap_lock);
++
++	/* Still non-NULL here means the race was lost: drop our copy */
++	kfree(fresh);
++
++	return unlikely(!map->page) ? -ENOMEM : 0;
++}
++
++/*
++ * Reserve exactly @pid in the pid bitmap of @pid_ns.
++ *
++ * Returns @pid on success, -EINVAL when @pid is outside (0, pid_max),
++ * -ENOMEM when the bitmap page cannot be allocated and -EBUSY when the
++ * pid is already taken.
++ */
++static int set_pidmap(struct pid_namespace *pid_ns, int pid)
++{
++	int offset;
++	struct pidmap *map;
++
++	/*
++	 * The value comes straight from user space; without this check
++	 * it would index outside of pid_ns->pidmap[].
++	 */
++	if (pid <= 0 || pid >= pid_max)
++		return -EINVAL;
++
++	offset = pid & BITS_PER_PAGE_MASK;
++	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
++
++	if (alloc_pidmap_page(map) < 0)
++		return -ENOMEM;
++
++	if (!test_and_set_bit(offset, map->page)) {
++		atomic_dec(&map->nr_free);
++		return pid;
++	}
++
++	return -EBUSY;
++}
++
++static int alloc_pidmap(struct pid_namespace *pid_ns, int desired_pid)
+ {
+ int i, offset, max_scan, pid, last = pid_ns->last_pid;
+ struct pidmap *map;
+
++ if (desired_pid)
++ return set_pidmap(pid_ns, desired_pid);
++
+ pid = last + 1;
+ if (pid >= pid_max)
+ pid = RESERVED_PIDS;
+@@ -176,22 +220,9 @@ static int alloc_pidmap(struct pid_names
+ */
+ max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
+ for (i = 0; i <= max_scan; ++i) {
+- if (unlikely(!map->page)) {
+- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+- /*
+- * Free the page if someone raced with us
+- * installing it:
+- */
+- spin_lock_irq(&pidmap_lock);
+- if (!map->page) {
+- map->page = page;
+- page = NULL;
+- }
+- spin_unlock_irq(&pidmap_lock);
+- kfree(page);
+- if (unlikely(!map->page))
+- break;
+- }
++ if (alloc_pidmap_page(map) < 0)
++ break;
++
+ if (likely(atomic_read(&map->nr_free))) {
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+@@ -277,7 +308,7 @@ void free_pid(struct pid *pid)
+ call_rcu(&pid->rcu, delayed_put_pid);
+ }
+
+-struct pid *alloc_pid(struct pid_namespace *ns)
++struct pid *alloc_pid(struct pid_namespace *ns, int this_ns_pid)
+ {
+ struct pid *pid;
+ enum pid_type type;
+@@ -291,13 +322,14 @@ struct pid *alloc_pid(struct pid_namespa
+
+ tmp = ns;
+ for (i = ns->level; i >= 0; i--) {
+- nr = alloc_pidmap(tmp);
++ nr = alloc_pidmap(tmp, this_ns_pid);
+ if (nr < 0)
+ goto out_free;
+
+ pid->numbers[i].nr = nr;
+ pid->numbers[i].ns = tmp;
+ tmp = tmp->parent;
++ this_ns_pid = 0;
+ }
+
+ get_pid_ns(ns);
diff --git a/kernel/cr-proc-add-children b/kernel/cr-proc-add-children
new file mode 100644
index 000000000..d307a6024
--- /dev/null
+++ b/kernel/cr-proc-add-children
@@ -0,0 +1,46 @@
+proc: Introduce the Children: line in /proc/<pid>/status
+
+From: Pavel Emelyanov <xemul@parallels.com>
+
+Although the pids of a task's children can be obtained by other means,
+it is more convenient to have them listed this way.
+
+Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
+---
+ fs/proc/array.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+Index: linux-2.6.git/fs/proc/array.c
+===================================================================
+--- linux-2.6.git.orig/fs/proc/array.c
++++ linux-2.6.git/fs/proc/array.c
+@@ -158,6 +158,18 @@ static inline const char *get_task_state
+ return *p;
+ }
+
++/*
++ * Emit the "Children:" line: the pids of all direct children of @p,
++ * as seen from pid namespace @ns, separated by spaces.
++ */
++static void task_children(struct seq_file *m, struct task_struct *p, struct pid_namespace *ns)
++{
++	struct task_struct *child;
++
++	seq_printf(m, "Children:");
++
++	/* The children list is walked under tasklist_lock */
++	read_lock(&tasklist_lock);
++	list_for_each_entry(child, &p->children, sibling) {
++		pid_t nr = pid_nr_ns(task_pid(child), ns);
++
++		seq_printf(m, " %d", nr);
++	}
++	read_unlock(&tasklist_lock);
++
++	seq_putc(m, '\n');
++}
++
+ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *p)
+ {
+@@ -192,6 +204,8 @@ static inline void task_state(struct seq
+ cred->uid, cred->euid, cred->suid, cred->fsuid,
+ cred->gid, cred->egid, cred->sgid, cred->fsgid);
+
++ task_children(m, p, ns);
++
+ task_lock(p);
+ if (p->files)
+ fdt = files_fdtable(p->files);
diff --git a/kernel/cr-proc-map-files-21 b/kernel/cr-proc-map-files-21
new file mode 100644
index 000000000..ccf8fbf94
--- /dev/null
+++ b/kernel/cr-proc-map-files-21
@@ -0,0 +1,522 @@
+fs, proc: Introduce the /proc/<pid>/map_files/ directory v14
+
+From: Pavel Emelyanov <xemul@parallels.com>
+
+This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks,
+one for each mapping with a file; the name of a symlink is "vma->vm_start-vma->vm_end",
+the target is the file. Opening a symlink results in a file that points exactly
+to the same inode as the vma's one.
+
+For example the ls -l of some arbitrary /proc/<pid>/map_files/
+
+ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
+ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
+ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
+ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
+ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
+
+This *helps* checkpointing process in three ways:
+
+1. When dumping a task mappings we do know exact file that is mapped by particular
+ region. We do this by opening /proc/$pid/map_files/$address symlink the way we do
+ with file descriptors.
+
+2. This also helps in determining which anonymous shared mappings are shared with
+ each other by comparing the inodes of them.
+
+3. When restoring a set of processes, in case two of them share a mapping, we map
+ the memory by the 1st one and then open its /proc/$pid/map_files/$address file and
+ map it by the 2nd task.
+
+Using /proc/$pid/maps for this is quite inconvenient since it requires repeated
+re-reading and re-parsing of this text file, which slows down the restore procedure
+significantly. Also, as pointed out in (3), it is much easier to use a top-level
+shared mapping in children as /proc/$pid/map_files/$address when needed.
+
+v2: (spotted by Tejun Heo)
+ - /proc/<pid>/mfd changed to /proc/<pid>/map_files
+ - find_vma helper is used instead of linear search
+ - routines are re-grouped
+ - d_revalidate is set now
+
+v3:
+ - d_revalidate reworked, now it should drop no longer valid dentries (Tejun Heo)
+ - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
+ - because of filldir (which eventually might need to lock mmap_sem)
+ the proc_map_files_readdir() was reworked to call proc_fill_cache()
+ with unlocked mmap_sem
+
+v4: (feedback by Tejun Heo and Vasiliy Kulikov)
+ - instead of saving data in proc_inode we rather make a dentry name
+ to keep both vm_start and vm_end accordingly
+ - d_revalidate now honor task credentials
+
+v5: (feedback by Kirill A. Shutemov)
+ - don't forget to release mmap_sem on error path
+
+v6:
+ - sizeof get used in map_files_info which shrink member a bit on
+ x86-32 (by Kirill A. Shutemov)
+ - map_name_to_addr returns -EINVAL instead of -1
+ which is more appropriate (by Tejun Heo)
+
+v7:
+ - add [get/set]attr handlers for
+ proc_map_files_inode_operations (by Vasiliy Kulikov)
+
+v8:
+ - Kirill A. Shutemov spotted a parasite semicolon
+ which ruined the ptrace_check call, fixed.
+
+v9: (feedback by Andrew Morton)
+ - find_exact_vma moved into include/linux/mm.h as an inline helper
+ - proc_map_files_setattr uses either kmalloc or vmalloc depending
+ on how many objects are to be allocated
+ - no more map_name_to_addr but dname_to_vma_addr introduced instead
+ and it uses sscanf because in one case the find_exact_vma() is used
+ only to confirm existence of vma area the boolean flag is used
+ - fancy justification dropped
+ - still the proc_map_files_get/setattr are left untouched
+ until additional fd/ patches are applied first.
+
+v10: (feedback by Andrew Morton)
+ - flex_arrays are used instead of kmalloc/vmalloc calls
+ - map_files_d_revalidate use ptrace_may_access for
+ security reason (by Vasiliy Kulikov)
+
+v11:
+ - should use fput and drop !ret test from a loop code
+ (feedback by Andrew Morton)
+ - no need for 'used' variable, use existing
+ nr_files with file->pos predicate
+ - if preallocation fails no need to go further,
+ simply release mmap semaphore and jump out
+
+v12:
+ - rework map_files_d_revalidate to make sure
+ the task get released on return (by Vasiliy Kulikov)
+
+v13:
+ - proc_map_files_inode_operations are set to be the same
+ as proc_fd_inode_operations, ie to include .permission
+ pointing to proc_fd_permission
+
+v14: (by Vasiliy Kulikov)
+ - for security reason map_files/ entries are allowed for
+ readers with CAP_SYS_ADMIN credentials granted only
+
+Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
+Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
+Reviewed-by: Vasiliy Kulikov <segoon@openwall.com>
+CC: Tejun Heo <tj@kernel.org>
+CC: Vasiliy Kulikov <segoon@openwall.com>
+CC: "Kirill A. Shutemov" <kirill@shutemov.name>
+CC: Alexey Dobriyan <adobriyan@gmail.com>
+CC: Al Viro <viro@ZenIV.linux.org.uk>
+CC: Andrew Morton <akpm@linux-foundation.org>
+CC: Pavel Machek <pavel@ucw.cz>
+---
+ fs/proc/base.c | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/mm.h | 12 +
+ 2 files changed, 357 insertions(+)
+
+Index: linux-2.6.git/fs/proc/base.c
+===================================================================
+--- linux-2.6.git.orig/fs/proc/base.c
++++ linux-2.6.git/fs/proc/base.c
+@@ -83,6 +83,7 @@
+ #include <linux/pid_namespace.h>
+ #include <linux/fs_struct.h>
+ #include <linux/slab.h>
++#include <linux/flex_array.h>
+ #ifdef CONFIG_HARDWALL
+ #include <asm/hardwall.h>
+ #endif
+@@ -133,6 +134,8 @@ struct pid_entry {
+ NULL, &proc_single_file_operations, \
+ { .proc_show = show } )
+
++static int proc_fd_permission(struct inode *inode, int mask);
++
+ /*
+ * Count the number of hardlinks for the pid_entry table, excluding the .
+ * and .. links.
+@@ -2201,6 +2204,347 @@ static const struct file_operations proc
+ };
+
+ /*
++ * dname_to_vma_addr - maps a dentry name into two unsigned longs
++ * which represent vma start and end addresses.
++ */
++static int dname_to_vma_addr(struct dentry *dentry,
++			     unsigned long *start, unsigned long *end)
++{
++	/* Both hex fields must be converted for the name to be valid */
++	const int nr_parsed = sscanf(dentry->d_name.name, "%lx-%lx", start, end);
++
++	return nr_parsed == 2 ? 0 : -EINVAL;
++}
++
++/*
++ * Revalidate a map_files/ dentry: it stays valid only while the task
++ * still has a VMA whose start and end match the dentry name exactly.
++ *
++ * Returns 1 when the dentry is still valid (inode ownership is
++ * refreshed on the way), 0 when it is stale, -EACCES when the caller
++ * lacks CAP_SYS_ADMIN and -ECHILD when called in RCU-walk mode.
++ * Whenever the result is <= 0 (except for -ECHILD, which returns
++ * early) the dentry is dropped.
++ */
++static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
++{
++ unsigned long vm_start, vm_end;
++ bool exact_vma_exists = false;
++ struct mm_struct *mm = NULL;
++ struct task_struct *task;
++ const struct cred *cred;
++ struct inode *inode;
++ int status = 0;
++
++ /* Returns before anything below runs when in RCU-walk mode */
++ if (nd && nd->flags & LOOKUP_RCU)
++ return -ECHILD;
++
++ if (!capable(CAP_SYS_ADMIN)) {
++ status = -EACCES;
++ goto out_notask;
++ }
++
++ inode = dentry->d_inode;
++ task = get_proc_task(inode);
++ if (!task)
++ goto out_notask;
++
++ if (!ptrace_may_access(task, PTRACE_MODE_READ))
++ goto out;
++
++ mm = get_task_mm(task);
++ if (!mm)
++ goto out;
++
++ /* A name that does not parse leaves exact_vma_exists false */
++ if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
++ down_read(&mm->mmap_sem);
++ exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
++ up_read(&mm->mmap_sem);
++ }
++
++ mmput(mm);
++
++ if (exact_vma_exists) {
++ /* Dumpable tasks expose their effective ids, others get 0/0 */
++ if (task_dumpable(task)) {
++ rcu_read_lock();
++ cred = __task_cred(task);
++ inode->i_uid = cred->euid;
++ inode->i_gid = cred->egid;
++ rcu_read_unlock();
++ } else {
++ inode->i_uid = 0;
++ inode->i_gid = 0;
++ }
++ security_task_to_inode(task, inode);
++ status = 1;
++ }
++
++out:
++ put_task_struct(task);
++
++out_notask:
++ /* Stale or inaccessible entries are unhashed */
++ if (status <= 0)
++ d_drop(dentry);
++
++ return status;
++}
++
++static const struct dentry_operations tid_map_files_dentry_operations = {
++ .d_revalidate = map_files_d_revalidate,
++ .d_delete = pid_delete_dentry,
++};
++
++/*
++ * Resolve a map_files/ symlink to the path of the file backing the VMA
++ * named by the dentry.
++ *
++ * On success stores the path in @path with a reference taken via
++ * path_get() and returns 0. Returns -ENOENT when the task or its mm is
++ * gone or no exactly matching file-backed VMA exists, and the error
++ * from dname_to_vma_addr() when the name cannot be parsed.
++ */
++static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
++{
++ unsigned long vm_start, vm_end;
++ struct vm_area_struct *vma;
++ struct task_struct *task;
++ struct mm_struct *mm;
++ int rc;
++
++ rc = -ENOENT;
++ task = get_proc_task(dentry->d_inode);
++ if (!task)
++ goto out;
++
++ /* The task reference is dropped right after the mm is obtained */
++ mm = get_task_mm(task);
++ put_task_struct(task);
++ if (!mm)
++ goto out;
++
++ rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
++ if (rc)
++ goto out_mmput;
++
++ /* rc is 0 here; it is returned as such even if no VMA matches */
++ down_read(&mm->mmap_sem);
++ vma = find_exact_vma(mm, vm_start, vm_end);
++ if (vma && vma->vm_file) {
++ *path = vma->vm_file->f_path;
++ path_get(path);
++ rc = 0;
++ }
++ up_read(&mm->mmap_sem);
++
++out_mmput:
++ mmput(mm);
++out:
++ return rc;
++}
++
++struct map_files_info {
++ struct file *file;
++ unsigned long len;
++ unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
++};
++
++/*
++ * Create the symlink inode for one map_files/ entry and attach it to
++ * @dentry. @ptr is the struct file backing the mapping; its open mode
++ * decides the owner read/write bits of the link.
++ *
++ * Returns NULL on success, or ERR_PTR(-ENOENT) when @ptr is NULL or
++ * the inode cannot be created.
++ */
++static struct dentry *
++proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
++ struct task_struct *task, const void *ptr)
++{
++ const struct file *file = ptr;
++ struct proc_inode *ei;
++ struct inode *inode;
++
++ if (!file)
++ return ERR_PTR(-ENOENT);
++
++ inode = proc_pid_make_inode(dir->i_sb, task);
++ if (!inode)
++ return ERR_PTR(-ENOENT);
++
++ ei = PROC_I(inode);
++ ei->op.proc_get_link = proc_map_files_get_link;
++
++ inode->i_op = &proc_pid_link_inode_operations;
++ inode->i_size = 64;
++ inode->i_mode = S_IFLNK;
++
++ /* Mirror the access mode of the mapped file on the link */
++ if (file->f_mode & FMODE_READ)
++ inode->i_mode |= S_IRUSR;
++ if (file->f_mode & FMODE_WRITE)
++ inode->i_mode |= S_IWUSR;
++
++ d_set_d_op(dentry, &tid_map_files_dentry_operations);
++ d_add(dentry, inode);
++
++ return NULL;
++}
++
++static struct dentry *proc_map_files_lookup(struct inode *dir,
++ struct dentry *dentry, struct nameidata *nd)
++{
++ unsigned long vm_start, vm_end;
++ struct vm_area_struct *vma;
++ struct task_struct *task;
++ struct dentry *result;
++ struct mm_struct *mm;
++
++ result = ERR_PTR(-EACCES);
++ if (!capable(CAP_SYS_ADMIN))
++ goto out;
++
++ result = ERR_PTR(-ENOENT);
++ task = get_proc_task(dir);
++ if (!task)
++ goto out;
++
++ result = ERR_PTR(-EACCES);
++ if (lock_trace(task))
++ goto out_put_task;
++
++ result = ERR_PTR(-ENOENT);
++ if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
++ goto out_unlock;
++
++ mm = get_task_mm(task);
++ if (!mm)
++ goto out_unlock;
++
++ down_read(&mm->mmap_sem);
++ vma = find_exact_vma(mm, vm_start, vm_end);
++ if (!vma)
++ goto out_no_vma;
++
++ result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
++
++out_no_vma:
++ up_read(&mm->mmap_sem);
++ mmput(mm);
++out_unlock:
++ unlock_trace(task);
++out_put_task:
++ put_task_struct(task);
++out:
++ return result;
++}
++
++static const struct inode_operations proc_map_files_inode_operations = {
++ .lookup = proc_map_files_lookup,
++ .permission = proc_fd_permission,
++ .setattr = proc_setattr,
++};
++
++static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
++{
++ struct dentry *dentry = filp->f_path.dentry;
++ struct inode *inode = dentry->d_inode;
++ struct vm_area_struct *vma;
++ struct task_struct *task;
++ struct mm_struct *mm;
++ ino_t ino;
++ int ret;
++
++ ret = -EACCES;
++ if (!capable(CAP_SYS_ADMIN))
++ goto out;
++
++ ret = -ENOENT;
++ task = get_proc_task(inode);
++ if (!task)
++ goto out;
++
++ ret = -EACCES;
++ if (lock_trace(task))
++ goto out_put_task;
++
++ ret = 0;
++ switch (filp->f_pos) {
++ case 0:
++ ino = inode->i_ino;
++ if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
++ goto out_unlock;
++ filp->f_pos++;
++ case 1:
++ ino = parent_ino(dentry);
++ if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
++ goto out_unlock;
++ filp->f_pos++;
++ default:
++ {
++ unsigned long nr_files, pos, i;
++ struct flex_array *fa = NULL;
++ struct map_files_info info;
++ struct map_files_info *p;
++
++ mm = get_task_mm(task);
++ if (!mm)
++ goto out_unlock;
++ down_read(&mm->mmap_sem);
++
++ nr_files = 0;
++
++ /*
++ * We need two passes here:
++ *
++ * 1) Collect vmas of mapped files with mmap_sem taken
++ * 2) Release mmap_sem and instantiate entries
++ *
++ * otherwise we get lockdep complained, since filldir()
++ * routine might require mmap_sem taken in might_fault().
++ */
++
++ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
++ if (vma->vm_file && ++pos > filp->f_pos)
++ nr_files++;
++ }
++
++ if (nr_files) {
++ fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
++ if (!fa || flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
++ ret = -ENOMEM;
++ if (fa)
++ flex_array_free(fa);
++ up_read(&mm->mmap_sem);
++ mmput(mm);
++ goto out_unlock;
++ }
++ for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
++ if (!vma->vm_file)
++ continue;
++ if (++pos <= filp->f_pos)
++ continue;
++
++ get_file(vma->vm_file);
++ info.file = vma->vm_file;
++ info.len = snprintf(info.name, sizeof(info.name),
++ "%lx-%lx", vma->vm_start,
++ vma->vm_end);
++ if (flex_array_put(fa, i++, &info, GFP_KERNEL))
++ BUG();
++ }
++ }
++ up_read(&mm->mmap_sem);
++
++ for (i = 0; i < nr_files; i++) {
++ p = flex_array_get(fa, i);
++ ret = proc_fill_cache(filp, dirent, filldir,
++ p->name, p->len,
++ proc_map_files_instantiate,
++ task, p->file);
++ if (ret)
++ break;
++ filp->f_pos++;
++ fput(p->file);
++ }
++ for (; i < nr_files; i++) {
++ /*
++ * In case of error don't forget
++ * to put rest of file refs.
++ */
++ p = flex_array_get(fa, i);
++ fput(p->file);
++ }
++ if (fa)
++ flex_array_free(fa);
++ mmput(mm);
++ }
++ }
++
++out_unlock:
++ unlock_trace(task);
++out_put_task:
++ put_task_struct(task);
++out:
++ return ret;
++}
++
++static const struct file_operations proc_map_files_operations = {
++ .read = generic_read_dir,
++ .readdir = proc_map_files_readdir,
++ .llseek = default_llseek,
++};
++
++/*
+ * /proc/pid/fd needs a special permission handler so that a process can still
+ * access /proc/self/fd after it has executed a setuid().
+ */
+@@ -2815,6 +3159,7 @@ static const struct inode_operations pro
+ static const struct pid_entry tgid_base_stuff[] = {
+ DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
+ DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
++ DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+ DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
+ #ifdef CONFIG_NET
+Index: linux-2.6.git/include/linux/mm.h
+===================================================================
+--- linux-2.6.git.orig/include/linux/mm.h
++++ linux-2.6.git/include/linux/mm.h
+@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
+ return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ }
+
++/* Look up the first VMA which exactly matches the interval vm_start ... vm_end */
++static inline struct vm_area_struct *
++find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
++{
++ struct vm_area_struct *vma = find_vma(mm, vm_start);
++
++ if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
++ vma = NULL;
++
++ return vma;
++}
++
+ #ifdef CONFIG_MMU
+ pgprot_t vm_get_page_prot(unsigned long vm_flags);
+ #else
diff --git a/kernel/cr-statfs-callback-for-pipefs b/kernel/cr-statfs-callback-for-pipefs
new file mode 100644
index 000000000..6fae692af
--- /dev/null
+++ b/kernel/cr-statfs-callback-for-pipefs
@@ -0,0 +1,27 @@
+vfs: Add ->statfs callback for pipefs
+
+From: Pavel Emelyanov <xemul@parallels.com>
+
+This is done to make it possible to distinguish pipes
+from fifos when opening one via /proc/<pid>/fd/ link.
+
+Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
+Reviewed-by: Tejun Heo <tj@kernel.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
+---
+ fs/pipe.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+Index: linux-2.6.git/fs/pipe.c
+===================================================================
+--- linux-2.6.git.orig/fs/pipe.c
++++ linux-2.6.git/fs/pipe.c
+@@ -1254,6 +1254,7 @@ out:
+
+ static const struct super_operations pipefs_ops = {
+ .destroy_inode = free_inode_nonrcu,
++ .statfs = simple_statfs,
+ };
+
+ /*
diff --git a/kernel/fs-add-do-close b/kernel/fs-add-do-close
new file mode 100644
index 000000000..d19ea6ae0
--- /dev/null
+++ b/kernel/fs-add-do-close
@@ -0,0 +1,86 @@
+fs: Add do_close helper
+
+To be able to close file descriptors directly from
+kernel space, a do_close() helper is added. We need it at
+checkpoint/restore time.
+
+Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
+Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
+---
+ fs/open.c | 32 ++++++++++++++++++++------------
+ include/linux/fs.h | 1 +
+ 2 files changed, 21 insertions(+), 12 deletions(-)
+
+Index: linux-2.6.git/fs/open.c
+===================================================================
+--- linux-2.6.git.orig/fs/open.c
++++ linux-2.6.git/fs/open.c
+@@ -1056,17 +1056,11 @@ int filp_close(struct file *filp, fl_own
+
+ EXPORT_SYMBOL(filp_close);
+
+-/*
+- * Careful here! We test whether the file pointer is NULL before
+- * releasing the fd. This ensures that one clone task can't release
+- * an fd while another clone is opening it.
+- */
+-SYSCALL_DEFINE1(close, unsigned int, fd)
++int do_close(unsigned int fd)
+ {
+ struct file * filp;
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+- int retval;
+
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+@@ -1079,7 +1073,25 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
+ FD_CLR(fd, fdt->close_on_exec);
+ __put_unused_fd(files, fd);
+ spin_unlock(&files->file_lock);
+- retval = filp_close(filp, files);
++
++ return filp_close(filp, files);
++
++out_unlock:
++ spin_unlock(&files->file_lock);
++ return -EBADF;
++}
++EXPORT_SYMBOL_GPL(do_close);
++
++/*
++ * Careful here! We test whether the file pointer is NULL before
++ * releasing the fd. This ensures that one clone task can't release
++ * an fd while another clone is opening it.
++ */
++SYSCALL_DEFINE1(close, unsigned int, fd)
++{
++ int retval;
++
++ retval = do_close(fd);
+
+ /* can't restart close syscall because file table entry was cleared */
+ if (unlikely(retval == -ERESTARTSYS ||
+@@ -1089,10 +1101,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
+ retval = -EINTR;
+
+ return retval;
+-
+-out_unlock:
+- spin_unlock(&files->file_lock);
+- return -EBADF;
+ }
+ EXPORT_SYMBOL(sys_close);
+
+Index: linux-2.6.git/include/linux/fs.h
+===================================================================
+--- linux-2.6.git.orig/include/linux/fs.h
++++ linux-2.6.git/include/linux/fs.h
+@@ -2027,6 +2027,7 @@ extern struct file *file_open_root(struc
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
+ const struct cred *);
+ extern int filp_close(struct file *, fl_owner_t id);
++extern int do_close(unsigned int fd);
+ extern char * getname(const char __user *);
+
+ /* fs/ioctl.c */
diff --git a/kernel/fs-proc-add-tls b/kernel/fs-proc-add-tls
new file mode 100644
index 000000000..eb0d9f620
--- /dev/null
+++ b/kernel/fs-proc-add-tls
@@ -0,0 +1,45 @@
+fs, proc: Add /proc/$pid/tls entry
+
+To be able to restart checkpointed tasks we need
+to know the TLS status at dump time. Export this
+information via the /proc/$pid/tls entry.
+
+Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
+---
+ fs/proc/base.c | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+Index: linux-2.6.git/fs/proc/base.c
+===================================================================
+--- linux-2.6.git.orig/fs/proc/base.c
++++ linux-2.6.git/fs/proc/base.c
+@@ -3150,6 +3150,21 @@ static int proc_pid_personality(struct s
+ return err;
+ }
+
++static int proc_pid_tls(struct seq_file *m, struct pid_namespace *ns,
++ struct pid *pid, struct task_struct *task)
++{
++ int err = lock_trace(task);
++ if (!err) {
++ int i;
++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
++ seq_printf(m, "%x %x\n",
++ task->thread.tls_array[i].a,
++ task->thread.tls_array[i].b);
++ unlock_trace(task);
++ }
++ return err;
++}
++
+ /*
+ * Thread groups
+ */
+@@ -3169,6 +3184,7 @@ static const struct pid_entry tgid_base_
+ INF("auxv", S_IRUSR, proc_pid_auxv),
+ ONE("status", S_IRUGO, proc_pid_status),
+ ONE("personality", S_IRUGO, proc_pid_personality),
++ ONE("tls", S_IRUGO, proc_pid_tls),
+ INF("limits", S_IRUGO, proc_pid_limits),
+ #ifdef CONFIG_SCHED_DEBUG
+ REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
diff --git a/kernel/fs-proc-switch-to-dentry b/kernel/fs-proc-switch-to-dentry
new file mode 100644
index 000000000..4f29d286e
--- /dev/null
+++ b/kernel/fs-proc-switch-to-dentry
@@ -0,0 +1,108 @@
+fs, proc: Make proc_get_link to use dentry instead of inode
+
+This patch prepares the ground for the next "map_files"
+patch which needs a name of a link file to analyse.
+
+So instead of squashing this change into one big
+patch, it is done as a separate one.
+
+Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
+CC: Pavel Emelyanov <xemul@parallels.com>
+CC: Tejun Heo <tj@kernel.org>
+CC: Vasiliy Kulikov <segoon@openwall.com>
+CC: "Kirill A. Shutemov" <kirill@shutemov.name>
+CC: Alexey Dobriyan <adobriyan@gmail.com>
+CC: Al Viro <viro@ZenIV.linux.org.uk>
+CC: Andrew Morton <akpm@linux-foundation.org>
+---
+ fs/proc/base.c | 20 ++++++++++----------
+ include/linux/proc_fs.h | 2 +-
+ 2 files changed, 11 insertions(+), 11 deletions(-)
+
+Index: linux-2.6.git/fs/proc/base.c
+===================================================================
+--- linux-2.6.git.orig/fs/proc/base.c
++++ linux-2.6.git/fs/proc/base.c
+@@ -165,9 +165,9 @@ static int get_task_root(struct task_str
+ return result;
+ }
+
+-static int proc_cwd_link(struct inode *inode, struct path *path)
++static int proc_cwd_link(struct dentry *dentry, struct path *path)
+ {
+- struct task_struct *task = get_proc_task(inode);
++ struct task_struct *task = get_proc_task(dentry->d_inode);
+ int result = -ENOENT;
+
+ if (task) {
+@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *i
+ return result;
+ }
+
+-static int proc_root_link(struct inode *inode, struct path *path)
++static int proc_root_link(struct dentry *dentry, struct path *path)
+ {
+- struct task_struct *task = get_proc_task(inode);
++ struct task_struct *task = get_proc_task(dentry->d_inode);
+ int result = -ENOENT;
+
+ if (task) {
+@@ -1580,13 +1580,13 @@ static const struct file_operations proc
+ .release = single_release,
+ };
+
+-static int proc_exe_link(struct inode *inode, struct path *exe_path)
++static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
+ {
+ struct task_struct *task;
+ struct mm_struct *mm;
+ struct file *exe_file;
+
+- task = get_proc_task(inode);
++ task = get_proc_task(dentry->d_inode);
+ if (!task)
+ return -ENOENT;
+ mm = get_task_mm(task);
+@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
+ if (!proc_fd_access_allowed(inode))
+ goto out;
+
+- error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
++ error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
+ out:
+ return ERR_PTR(error);
+ }
+@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
+ if (!proc_fd_access_allowed(inode))
+ goto out;
+
+- error = PROC_I(inode)->op.proc_get_link(inode, &path);
++ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
+ if (error)
+ goto out;
+
+@@ -1959,9 +1959,9 @@ out_task:
+ return rc;
+ }
+
+-static int proc_fd_link(struct inode *inode, struct path *path)
++static int proc_fd_link(struct dentry *dentry, struct path *path)
+ {
+- return proc_fd_info(inode, path, NULL);
++ return proc_fd_info(dentry->d_inode, path, NULL);
+ }
+
+ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
+Index: linux-2.6.git/include/linux/proc_fs.h
+===================================================================
+--- linux-2.6.git.orig/include/linux/proc_fs.h
++++ linux-2.6.git/include/linux/proc_fs.h
+@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
+ extern const struct proc_ns_operations ipcns_operations;
+
+ union proc_op {
+- int (*proc_get_link)(struct inode *, struct path *);
++ int (*proc_get_link)(struct dentry *, struct path *);
+ int (*proc_read)(struct task_struct *task, char *page);
+ int (*proc_show)(struct seq_file *m,
+ struct pid_namespace *ns, struct pid *pid,
diff --git a/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch b/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch
new file mode 100644
index 000000000..70d259330
--- /dev/null
+++ b/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch
@@ -0,0 +1,28 @@
+From: Vasiliy Kulikov <segooon@gmail.com>
+
+In the patch "proc: fix races against execve() of /proc/PID/fd**"
+proc_pid_fd_link_getattr() leaked task_struct if ptrace check fails.
+
+Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
+Reported-by: Cyrill Gorcunov <gorcunov@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ fs/proc/base.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff -puN fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix fs/proc/base.c
+--- a/fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix
++++ a/fs/proc/base.c
+@@ -1681,9 +1681,9 @@ static int proc_pid_fd_link_getattr(stru
+
+ generic_fillattr(inode, stat);
+ unlock_trace(task);
+- put_task_struct(task);
+ rc = 0;
+ out_task:
++ put_task_struct(task);
+ return rc;
+ }
+
+_
diff --git a/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch b/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch
new file mode 100644
index 000000000..8c2a4a18a
--- /dev/null
+++ b/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch
@@ -0,0 +1,255 @@
+From: Vasiliy Kulikov <segoon@openwall.com>
+
+fd* files are restricted to the task's owner, and other users may not get
+direct access to them. But one may open any of these files and run any
+setuid program, keeping opened file descriptors. As there are permission
+checks on open(), but not on readdir() and read(), operations on the kept
+file descriptors will not be checked. It makes it possible to violate
+procfs permission model.
+
+Reading fdinfo/* may disclose current fds' position and flags, reading
+directory contents of fdinfo/ and fd/ may disclose the number of opened
+files by the target task. This information is not sensitive per se, but it
+can reveal some private information (like length of a password stored in a
+file) under certain conditions.
+
+Used existing (un)lock_trace functions to check for ptrace_may_access(),
+but instead of using EPERM return code from it use EACCES to be consistent
+with existing proc_pid_follow_link()/proc_pid_readlink() return code. If
+they differ, attacker can guess what fds exist by analyzing stat() return
+code. Patched handlers: stat() for fd/*, stat() and read() for fdinfo/*,
+readdir() and lookup() for fd/ and fdinfo/.
+
+Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
+Cc: Cyrill Gorcunov <gorcunov@gmail.com>
+Cc: <stable@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ fs/proc/base.c | 146 +++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 103 insertions(+), 43 deletions(-)
+
+diff -puN fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd fs/proc/base.c
+--- a/fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd
++++ a/fs/proc/base.c
+@@ -1652,12 +1652,46 @@ out:
+ return error;
+ }
+
++static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
++ struct kstat *stat)
++{
++ struct inode *inode = dentry->d_inode;
++ struct task_struct *task = get_proc_task(inode);
++ int rc;
++
++ if (task == NULL)
++ return -ESRCH;
++
++ rc = -EACCES;
++ if (lock_trace(task))
++ goto out_task;
++
++ generic_fillattr(inode, stat);
++ unlock_trace(task);
++ put_task_struct(task);
++ rc = 0;
++out_task:
++ return rc;
++}
++
+ static const struct inode_operations proc_pid_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_pid_follow_link,
+ .setattr = proc_setattr,
+ };
+
++static const struct inode_operations proc_fdinfo_link_inode_operations = {
++ .setattr = proc_setattr,
++ .getattr = proc_pid_fd_link_getattr,
++};
++
++static const struct inode_operations proc_fd_link_inode_operations = {
++ .readlink = proc_pid_readlink,
++ .follow_link = proc_pid_follow_link,
++ .setattr = proc_setattr,
++ .getattr = proc_pid_fd_link_getattr,
++};
++
+
+ /* building an inode */
+
+@@ -1889,49 +1923,61 @@ out:
+
+ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+ {
+- struct task_struct *task = get_proc_task(inode);
+- struct files_struct *files = NULL;
++ struct task_struct *task;
++ struct files_struct *files;
+ struct file *file;
+ int fd = proc_fd(inode);
++ int rc;
+
+- if (task) {
+- files = get_files_struct(task);
+- put_task_struct(task);
+- }
+- if (files) {
+- /*
+- * We are not taking a ref to the file structure, so we must
+- * hold ->file_lock.
+- */
+- spin_lock(&files->file_lock);
+- file = fcheck_files(files, fd);
+- if (file) {
+- unsigned int f_flags;
+- struct fdtable *fdt;
+-
+- fdt = files_fdtable(files);
+- f_flags = file->f_flags & ~O_CLOEXEC;
+- if (FD_ISSET(fd, fdt->close_on_exec))
+- f_flags |= O_CLOEXEC;
+-
+- if (path) {
+- *path = file->f_path;
+- path_get(&file->f_path);
+- }
+- if (info)
+- snprintf(info, PROC_FDINFO_MAX,
+- "pos:\t%lli\n"
+- "flags:\t0%o\n",
+- (long long) file->f_pos,
+- f_flags);
+- spin_unlock(&files->file_lock);
+- put_files_struct(files);
+- return 0;
++ task = get_proc_task(inode);
++ if (!task)
++ return -ENOENT;
++
++ rc = -EACCES;
++ if (lock_trace(task))
++ goto out_task;
++
++ rc = -ENOENT;
++ files = get_files_struct(task);
++ if (files == NULL)
++ goto out_unlock;
++
++ /*
++ * We are not taking a ref to the file structure, so we must
++ * hold ->file_lock.
++ */
++ spin_lock(&files->file_lock);
++ file = fcheck_files(files, fd);
++ if (file) {
++ unsigned int f_flags;
++ struct fdtable *fdt;
++
++ fdt = files_fdtable(files);
++ f_flags = file->f_flags & ~O_CLOEXEC;
++ if (FD_ISSET(fd, fdt->close_on_exec))
++ f_flags |= O_CLOEXEC;
++
++ if (path) {
++ *path = file->f_path;
++ path_get(&file->f_path);
+ }
+- spin_unlock(&files->file_lock);
+- put_files_struct(files);
+- }
+- return -ENOENT;
++ if (info)
++ snprintf(info, PROC_FDINFO_MAX,
++ "pos:\t%lli\n"
++ "flags:\t0%o\n",
++ (long long) file->f_pos,
++ f_flags);
++ rc = 0;
++ } else
++ rc = -ENOENT;
++ spin_unlock(&files->file_lock);
++ put_files_struct(files);
++
++out_unlock:
++ unlock_trace(task);
++out_task:
++ put_task_struct(task);
++ return rc;
+ }
+
+ static int proc_fd_link(struct inode *inode, struct path *path)
+@@ -2026,7 +2072,7 @@ static struct dentry *proc_fd_instantiat
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+- inode->i_op = &proc_pid_link_inode_operations;
++ inode->i_op = &proc_fd_link_inode_operations;
+ inode->i_size = 64;
+ ei->op.proc_get_link = proc_fd_link;
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+@@ -2058,7 +2104,12 @@ static struct dentry *proc_lookupfd_comm
+ if (fd == ~0U)
+ goto out;
+
++ result = ERR_PTR(-EACCES);
++ if (lock_trace(task))
++ goto out;
++
+ result = instantiate(dir, dentry, task, &fd);
++ unlock_trace(task);
+ out:
+ put_task_struct(task);
+ out_no_task:
+@@ -2078,23 +2129,28 @@ static int proc_readfd_common(struct fil
+ retval = -ENOENT;
+ if (!p)
+ goto out_no_task;
++
++ retval = -EACCES;
++ if (lock_trace(p))
++ goto out;
++
+ retval = 0;
+
+ fd = filp->f_pos;
+ switch (fd) {
+ case 0:
+ if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+- goto out;
++ goto out_unlock;
+ filp->f_pos++;
+ case 1:
+ ino = parent_ino(dentry);
+ if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+- goto out;
++ goto out_unlock;
+ filp->f_pos++;
+ default:
+ files = get_files_struct(p);
+ if (!files)
+- goto out;
++ goto out_unlock;
+ rcu_read_lock();
+ for (fd = filp->f_pos-2;
+ fd < files_fdtable(files)->max_fds;
+@@ -2118,6 +2174,9 @@ static int proc_readfd_common(struct fil
+ rcu_read_unlock();
+ put_files_struct(files);
+ }
++
++out_unlock:
++ unlock_trace(p);
+ out:
+ put_task_struct(p);
+ out_no_task:
+@@ -2195,6 +2254,7 @@ static struct dentry *proc_fdinfo_instan
+ ei->fd = fd;
+ inode->i_mode = S_IFREG | S_IRUSR;
+ inode->i_fop = &proc_fdinfo_file_operations;
++ inode->i_op = &proc_fdinfo_link_inode_operations;
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+ d_add(dentry, inode);
+ /* Close the race of the process dying before we return the dentry */
+_
diff --git a/kernel/proc-force-dcache-drop-on-unauthorized-access.patch b/kernel/proc-force-dcache-drop-on-unauthorized-access.patch
new file mode 100644
index 000000000..bfe6bf1a8
--- /dev/null
+++ b/kernel/proc-force-dcache-drop-on-unauthorized-access.patch
@@ -0,0 +1,118 @@
+From: Vasiliy Kulikov <segoon@openwall.com>
+
+The patch "proc: fix races against execve() of /proc/PID/fd**" is still a
+partial fix for a setxid problem. link(2) is yet another way to
+identify whether a specific fd is opened by a privileged process. By
+calling link(2) against /proc/PID/fd/* an attacker may identify whether
+the fd number is valid for PID by analysing link(2) return code.
+
+Both getattr() and link() can be used by the attacker iff the dentry is
+present in the dcache. In this case ->lookup() is not called and the only
+way to check ptrace permissions is either operation handler or
+->revalidate(). The easiest solution to prevent any unauthorized access
+to /proc/PID/fd*/ files is to force the dentry drop on each unauthorized
+access attempt.
+
+If an attacker keeps opened fd of /proc/PID/fd/ and dcache contains a
+specific dentry for some /proc/PID/fd/XXX, any future attempt to use the
+dentry by the attacker would lead to the dentry drop as a result of a
+failed ptrace check in ->revalidate(). Then the attacker cannot spawn a
+dentry for the specific fd number because of ptrace check in ->lookup().
+
+The dentry drop can be still observed by an attacker by analysing
+information from /proc/slabinfo, which is addressed in the successive
+patch.
+
+Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
+Cc: Cyrill Gorcunov <gorcunov@gmail.com>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Christoph Lameter <cl@linux-foundation.org>
+Cc: Pekka Enberg <penberg@kernel.org>
+Cc: Matt Mackall <mpm@selenic.com>
+Cc: Alexey Dobriyan <adobriyan@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ fs/proc/base.c | 42 ++++++------------------------------------
+ 1 file changed, 6 insertions(+), 36 deletions(-)
+
+Index: linux-2.6.git/fs/proc/base.c
+===================================================================
+--- linux-2.6.git.orig/fs/proc/base.c
++++ linux-2.6.git/fs/proc/base.c
+@@ -1665,46 +1665,12 @@ out:
+ return error;
+ }
+
+-static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
+- struct kstat *stat)
+-{
+- struct inode *inode = dentry->d_inode;
+- struct task_struct *task = get_proc_task(inode);
+- int rc;
+-
+- if (task == NULL)
+- return -ESRCH;
+-
+- rc = -EACCES;
+- if (lock_trace(task))
+- goto out_task;
+-
+- generic_fillattr(inode, stat);
+- unlock_trace(task);
+- rc = 0;
+-out_task:
+- put_task_struct(task);
+- return rc;
+-}
+-
+ static const struct inode_operations proc_pid_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_pid_follow_link,
+ .setattr = proc_setattr,
+ };
+
+-static const struct inode_operations proc_fdinfo_link_inode_operations = {
+- .setattr = proc_setattr,
+- .getattr = proc_pid_fd_link_getattr,
+-};
+-
+-static const struct inode_operations proc_fd_link_inode_operations = {
+- .readlink = proc_pid_readlink,
+- .follow_link = proc_pid_follow_link,
+- .setattr = proc_setattr,
+- .getattr = proc_pid_fd_link_getattr,
+-};
+-
+
+ /* building an inode */
+
+@@ -2013,6 +1979,11 @@ static int tid_fd_revalidate(struct dent
+ task = get_proc_task(inode);
+ fd = proc_fd(inode);
+
++ if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
++ put_task_struct(task);
++ task = NULL;
++ }
++
+ if (task) {
+ files = get_files_struct(task);
+ if (files) {
+@@ -2085,7 +2056,7 @@ static struct dentry *proc_fd_instantiat
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+- inode->i_op = &proc_fd_link_inode_operations;
++ inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_size = 64;
+ ei->op.proc_get_link = proc_fd_link;
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+@@ -2267,7 +2238,6 @@ static struct dentry *proc_fdinfo_instan
+ ei->fd = fd;
+ inode->i_mode = S_IFREG | S_IRUSR;
+ inode->i_fop = &proc_fdinfo_file_operations;
+- inode->i_op = &proc_fdinfo_link_inode_operations;
+ d_set_d_op(dentry, &tid_fd_dentry_operations);
+ d_add(dentry, inode);
+ /* Close the race of the process dying before we return the dentry */
diff --git a/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch b/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch
new file mode 100644
index 000000000..b65897617
--- /dev/null
+++ b/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch
@@ -0,0 +1,26 @@
+From: Pavel Emelyanov <xemul@openvz.org>
+
+On reading sysctl dirs we should return -EISDIR instead of -EINVAL.
+
+Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
+Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
+Cc: Alexey Dobriyan <adobriyan@gmail.com>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ fs/proc/proc_sysctl.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff -puN fs/proc/proc_sysctl.c~procfs-report-eisdir-when-reading-sysctl-dirs-in-proc fs/proc/proc_sysctl.c
+--- a/fs/proc/proc_sysctl.c~procfs-report-eisdir-when-reading-sysctl-dirs-in-proc
++++ a/fs/proc/proc_sysctl.c
+@@ -360,6 +360,7 @@ static const struct file_operations proc
+ };
+
+ static const struct file_operations proc_sys_dir_file_operations = {
++ .read = generic_read_dir,
+ .readdir = proc_sys_readdir,
+ .llseek = generic_file_llseek,
+ };
+_
diff --git a/kernel/readme b/kernel/readme
new file mode 100644
index 000000000..cfc32d32a
--- /dev/null
+++ b/kernel/readme
@@ -0,0 +1,5 @@
+The kernel patch series. See the "series" file for the order
+in which to apply the patches. Not all patches address C/R directly,
+but some of them are needed due to dependencies.
+
+Has been tested on Linux 3.1-rc3.
diff --git a/kernel/series b/kernel/series
new file mode 100644
index 000000000..a41e8c2dd
--- /dev/null
+++ b/kernel/series
@@ -0,0 +1,12 @@
+cr-proc-add-children
+procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch
+proc-fix-races-against-execve-of-proc-pid-fd.patch
+proc-fix-races-against-execve-of-proc-pid-fd-fix.patch
+proc-force-dcache-drop-on-unauthorized-access.patch
+cr-statfs-callback-for-pipefs
+cr-clone-with-pid-support
+fs-proc-switch-to-dentry
+cr-proc-map-files-21
+fs-proc-add-tls
+fs-add-do-close
+binfmt-elf-for-cr-4
diff --git a/parasite-elf.lds.S b/parasite-elf.lds.S
new file mode 100644
index 000000000..83e0b40dd
--- /dev/null
+++ b/parasite-elf.lds.S
@@ -0,0 +1,19 @@
+OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+
+SECTIONS
+{
+ . = 0;
+ .text : {
+ *(.parasite.head.text)
+ *(.text)
+ . = ALIGN(8);
+ }
+ .data : {
+ *(.data)
+ *(.rodata)
+ *(.bss)
+ *(.parasite.stack)
+ . = ALIGN(8);
+ }
+}
diff --git a/parasite-syscall.c b/parasite-syscall.c
new file mode 100644
index 000000000..3752a404d
--- /dev/null
+++ b/parasite-syscall.c
@@ -0,0 +1,514 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+
+#include "compiler.h"
+#include "syscall.h"
+#include "types.h"
+#include "util.h"
+
+#include "parasite-syscall.h"
+#include "parasite-blob.h"
+#include "parasite.h"
+
+#ifdef CONFIG_X86_64
+/*
+ * Code stub poked into the victim by syscall_seized(): the x86-64
+ * "syscall" instruction (0x0f 0x05) padded with int3 (0xcc) so the
+ * task traps right after the syscall returns.
+ */
+static const char code_syscall[] = {0x0f, 0x05, 0xcc, 0xcc,
+ 0xcc, 0xcc, 0xcc, 0xcc};
+
+/* Sizes rounded up to a multiple of sizeof(long) */
+#define code_syscall_size (round_up(sizeof(code_syscall), sizeof(long)))
+#define parasite_size (round_up(sizeof(parasite_blob), sizeof(long)))
+
+/*
+ * Tell whether the syscall stub can be run from the very
+ * beginning of @vma_area.
+ */
+static int syscall_fits_vma_area(struct vma_area *vma_area)
+{
+	unsigned long area_start = (unsigned long)vma_area->vma.start;
+	unsigned long area_end = (unsigned long)vma_area->vma.end;
+
+	return can_run_syscall(area_start, area_start, area_end);
+}
+
+/*
+ * Returns non-zero if @ip lies inside [@start, @end) with enough
+ * room left before @end for the syscall stub.
+ */
+int can_run_syscall(unsigned long ip, unsigned long start, unsigned long end)
+{
+	if (ip < start)
+		return 0;
+
+	return ip < (end - code_syscall_size);
+}
+
+/*
+ * Run mmap() inside the seized task @pid, using @regs as the
+ * register template and @regs->ip as the place to run from.
+ *
+ * Returns the address of the new mapping or NULL on any failure.
+ */
+void *mmap_seized(pid_t pid, user_regs_struct_t *regs,
+		  void *addr, size_t length, int prot,
+		  int flags, int fd, off_t offset)
+{
+	user_regs_struct_t call = *regs;
+	void *area;
+
+	call.ax		= (unsigned long)__NR_mmap;	/* mmap		*/
+	call.di		= (unsigned long)addr;		/* @addr	*/
+	call.si		= (unsigned long)length;	/* @length	*/
+	call.dx		= (unsigned long)prot;		/* @prot	*/
+	call.r10	= (unsigned long)flags;		/* @flags	*/
+	call.r8		= (unsigned long)fd;		/* @fd		*/
+	call.r9		= (unsigned long)offset;	/* @offset	*/
+
+	if (syscall_seized(pid, regs, &call, &call))
+		return NULL;
+
+	/* a negative value is an error code from the kernel space */
+	area = (void *)call.ax;
+	return (long)area < 0 ? NULL : area;
+}
+
+/*
+ * Run munmap(@addr, @length) inside the seized task @pid, using @regs
+ * as the register template.
+ *
+ * Returns the syscall_seized() error if the call could not be made,
+ * otherwise the value the task left in ax truncated to int.
+ */
+int munmap_seized(pid_t pid, user_regs_struct_t *regs,
+ void *addr, size_t length)
+{
+ user_regs_struct_t params = *regs;
+ int ret;
+
+ params.ax = (unsigned long)__NR_munmap; /* munmap */
+ params.di = (unsigned long)addr; /* @addr */
+ params.si = (unsigned long)length; /* @length */
+
+ ret = syscall_seized(pid, regs, &params, &params);
+ if (!ret)
+ ret = (int)params.ax;
+
+ return ret;
+}
+
+/*
+ * Make the seized task @pid call exit(-1); @where supplies the
+ * register template. Returns whatever syscall_seized() reports.
+ */
+int kill_seized(pid_t pid, user_regs_struct_t *where)
+{
+	user_regs_struct_t exit_call = *where;
+
+	exit_call.ax = (unsigned long)__NR_exit;	/* exit */
+	exit_call.di = (unsigned long)-1;		/* @error-code */
+
+	return syscall_seized(pid, where, &exit_call, &exit_call);
+}
+
+/*
+ * Execute a single syscall in the context of the seized task @pid.
+ *
+ * The bytes at @where->ip are temporarily replaced with the
+ * "syscall; int3" stub. The syscall number and its arguments are
+ * taken from @params (ax, di, si, dx, r10, r8, r9). Once the task
+ * has been stopped again its registers are stored into @result.
+ * The saved code bytes and the registers are put back before
+ * returning.
+ *
+ * Returns 0 on success and -1 on failure; @result is written on
+ * success only.
+ */
+int syscall_seized(pid_t pid,
+		   user_regs_struct_t *where,
+		   user_regs_struct_t *params,
+		   user_regs_struct_t *result)
+{
+	user_regs_struct_t regs_orig, regs;
+	unsigned long start_ip;
+	char saved[sizeof(code_syscall)];
+	siginfo_t siginfo;
+	int status;
+	int ret = -1;
+
+	BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
+	BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+
+	start_ip = (unsigned long)where->ip;
+
+	/* Save the victim's code and put the syscall stub in its place */
+	jerr(ptrace_peek_area(pid, (void *)saved, (void *)start_ip, code_syscall_size), err);
+	jerr(ptrace_poke_area(pid, (void *)code_syscall, (void *)start_ip, code_syscall_size), err);
+
+again:
+	jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err);
+	regs_orig = regs;
+
+	regs.ip		= start_ip;
+	regs.ax		= params->ax;
+	regs.di		= params->di;
+	regs.si		= params->si;
+	regs.dx		= params->dx;
+	regs.r10	= params->r10;
+	regs.r8		= params->r8;
+	regs.r9		= params->r9;
+	regs.orig_ax	= -1; /* avoid end-of-syscall processing */
+
+	jerr(ptrace(PTRACE_SETREGS, pid, NULL, &regs), err_restore);
+
+	/*
+	 * Most ideas are taken from Tejun Heo's parasite thread
+	 * https://code.google.com/p/ptrace-parasite/
+	 */
+
+	/*
+	 * Run the parasite code, at the completion it'll trigger
+	 * int3 and inform us that all is done.
+	 */
+
+	jerr(ptrace(PTRACE_CONT, pid, NULL, NULL), err_restore_full);
+	jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full);
+	jerr(!WIFSTOPPED(status), err_restore_full);
+	jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo),err_restore_full);
+
+	jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_restore_full);
+
+	if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != SI_KERNEL) {
+		/*
+		 * The stop was not caused by our int3: put the original
+		 * registers back, let the task take the signal and then
+		 * start over.
+		 */
+retry_signal:
+		/* pr_debug("** delivering signal %d si_code=%d\n",
+			 siginfo.si_signo, siginfo.si_code); */
+		/* FIXME: jerr(siginfo.si_code > 0, err_restore_full); */
+		jerr(ptrace(PTRACE_SETREGS, pid, NULL, (void *)&regs_orig), err_restore_full);
+		jerr(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), err_restore_full);
+		jerr(ptrace(PTRACE_CONT, pid, NULL, (void *)(unsigned long)siginfo.si_signo), err_restore_full);
+
+		jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full);
+		jerr(!WIFSTOPPED(status), err_restore_full);
+		jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo), err_restore_full);
+
+		if (siginfo.si_code >> 8 != PTRACE_EVENT_STOP)
+			goto retry_signal;
+
+		goto again;
+	}
+
+	/*
+	 * Our code is done.
+	 *
+	 * Note that ret stays -1 until every step below has passed,
+	 * otherwise a failure here would be reported as a success
+	 * with @result left untouched.
+	 */
+	jerr(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), err_restore_full);
+	jerr(ptrace(PTRACE_CONT, pid, NULL, NULL), err_restore_full);
+
+	jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full);
+	jerr(!WIFSTOPPED(status), err_restore_full);
+	jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo), err_restore_full);
+
+	jerr((siginfo.si_code >> 8 != PTRACE_EVENT_STOP), err_restore_full);
+
+	jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_restore_full);
+
+	ret = 0;
+	*result = regs;
+
+err_restore_full:
+	if (ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig))
+		pr_panic("Can't restore registers (pid: %d)\n", pid);
+
+err_restore:
+	if (ptrace_poke_area(pid, (void *)saved, (void *)start_ip, code_syscall_size))
+		pr_panic("Crap... Can't restore data (pid: %d)\n", pid);
+err:
+	return ret;
+}
+
+/*
+ * Find an executable VMA which contains @ip and is big enough
+ * to run the syscall stub from its start. Returns NULL if there
+ * is no such area in @vma_area_list.
+ */
+static struct vma_area *get_vma_by_ip(struct list_head *vma_area_list, unsigned long ip)
+{
+	struct vma_area *pos;
+
+	list_for_each_entry(pos, vma_area_list, list) {
+		if (!in_vma_area(pos, ip))
+			continue;
+		if (!(pos->vma.prot & PROT_EXEC))
+			continue;
+		if (syscall_fits_vma_area(pos))
+			return pos;
+	}
+
+	return NULL;
+}
+
+/*
+ * Make the parasite (already injected, see @ctl) dump the pages of
+ * every regular, non file-shared VMA from @vma_area_list into the
+ * image file cr_fdset->desc[fd_type].
+ *
+ * The command word is poked once, the per-VMA arguments are poked
+ * before every run. The file is opened by the parasite on the first
+ * run and closed here with an explicit close() syscall. Finally the
+ * zero page entry is appended to the image as an end marker.
+ *
+ * Returns 0 on success, non-zero otherwise. The task registers are
+ * put back to what they were on entry.
+ */
+int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_area_list,
+			       struct cr_fdset *cr_fdset, int fd_type)
+{
+	parasite_args_cmd_dumppages_t parasite_dumppages = { };
+	parasite_args_t parasite_arg = { };
+
+	user_regs_struct_t regs, regs_orig;
+	unsigned long nrpages_dumped = 0;
+	struct vma_area *vma_area;
+	siginfo_t siginfo;
+	int status, path_len, ret = -1;
+	int close_ret = 0;
+
+	pr_info("\n");
+	pr_info("Dumping pages (type: %d pid: %d)\n", fd_type, ctl->pid);
+	pr_info("----------------------------------------\n");
+
+	path_len = strlen(cr_fdset->desc[fd_type].name) + 1;
+
+	if (path_len > (int)sizeof(parasite_dumppages.open_path)) {
+		/* sizeof yields size_t, so cast it for the %d conversion */
+		pr_panic("Dumping pages path is too long (%d while %d allowed)\n",
+			 path_len, (int)sizeof(parasite_dumppages.open_path));
+		goto err;
+	}
+
+	jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs_orig), err);
+
+	parasite_arg.command		= PARASITE_CMD_DUMPPAGES;
+	parasite_arg.args_size		= sizeof(parasite_dumppages);
+	parasite_arg.args		= &parasite_dumppages;
+
+	/* path_len (with the trailing zero) fits, so the copy is terminated */
+	strncpy(parasite_dumppages.open_path, cr_fdset->desc[fd_type].name,
+		sizeof(parasite_dumppages.open_path));
+	parasite_dumppages.open_flags	= O_WRONLY;
+	parasite_dumppages.open_mode	= CR_FD_PERM;
+	parasite_dumppages.fd		= -1UL;
+
+	/*
+	 * Pass the command first, it's immutable.
+	 */
+	jerr(ptrace_poke_area((long)ctl->pid, (void *)&parasite_arg.command,
+			      (void *)ctl->addr_cmd, sizeof(parasite_arg.command)),
+	     err_restore);
+
+	list_for_each_entry(vma_area, vma_area_list, list) {
+
+		/*
+		 * The special areas are not dumped.
+		 */
+		if (!(vma_area->vma.status & VMA_AREA_REGULAR))
+			continue;
+
+		/* No dumps for file-shared mappings */
+		if (vma_area->vma.status & VMA_FILE_SHARED)
+			continue;
+
+		pr_info_vma(vma_area);
+
+again:
+		jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs), err_restore);
+		regs.ip = ctl->parasite_ip;
+		jerr(ptrace(PTRACE_SETREGS, ctl->pid, NULL, &regs), err_restore);
+
+		parasite_dumppages.vma_entry = vma_area->vma;
+
+		if (ptrace_poke_area((long)ctl->pid, (void *)parasite_arg.args,
+				     (void *)ctl->addr_args, parasite_arg.args_size)) {
+			pr_error("Can't setup parasite arguments (pid: %d)\n", ctl->pid);
+			goto err_restore;
+		}
+
+		jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, NULL), err_restore);
+		jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore);
+		jerr(!WIFSTOPPED(status), err_restore);
+		jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore);
+
+		if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != SI_KERNEL) {
+			/*
+			 * Not the parasite's int3: restore the registers,
+			 * let the task take the signal and run the parasite
+			 * for this VMA once more.
+			 */
+retry_signal:
+			/* pr_debug("** delivering signal %d si_code=%d\n",
+				 siginfo.si_signo, siginfo.si_code); */
+			/* FIXME: jerr(siginfo.si_code > 0, err_restore_full); */
+			jerr(ptrace(PTRACE_SETREGS, (long)ctl->pid, NULL, (void *)&regs_orig), err_restore);
+			jerr(ptrace(PTRACE_INTERRUPT, (long)ctl->pid, NULL, NULL), err_restore);
+			jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, (void *)(unsigned long)siginfo.si_signo), err_restore);
+
+			jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore);
+			jerr(!WIFSTOPPED(status), err_restore);
+			jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore);
+
+			if (siginfo.si_code >> 8 != PTRACE_EVENT_STOP)
+				goto retry_signal;
+
+			goto again;
+		}
+
+		/*
+		 * It's a bit tricky, the file get opened inside
+		 * parasite but close via explicit syscall. Better would
+		 * be to add some 'status' and close inside parasite on
+		 * last call.
+		 */
+		if (parasite_dumppages.fd == -1UL) {
+			if (ptrace_peek_area((long)ctl->pid,
+					     (void *)&parasite_dumppages.fd,
+					     (void *)(ctl->addr_args +
+						      offsetof(parasite_args_cmd_dumppages_t, fd)),
+					     sizeof(parasite_dumppages.fd))) {
+				pr_error("Can't get file descriptor back (pid: %d)\n", ctl->pid);
+				goto err_restore;
+			}
+		}
+
+		/*
+		 * Get some statistics. The size must be the one of the
+		 * member being read, not of the descriptor.
+		 */
+		if (ptrace_peek_area((long)ctl->pid,
+				     (void *)&parasite_dumppages.nrpages_dumped,
+				     (void *)(ctl->addr_args +
+					      offsetof(parasite_args_cmd_dumppages_t, nrpages_dumped)),
+				     sizeof(parasite_dumppages.nrpages_dumped))) {
+			pr_error("Can't get statistics (pid: %d)\n", ctl->pid);
+			goto err_restore;
+		}
+		pr_info("  (dumped: %16li pages)\n", parasite_dumppages.nrpages_dumped);
+		nrpages_dumped += parasite_dumppages.nrpages_dumped;
+	}
+
+	/*
+	 * Our code is done.
+	 */
+	jerr(ptrace(PTRACE_INTERRUPT, (long)ctl->pid, NULL, NULL), err_restore);
+	jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, NULL), err_restore);
+
+	jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore);
+	jerr(!WIFSTOPPED(status), err_restore);
+	jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore);
+
+	jerr((siginfo.si_code >> 8 != PTRACE_EVENT_STOP), err_restore);
+
+	jerr(ptrace(PTRACE_GETREGS, (long)ctl->pid, NULL, &regs), err_restore);
+
+	/* Finally close the descriptor the parasite has opened */
+	if (parasite_dumppages.fd != -1UL) {
+		regs	= regs_orig;
+		regs.ax	= __NR_close;			/* close */
+		regs.di	= parasite_dumppages.fd;	/* @fd */
+		close_ret = syscall_seized(ctl->pid, &regs_orig, &regs, &regs);
+	}
+
+	/*
+	 * We don't know the position in file since it's updated
+	 * outside of our process. The end marker goes to the same
+	 * image the parasite has been writing to.
+	 */
+	lseek(cr_fdset->desc[fd_type].fd, 0, SEEK_END);
+
+	/* Ending page; ret is still -1 here so a failed write is reported */
+	write_ptr_safe(cr_fdset->desc[fd_type].fd, &zero_page_entry, err_restore);
+
+	ret = close_ret;
+
+	pr_info("\n");
+	pr_info("Summary: %16li pages dumped\n", nrpages_dumped);
+
+err_restore:
+	if (ptrace(PTRACE_SETREGS, (long)ctl->pid, NULL, &regs_orig))
+		pr_panic("Can't restore registers (pid: %d)\n", ctl->pid);
+
+err:
+	pr_info("----------------------------------------\n");
+
+	return ret;
+}
+
+/*
+ * Remove the parasite from the task: unmap the parasite area with an
+ * in-task munmap(), put the registers back and release the control
+ * block (*@p_ctl is set to NULL).
+ *
+ * Returns 0 if there is nothing to cure or on success. If the
+ * registers can't be read or no VMA suits for running the syscall,
+ * -1 is returned and the control block is left as is.
+ */
+int parasite_cure_seized(struct parasite_ctl **p_ctl,
+			 struct list_head *vma_area_list)
+{
+	user_regs_struct_t regs, regs_orig;
+	struct parasite_ctl *ctl;
+	struct vma_area *exec_area;
+	int ret = -1;
+
+	if (!p_ctl || !*p_ctl)
+		return 0;
+
+	ctl = *p_ctl;
+
+	jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs), err);
+
+	regs_orig = regs;
+
+	exec_area = get_vma_by_ip(vma_area_list, regs.ip);
+	if (!exec_area) {
+		pr_error("No suitable VMA found to run cure (pid: %d)\n", ctl->pid);
+		goto err;
+	}
+
+	/* the syscall is run from the start of the area found */
+	regs.ip = exec_area->vma.start;
+
+	ret = munmap_seized(ctl->pid, &regs,
+			    (void *)ctl->vma_area->vma.start,
+			    (size_t)vma_entry_len(&ctl->vma_area->vma));
+	if (ret)
+		pr_error("munmap_seized failed (pid: %d)\n", ctl->pid);
+
+	if (ptrace(PTRACE_SETREGS, ctl->pid, NULL, &regs_orig)) {
+		ret = -1;
+		pr_panic("PTRACE_SETREGS failed (pid: %d)\n", ctl->pid);
+	}
+
+	free(*p_ctl);
+	*p_ctl = NULL;
+err:
+	return ret;
+}
+
+/*
+ * Inject the parasite blob into the seized task @pid.
+ *
+ * An anonymous RWX area is mmap()-ed inside the task (at @addr_hint
+ * if the kernel agrees) by running a syscall from the start of the
+ * executable VMA the task is currently in, then the blob is poked
+ * into it. The task registers are restored before returning.
+ *
+ * Returns a control block to be released with parasite_cure_seized(),
+ * or NULL on failure.
+ */
+struct parasite_ctl *parasite_infect_seized(pid_t pid, void *addr_hint, struct list_head *vma_area_list)
+{
+	user_regs_struct_t regs, regs_orig;
+	struct parasite_ctl *ctl = NULL;
+	struct vma_area *vma_area;
+	void *mmaped;
+
+	/* The control block and its vma_area share a single allocation */
+	ctl = xzalloc(sizeof(*ctl) + sizeof(*vma_area));
+	if (!ctl) {
+		pr_error("Parasite control block allocation failed (pid: %d)\n", pid);
+		goto err;
+	}
+
+	/*
+	 * Setup control block. The vma_area lives right after the
+	 * control block itself, i.e. at &ctl[1]; indexing by
+	 * sizeof(*ctl) would point far outside of the allocation.
+	 */
+	ctl->pid	= pid;
+	ctl->vma_area	= (struct vma_area *)(char *)&ctl[1];
+
+	if (ptrace(PTRACE_GETREGS, pid, NULL, &regs))
+		pr_error_jmp(err_free);
+
+	vma_area = get_vma_by_ip(vma_area_list, regs.ip);
+	if (!vma_area) {
+		pr_error("No suitable VMA found to run parasite "
+			 "bootstrap code (pid: %d)\n", pid);
+		goto err_free;
+	}
+
+	regs_orig = regs;
+
+	/*
+	 * Prepare for in-process syscall.
+	 */
+	ctl->vma_area->vma.prot		= PROT_READ | PROT_WRITE | PROT_EXEC;
+	ctl->vma_area->vma.flags	= MAP_PRIVATE | MAP_ANONYMOUS;
+
+	regs.ip = vma_area->vma.start;
+
+	mmaped = mmap_seized(pid, &regs, addr_hint, (size_t)parasite_size,
+			     (int)ctl->vma_area->vma.prot,
+			     (int)ctl->vma_area->vma.flags,
+			     (int)-1, (off_t)0);
+
+	if (!mmaped || (long)mmaped < 0) {
+		pr_error("Can't allocate memory for parasite blob (pid: %d)\n", pid);
+		goto err_restore_regs;
+	}
+
+	ctl->parasite_ip		= PARASITE_HEAD_ADDR((unsigned long)mmaped);
+	ctl->parasite_complete_ip	= PARASITE_COMPLETE_ADDR((unsigned long)mmaped);
+	ctl->addr_cmd			= PARASITE_CMD_ADDR((unsigned long)mmaped);
+	ctl->addr_args			= PARASITE_ARGS_ADDR((unsigned long)mmaped);
+
+	ctl->vma_area->vma.start	= (u64)mmaped;
+	ctl->vma_area->vma.end		= (u64)(mmaped + parasite_size);
+
+	if (ptrace_poke_area(pid, parasite_blob, mmaped, parasite_size)) {
+		pr_error("Can't inject parasite blob (pid: %d)\n", pid);
+		goto err_munmap_restore;
+	}
+
+	jerr(ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig), err_munmap_restore);
+
+	return ctl;
+
+err_munmap_restore:
+	regs = regs_orig;
+	regs.ip = vma_area->vma.start;
+	if (munmap_seized(pid, &regs, mmaped, parasite_size))
+		pr_panic("munmap_seized failed (pid: %d)\n", pid);
+err_restore_regs:
+	if (ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig))
+		pr_panic("PTRACE_SETREGS failed (pid: %d)\n", pid);
+err_free:
+	if (ctl)
+		free(ctl);
+err:
+	return NULL;
+}
+
+#else /* CONFIG_X86_64 */
+# error x86-32 is not yet implemented
+#endif /* CONFIG_X86_64 */
diff --git a/parasite.c b/parasite.c
new file mode 100644
index 000000000..fca9cfd62
--- /dev/null
+++ b/parasite.c
@@ -0,0 +1,339 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "compiler.h"
+#include "types.h"
+#include "syscall.h"
+#include "parasite.h"
+#include "image.h"
+#include "crtools.h"
+
+#ifdef CONFIG_X86_64
+
+/*
+ * State of the trivial allocator below: allocations are taken
+ * from [brk_start, brk_end), brk_tail is the current top.
+ */
+static void *brk_start, *brk_end, *brk_tail;
+
+/* Buffers restore_core() reads image entries into */
+static struct page_entry page;
+static struct vma_entry vma;
+
+/*
+ * Private memcpy: copies @n bytes from @src to @dest and returns @dest.
+ *
+ * The bulk is moved as n >> 3 quadwords (rep movsq), the remaining
+ * n & 7 bytes are moved one by one (rep movsb).
+ */
+void *memcpy(void *dest, const void *src, size_t n)
+{
+ long d0, d1, d2;
+ asm volatile(
+ "rep ; movsq\n\t"
+ "movq %4,%%rcx\n\t"
+ "rep ; movsb\n\t"
+ /* rcx, rdi and rsi are modified, hence early-clobber outputs */
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
+ : "memory");
+
+ return dest;
+}
+
+/*
+ * (Re)start the allocator on the PARASITE_BRK_SIZE bytes area at @brk.
+ */
+static void brk_init(void *brk)
+{
+	brk_start	= brk;
+	brk_tail	= brk;
+	brk_end		= brk + PARASITE_BRK_SIZE;
+}
+
+/*
+ * Take @bytes from the top of the brk area. Returns the start of
+ * the chunk or NULL if there is not enough room left.
+ */
+static void *brk_alloc(unsigned long bytes)
+{
+	void *chunk;
+
+	if (brk_end <= (brk_tail + bytes))
+		return NULL;
+
+	chunk = brk_tail;
+	brk_tail += bytes;
+
+	return chunk;
+}
+
+/*
+ * Give the topmost @bytes back to the brk area.
+ *
+ * The request is ignored if it is bigger than what is currently
+ * allocated, so brk_tail never drops below brk_start.
+ */
+static void brk_free(unsigned long bytes)
+{
+	if (bytes <= (unsigned long)(brk_tail - brk_start))
+		brk_tail -= bytes;
+}
+
+/*
+ * Length of the zero terminated string @str, the terminator
+ * is not counted.
+ */
+static unsigned long builtin_strlen(char *str)
+{
+	char *end = str;
+
+	while (*end)
+		end++;
+
+	return (unsigned long)(end - str);
+}
+
+static const unsigned char hex[] = "0123456789abcdef";
+
+/*
+ * Format @v as hexadecimal digits, two per byte, walking the bytes
+ * of @v from the highest address down to the lowest one.
+ *
+ * The result is kept in a static buffer which is overwritten by
+ * the next call.
+ */
+static char *long2hex(unsigned long v)
+{
+	static char buf[32];
+	const unsigned char *bytes = (const unsigned char *)&v;
+	unsigned int pos = 0;
+	int i = sizeof(long);
+
+	while (i-- > 0) {
+		unsigned char b = bytes[i];
+
+		buf[pos++] = hex[b >> 4];
+		buf[pos++] = hex[b & 0x0f];
+	}
+	buf[pos] = 0;
+
+	return buf;
+}
+
+/*
+ * Write the zero terminated @msg to file descriptor 1.
+ */
+static void sys_write_msg(const char *msg)
+{
+	int len;
+
+	for (len = 0; msg[len]; len++)
+		;
+
+	sys_write(1, msg, len);
+}
+
+/*
+ * Restore the memory described in @corefile: first map every VMA
+ * at its original address, then fill the dumped pages in.
+ *
+ * Both lists are terminated with an all-zero entry. A clean end of
+ * file where a new entry would start is treated as the end of the
+ * list too, while a partially read entry is an error.
+ *
+ * Returns 0 on success or one of PARASITE_ERR_ codes.
+ */
+static int restore_core(char *corefile)
+{
+	int ret = PARASITE_ERR_FAIL;
+	int fd_core;
+
+	fd_core = (int)sys_open(corefile, O_RDONLY, 0600);
+	if (fd_core < 0) {
+		ret = PARASITE_ERR_OPEN;
+		goto err_open;
+	}
+
+	/* Skip the header */
+	sys_lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
+
+	/* First VMA areas */
+	while (1) {
+		unsigned long addr;
+
+		ret = sys_read(fd_core, &vma, sizeof(vma));
+		if (!ret)
+			break; /* EOF, don't go on with a stale entry */
+		if (ret != (int)sizeof(vma)) {
+			ret = PARASITE_ERR_CORE_VMA;
+			goto err;
+		}
+
+		if (vma.start == 0 && vma.end == 0)
+			break;
+
+		/* Make sure it's mapped into proper place */
+		addr = sys_mmap((void *)vma.start,
+				vma.end - vma.start,
+				vma.prot,
+				vma.flags | MAP_FIXED,
+				vma.fd,
+				vma.pgoff);
+		if (addr != vma.start) {
+			ret = PARASITE_ERR_MMAP;
+			goto err;
+		}
+	}
+
+	/* Now pages */
+	while (1) {
+		ret = sys_read(fd_core, &page.va, sizeof(page.va));
+		if (!ret)
+			break; /* EOF, don't go on with a stale entry */
+		if (ret != (int)sizeof(page.va)) {
+			ret = PARASITE_ERR_CORE_PAGE;
+			goto err;
+		}
+
+		if (page.va == 0)
+			break;
+
+		/* The address is there, so the data must follow in full */
+		ret = sys_read(fd_core, page.data, sizeof(page.data));
+		if (ret != (int)sizeof(page.data)) {
+			ret = PARASITE_ERR_CORE_PAGE;
+			goto err;
+		}
+
+		memcpy((void *)page.va, page.data, sizeof(page.data));
+	}
+
+	ret = 0;
+
+err:
+	sys_close(fd_core);
+
+err_open:
+	return ret;
+}
+
+/*
+ * Dump the pages of the VMA described in @args->vma_entry.
+ *
+ * On the first call (@args->fd is -1UL) the image file is opened
+ * and the descriptor is stored in @args->fd. Every page reported
+ * by mincore() with PAGE_RSS set (or every page if VMA_DUMP_ALL is
+ * set) is appended to the file as "address, page contents". The
+ * number of pages written is left in @args->nrpages_dumped.
+ *
+ * If the area is not readable, PROT_READ is added for the time of
+ * dumping and the protection is put back before returning, on the
+ * error paths as well.
+ *
+ * Returns 0 on success or one of PARASITE_ERR_ codes.
+ */
+static int dump_pages(parasite_args_cmd_dumppages_t *args)
+{
+	int ret = PARASITE_ERR_FAIL;
+	unsigned long nrpages, pfn, length;
+	unsigned long prot_old, prot_new;
+	unsigned char *map_brk = NULL;
+	unsigned char *map;
+	bool dump_all = false;
+
+	args->nrpages_dumped = 0;
+	prot_old = prot_new = 0;
+
+	if (args->fd == -1UL) {
+		args->fd = sys_open(args->open_path, args->open_flags, args->open_mode);
+		/* fd is compared with -1UL, so test the sign explicitly */
+		if ((long)args->fd < 0) {
+			sys_write_msg("sys_open failed\n");
+			/* don't let an error code pass for a descriptor */
+			args->fd = -1UL;
+			ret = PARASITE_ERR_OPEN;
+			goto err;
+		}
+	}
+
+	/* Start from the end of file */
+	sys_lseek(args->fd, 0, SEEK_END);
+
+	length	= args->vma_entry.end - args->vma_entry.start;
+	nrpages	= length / PAGE_SIZE;
+
+	/*
+	 * brk should allow us to handle up to 128M of memory,
+	 * otherwise call for mmap.
+	 */
+	map = brk_alloc(nrpages);
+	if (map) {
+		map_brk = map;
+	} else {
+		map = (void *)sys_mmap(NULL, nrpages,
+				       PROT_READ | PROT_WRITE,
+				       MAP_PRIVATE | MAP_ANONYMOUS,
+				       -1, 0);
+		if ((long)map < 0) {
+			sys_write_msg("sys_mmap failed\n");
+			ret = PARASITE_ERR_MMAP;
+			goto err;
+		}
+	}
+
+	dump_all = !!(args->vma_entry.status & VMA_DUMP_ALL);
+
+	/*
+	 * Try to change page protection if needed so we would
+	 * be able to dump contents.
+	 */
+	if (!(args->vma_entry.prot & PROT_READ)) {
+		unsigned long prot_rd = (unsigned long)args->vma_entry.prot | PROT_READ;
+
+		if (sys_mprotect((unsigned long)args->vma_entry.start,
+				 (unsigned long)vma_entry_len(&args->vma_entry),
+				 prot_rd)) {
+			sys_write_msg("sys_mprotect failed\n");
+			ret = PARASITE_ERR_MPROTECT;
+			goto err_free;
+		}
+
+		/* remember the change only once it has really been made */
+		prot_old = (unsigned long)args->vma_entry.prot;
+		prot_new = prot_rd;
+	}
+
+	/*
+	 * Dumping the whole VMA range is not a common operation
+	 * so stick for mincore as a basis.
+	 */
+
+	if (sys_mincore((unsigned long)args->vma_entry.start, length, map)) {
+		sys_write_msg("sys_mincore failed\n");
+		ret = PARASITE_ERR_MINCORE;
+		goto err_prot;
+	}
+
+	ret = 0;
+	for (pfn = 0; pfn < nrpages; pfn++) {
+		unsigned long vaddr, written;
+
+		if ((map[pfn] & PAGE_RSS) || dump_all) {
+			/*
+			 * That's the optimized write of
+			 * page_entry structure, see image.h
+			 */
+			vaddr = (unsigned long)args->vma_entry.start + pfn * PAGE_SIZE;
+			written = 0;
+
+			written += sys_write(args->fd, &vaddr, sizeof(vaddr));
+			written += sys_write(args->fd, (void *)vaddr, PAGE_SIZE);
+			if (written != sizeof(vaddr) + PAGE_SIZE) {
+				ret = PARASITE_ERR_WRITE;
+				sys_write_msg("sys_write on page failed\n");
+				goto err_prot;
+			}
+
+			args->nrpages_dumped++;
+		}
+	}
+
+err_prot:
+	/*
+	 * Don't left pages readable if they were not. An earlier
+	 * error code is kept if there is one.
+	 */
+	if (prot_old != prot_new) {
+		if (sys_mprotect((unsigned long)args->vma_entry.start,
+				 (unsigned long)vma_entry_len(&args->vma_entry),
+				 prot_old)) {
+			sys_write_msg("PANIC: Ouch! sys_mprotect failed on resore\n");
+			if (!ret)
+				ret = PARASITE_ERR_MPROTECT;
+		}
+	}
+
+err_free:
+	if (map_brk)
+		brk_free(nrpages);
+	else
+		sys_munmap(map, nrpages);
+err:
+	return ret;
+}
+
+/*
+ * Command dispatcher, called from parasite_head with the command,
+ * the address of the arguments area and the address of the brk area.
+ *
+ * Returns the handler's result for the dump-pages and restore-core
+ * commands, 0 for anything else.
+ */
+static int __used parasite_service(unsigned long cmd, void *args, void *brk)
+{
+	int ret = 0;
+
+	brk_init(brk);
+
+	switch (cmd) {
+	case PARASITE_CMD_KILLME:
+		sys_close(0);
+		break;
+	case PARASITE_CMD_PINGME:
+		break;
+	case PARASITE_CMD_DUMPPAGES:
+		ret = dump_pages((parasite_args_cmd_dumppages_t *)args);
+		break;
+	case PARASITE_CMD_RESTORECORE:
+		ret = restore_core((char *)args);
+		break;
+	default:
+		sys_write_msg("Unknown command to parasite\n");
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Entry point of the parasite blob.
+ *
+ * Points the stack at parasite_stack, calls
+ * parasite_service(parasite_cmd, &parasite_args, &parasite_brk)
+ * and then raises int3 so the tracer gets control back.
+ *
+ * The data the tracer pokes into follows the code: the command
+ * word, the arguments area (PARASITE_ARG_SIZE), the stack
+ * (PARASITE_STACK_SIZE, ending at parasite_stack) and the brk
+ * area (PARASITE_BRK_SIZE).
+ */
+static void __parasite_head __used parasite_head(void)
+{
+ /*
+ * The linker will handle the stack allocation.
+ */
+ asm volatile("parasite_head_start: \n\t"
+ "leaq parasite_stack(%rip), %rsp \n\t"
+ "pushq $0 \n\t"
+ "movq %rsp, %rbp \n\t"
+ "movl parasite_cmd(%rip), %edi \n\t"
+ "leaq parasite_args(%rip), %rsi \n\t"
+ "leaq parasite_brk(%rip), %rdx \n\t"
+ "call parasite_service \n\t"
+ "parasite_service_complete: \n\t"
+ "int $0x03 \n\t"
+ ".align 8 \n\t"
+ "parasite_cmd: \n\t"
+ ".long 0 \n\t"
+ "parasite_args: \n\t"
+ ".long 0 \n\t"
+ ".skip "__stringify(PARASITE_ARG_SIZE)",0 \n\t"
+ ".skip "__stringify(PARASITE_STACK_SIZE)", 0 \n\t"
+ "parasite_stack: \n\t"
+ ".long 0 \n\t"
+ "parasite_brk: \n\t"
+ ".skip "__stringify(PARASITE_BRK_SIZE)", 0 \n\t"
+ ".long 0 \n\t");
+}
+
+#else /* CONFIG_X86_64 */
+# error x86-32 bit mode not yet implemented
+#endif /* CONFIG_X86_64 */
diff --git a/parasite.lds.S b/parasite.lds.S
new file mode 100644
index 000000000..0f3aa327c
--- /dev/null
+++ b/parasite.lds.S
@@ -0,0 +1,19 @@
+/*
+ * Linker script for the parasite blob: the output is a flat
+ * binary laid out from address 0, with the .parasite.head.text
+ * section placed first.
+ */
+OUTPUT_FORMAT("binary")
+OUTPUT_ARCH(i386:x86-64)
+
+SECTIONS
+{
+ . = 0;
+ .text : {
+ /* head code goes before any other text */
+ *(.parasite.head.text)
+ *(.text)
+ . = ALIGN(8);
+ }
+ /* all data-like input sections are merged into one output section */
+ .data : {
+ *(.data)
+ *(.rodata)
+ *(.bss)
+ *(.parasite.stack)
+ . = ALIGN(8);
+ }
+}
diff --git a/rbtree.c b/rbtree.c
new file mode 100644
index 000000000..bfaf22674
--- /dev/null
+++ b/rbtree.c
@@ -0,0 +1,322 @@
+/*
+ * RBtree implementation adopted from the Linux
+ * kernel sources.
+ */
+
+#include "rbtree.h"
+
+/*
+ * Left rotation around @node: its right child takes the place of
+ * @node (in the parent or in @root), @node becomes the left child
+ * of that node and adopts its former left subtree as the right one.
+ */
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *right = node->rb_right;
+ struct rb_node *parent = rb_parent(node);
+
+ /* the left subtree of @right moves under @node */
+ if ((node->rb_right = right->rb_left))
+ rb_set_parent(right->rb_left, node);
+ right->rb_left = node;
+
+ rb_set_parent(right, parent);
+
+ if (parent) {
+ if (node == parent->rb_left)
+ parent->rb_left = right;
+ else
+ parent->rb_right = right;
+ } else
+ root->rb_node = right;
+ rb_set_parent(node, right);
+}
+
+/*
+ * Right rotation around @node: its left child takes the place of
+ * @node (in the parent or in @root), @node becomes the right child
+ * of that node and adopts its former right subtree as the left one.
+ */
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *left = node->rb_left;
+ struct rb_node *parent = rb_parent(node);
+
+ /* the right subtree of @left moves under @node */
+ if ((node->rb_left = left->rb_right))
+ rb_set_parent(left->rb_right, node);
+ left->rb_right = node;
+
+ rb_set_parent(left, parent);
+
+ if (parent) {
+ if (node == parent->rb_right)
+ parent->rb_right = left;
+ else
+ parent->rb_left = left;
+ } else
+ root->rb_node = left;
+ rb_set_parent(node, left);
+}
+
+/*
+ * Rebalance the tree after @node has been linked into it.
+ *
+ * Loops for as long as the parent of @node is red: a red uncle is
+ * handled by recolouring and moving up to the grandparent, otherwise
+ * one or two rotations are done. The root is painted black at the end.
+ */
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *parent, *gparent;
+
+ while ((parent = rb_parent(node)) && rb_is_red(parent)) {
+ gparent = rb_parent(parent);
+
+ if (parent == gparent->rb_left) {
+ {
+ register struct rb_node *uncle = gparent->rb_right;
+ /* red uncle: recolour and continue from the grandparent */
+ if (uncle && rb_is_red(uncle)) {
+ rb_set_black(uncle);
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ node = gparent;
+ continue;
+ }
+ }
+
+ /* @node is an inner child: rotate it to the outside first */
+ if (parent->rb_right == node) {
+ register struct rb_node *tmp;
+ __rb_rotate_left(parent, root);
+ tmp = parent;
+ parent = node;
+ node = tmp;
+ }
+
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ __rb_rotate_right(gparent, root);
+ } else {
+ /* mirror image of the branch above */
+ {
+ register struct rb_node *uncle = gparent->rb_left;
+ if (uncle && rb_is_red(uncle)) {
+ rb_set_black(uncle);
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ node = gparent;
+ continue;
+ }
+ }
+
+ if (parent->rb_left == node) {
+ register struct rb_node *tmp;
+ __rb_rotate_right(parent, root);
+ tmp = parent;
+ parent = node;
+ node = tmp;
+ }
+
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ __rb_rotate_left(gparent, root);
+ }
+ }
+
+ rb_set_black(root->rb_node);
+}
+
+/*
+ * Rebalance the tree after a black node has been unlinked.
+ *
+ * @node is the child which took the place of the removed node (may
+ * be NULL), @parent is its parent. The loop runs while @node is
+ * black (or NULL) and is not the root; @other is the sibling of @node.
+ */
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+ struct rb_root *root)
+{
+ struct rb_node *other;
+
+ while ((!node || rb_is_black(node)) && node != root->rb_node) {
+ if (parent->rb_left == node) {
+ other = parent->rb_right;
+ /* red sibling: rotate to get a black one */
+ if (rb_is_red(other)) {
+ rb_set_black(other);
+ rb_set_red(parent);
+ __rb_rotate_left(parent, root);
+ other = parent->rb_right;
+ }
+ /* both children of the sibling are black: recolour and go up */
+ if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+ (!other->rb_right || rb_is_black(other->rb_right))) {
+ rb_set_red(other);
+ node = parent;
+ parent = rb_parent(node);
+ } else {
+ if (!other->rb_right || rb_is_black(other->rb_right)) {
+ rb_set_black(other->rb_left);
+ rb_set_red(other);
+ __rb_rotate_right(other, root);
+ other = parent->rb_right;
+ }
+ rb_set_color(other, rb_color(parent));
+ rb_set_black(parent);
+ rb_set_black(other->rb_right);
+ __rb_rotate_left(parent, root);
+ node = root->rb_node;
+ break;
+ }
+ } else {
+ /* mirror image of the branch above */
+ other = parent->rb_left;
+ if (rb_is_red(other)) {
+ rb_set_black(other);
+ rb_set_red(parent);
+ __rb_rotate_right(parent, root);
+ other = parent->rb_left;
+ }
+ if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+ (!other->rb_right || rb_is_black(other->rb_right))) {
+ rb_set_red(other);
+ node = parent;
+ parent = rb_parent(node);
+ } else {
+ if (!other->rb_left || rb_is_black(other->rb_left)) {
+ rb_set_black(other->rb_right);
+ rb_set_red(other);
+ __rb_rotate_left(other, root);
+ other = parent->rb_left;
+ }
+ rb_set_color(other, rb_color(parent));
+ rb_set_black(parent);
+ rb_set_black(other->rb_left);
+ __rb_rotate_right(parent, root);
+ node = root->rb_node;
+ break;
+ }
+ }
+ }
+ if (node)
+ rb_set_black(node);
+}
+
+/*
+ * Unlink @node from the tree at @root.
+ *
+ * A node with at most one child is replaced with that child. A node
+ * with two children is replaced with the leftmost node of its right
+ * subtree, which takes over the children, the parent and the colour
+ * of @node. If the node which was actually taken out of its place
+ * was black, the tree is rebalanced with __rb_erase_color().
+ */
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *child, *parent;
+ int color;
+
+ if (!node->rb_left)
+ child = node->rb_right;
+ else if (!node->rb_right)
+ child = node->rb_left;
+ else {
+ struct rb_node *old = node, *left;
+
+ /* find the leftmost node of the right subtree */
+ node = node->rb_right;
+ while ((left = node->rb_left) != NULL)
+ node = left;
+
+ if (rb_parent(old)) {
+ if (rb_parent(old)->rb_left == old)
+ rb_parent(old)->rb_left = node;
+ else
+ rb_parent(old)->rb_right = node;
+ } else
+ root->rb_node = node;
+
+ child = node->rb_right;
+ parent = rb_parent(node);
+ color = rb_color(node);
+
+ if (parent == old) {
+ parent = node;
+ } else {
+ if (child)
+ rb_set_parent(child, parent);
+ parent->rb_left = child;
+
+ node->rb_right = old->rb_right;
+ rb_set_parent(old->rb_right, node);
+ }
+
+ /* the replacement inherits parent, colour and left subtree */
+ node->rb_parent_color = old->rb_parent_color;
+ node->rb_left = old->rb_left;
+ rb_set_parent(old->rb_left, node);
+
+ goto color;
+ }
+
+ parent = rb_parent(node);
+ color = rb_color(node);
+
+ if (child)
+ rb_set_parent(child, parent);
+ if (parent) {
+ if (parent->rb_left == node)
+ parent->rb_left = child;
+ else
+ parent->rb_right = child;
+ } else
+ root->rb_node = child;
+
+ color:
+ if (color == RB_BLACK)
+ __rb_erase_color(child, parent, root);
+}
+
+/*
+ * Returns the leftmost node of the tree or NULL if the tree is empty.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+	struct rb_node *leftmost = root->rb_node;
+
+	if (leftmost) {
+		while (leftmost->rb_left)
+			leftmost = leftmost->rb_left;
+	}
+
+	return leftmost;
+}
+
+/*
+ * Returns the rightmost node of the tree or NULL if the tree is empty.
+ */
+struct rb_node *rb_last(const struct rb_root *root)
+{
+	struct rb_node *rightmost = root->rb_node;
+
+	if (rightmost) {
+		while (rightmost->rb_right)
+			rightmost = rightmost->rb_right;
+	}
+
+	return rightmost;
+}
+
+/*
+ * Returns the in-order successor of @node or NULL if there is none.
+ */
+struct rb_node *rb_next(const struct rb_node *node)
+{
+ struct rb_node *parent;
+
+ /* a node which is its own parent has no successor */
+ if (rb_parent(node) == node)
+ return NULL;
+
+ /* with a right child: the leftmost node of the right subtree */
+ if (node->rb_right) {
+ node = node->rb_right;
+ while (node->rb_left)
+ node=node->rb_left;
+ return (struct rb_node *)node;
+ }
+
+ /* otherwise go up until we arrive from a left child */
+ while ((parent = rb_parent(node)) && node == parent->rb_right)
+ node = parent;
+
+ return parent;
+}
+
+/*
+ * Returns the in-order predecessor of @node or NULL if there is none.
+ */
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+ struct rb_node *parent;
+
+ /* a node which is its own parent has no predecessor */
+ if (rb_parent(node) == node)
+ return NULL;
+
+ /* with a left child: the rightmost node of the left subtree */
+ if (node->rb_left) {
+ node = node->rb_left;
+ while (node->rb_right)
+ node=node->rb_right;
+ return (struct rb_node *)node;
+ }
+
+ /* otherwise go up until we arrive from a right child */
+ while ((parent = rb_parent(node)) && node == parent->rb_left)
+ node = parent;
+
+ return parent;
+}
+
+/*
+ * Put @new into the tree in place of @victim: the parent (or @root)
+ * and the children of @victim are pointed at @new, and the links and
+ * colour of @victim are copied into @new. No rebalancing is done.
+ */
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root)
+{
+ struct rb_node *parent = rb_parent(victim);
+
+ if (parent) {
+ if (victim == parent->rb_left)
+ parent->rb_left = new;
+ else
+ parent->rb_right = new;
+ } else {
+ root->rb_node = new;
+ }
+ if (victim->rb_left)
+ rb_set_parent(victim->rb_left, new);
+ if (victim->rb_right)
+ rb_set_parent(victim->rb_right, new);
+
+ *new = *victim;
+}
diff --git a/testee-static.c b/testee-static.c
new file mode 100644
index 000000000..39b764d9b
--- /dev/null
+++ b/testee-static.c
@@ -0,0 +1,112 @@
+/*
+ * A simple testee program
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#include <sched.h>
+
+/*
+ * Testee: sets up a file-shared, a file-private, an anonymous
+ * private, an anonymous shared and an unreadable mapping, fills them
+ * with known patterns and then prints "ping" forever so it can be
+ * checkpointed.
+ *
+ * Always returns 0, errors are reported with perror().
+ */
+int main(int argc, char *argv[])
+{
+//	int pipefd[2];
+	int fd_shared, fd_private;
+	const char data_mark[] = "This is a data_mark marker";
+	void *mmap_shared, *mmap_private, *mmap_anon, *map_unreadable;
+	void *mmap_anon_shared;
+	const char sep[] = "----------";
+	unsigned long buf;
+	int i;
+
+	(void)data_mark;
+	(void)buf; /* only used by the commented out pipe code */
+
+	printf("%s pid %d\n", argv[0], getpid());
+
+//	if (pipe(pipefd)) {
+//		perror("Can't create pipe");
+//		goto err;
+//	}
+
+	fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
+	if (fd_shared < 0) {
+		perror("Can't open fd_shared file");
+		goto err;
+	}
+
+	fd_private = open("testee-private.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
+	if (fd_private < 0) {
+		perror("Can't open fd_private file");
+		goto err;
+	}
+
+	if (lseek(fd_shared, 1024, SEEK_SET) == -1 ||
+	    lseek(fd_private, 1024, SEEK_SET) == -1) {
+		perror("Can't llsek");
+		goto err;
+	}
+
+	/*
+	 * One byte at offset 1024 gives the files their size. If it
+	 * is not written the files stay empty and touching the file
+	 * mappings below would fault.
+	 */
+	if (write(fd_shared, "", 1) != 1 ||
+	    write(fd_private, "", 1) != 1) {
+		perror("Can't write");
+		goto err;
+	}
+
+	mmap_shared	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0);
+	mmap_private	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0);
+	mmap_anon	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	map_unreadable	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	mmap_anon_shared= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+
+	if (mmap_shared		== MAP_FAILED ||
+	    mmap_private	== MAP_FAILED ||
+	    mmap_anon_shared	== MAP_FAILED ||
+	    mmap_anon		== MAP_FAILED ||
+	    map_unreadable	== MAP_FAILED) {
+
+		perror("mmap failed");
+		goto err;
+	}
+
+	strcpy((char *)mmap_shared,	sep);
+	strcpy((char *)mmap_private,	sep);
+	strcpy((char *)mmap_anon,	sep);
+	strcpy((char *)map_unreadable,	sep);
+	strcpy((char *)mmap_anon_shared,sep);
+
+	for (i = 64; i < 128; i++) {
+		((char *)mmap_shared)[i]	= 0 + i;
+		((char *)mmap_private)[i]	= 64 + i;
+		((char *)mmap_anon)[i]		= 128 + i;
+		((char *)map_unreadable)[i]	= 190 + i;
+		((char *)mmap_anon_shared)[i]	= 0 + i;
+	}
+
+	/* the area is filled in while writable and made inaccessible afterwards */
+	if (mprotect(map_unreadable, 1024, PROT_NONE)) {
+		perror("mprotect failed");
+		goto err;
+	}
+
+	asm volatile("" ::: "memory");
+
+	fsync(fd_shared);
+	fsync(fd_private);
+
+	sync();
+	asm volatile("" ::: "memory");
+
+	while (1) {
+		printf("ping: %d\n", getpid());
+//		write(pipefd[1], &buf, sizeof(buf));
+		sleep(6);
+	}
+
+err:
+	/* resources are released by kernel */
+	return 0;
+}
diff --git a/testee-threads.c b/testee-threads.c
new file mode 100644
index 000000000..cacc1eb9a
--- /dev/null
+++ b/testee-threads.c
@@ -0,0 +1,74 @@
+/*
+ * A simple testee program with threads
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static int counter;
+
+/*
+ * Shared body of both worker threads: request one small anonymous private
+ * mapping and then increment the global counter under the mutex forever,
+ * sleeping @delay seconds between increments. Never returns.
+ *
+ * The mapping is never touched again; presumably it only exists so that
+ * each thread contributes an extra VMA for the checkpoint tool to handle.
+ */
+static void counter_loop(unsigned int delay)
+{
+	void *map_unreadable = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	(void)map_unreadable;
+
+	while (1) {
+		pthread_mutex_lock(&mtx);
+		counter++;
+		pthread_mutex_unlock(&mtx);
+		sleep(delay);
+	}
+}
+
+/* Thread entry point: bumps the counter every 2 seconds. @arg is unused. */
+static void *f1(void *arg)
+{
+	(void)arg;
+	counter_loop(2);
+
+	return NULL;
+}
+
+/* Thread entry point: bumps the counter every 3 seconds. @arg is unused. */
+static void *f2(void *arg)
+{
+	(void)arg;
+	counter_loop(3);
+
+	return NULL;
+}
+
+/*
+ * Print the program name and pid, start both worker threads and wait for
+ * them. Exits with status 1 if either thread could not be created.
+ */
+int main(int argc, char *argv[])
+{
+	pthread_t workers[2];
+	int failed;
+
+	printf("%s pid %d\n", argv[0], getpid());
+
+	/* Both threads are always attempted; the results are OR-ed together. */
+	failed  = pthread_create(&workers[0], NULL, &f1, NULL);
+	failed |= pthread_create(&workers[1], NULL, &f2, NULL);
+	if (failed)
+		exit(1);
+
+	pthread_join(workers[0], NULL);
+	pthread_join(workers[1], NULL);
+
+	exit(0);
+}
diff --git a/testee-unlinked.c b/testee-unlinked.c
new file mode 100644
index 000000000..7287f52c8
--- /dev/null
+++ b/testee-unlinked.c
@@ -0,0 +1,92 @@
+/*
+ * A simple testee program
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#include <sched.h>
+
+/*
+ * Testee with mappings of unlinked files: create two 1025-byte files, map
+ * the first 1024 bytes of each (one MAP_SHARED, one MAP_PRIVATE), fill the
+ * mappings with a marker pattern, unlink both files while the mappings are
+ * still alive and then sleep forever.
+ *
+ * Always returns 0; setup failures are reported on stderr.
+ */
+int main(int argc, char *argv[])
+{
+	int fd_shared, fd_private;
+	void *mmap_shared, *mmap_private;
+	const char sep[] = "----------";
+	int i;
+
+	printf("%s pid %d\n", argv[0], getpid());
+
+	fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
+	if (fd_shared < 0) {
+		perror("Can't open fd_shared file");
+		goto err;
+	}
+
+	fd_private = open("testee-private.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
+	if (fd_private < 0) {
+		perror("Can't open fd_private file");
+		goto err;
+	}
+
+	if (lseek(fd_shared, 1024, SEEK_SET) == -1 ||
+	    lseek(fd_private, 1024, SEEK_SET) == -1) {
+		perror("Can't lseek");
+		goto err;
+	}
+
+	/*
+	 * One byte at offset 1024 extends each file past the range mapped
+	 * below; without it, touching the mappings would fault.
+	 */
+	if (write(fd_shared, "", 1) != 1 ||
+	    write(fd_private, "", 1) != 1) {
+		perror("Can't write");
+		goto err;
+	}
+
+	mmap_shared	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0);
+	mmap_private	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0);
+
+	if (mmap_shared == MAP_FAILED ||
+	    mmap_private == MAP_FAILED) {
+
+		perror("mmap failed");
+		goto err;
+	}
+
+	strcpy((char *)mmap_shared, sep);
+	strcpy((char *)mmap_private, sep);
+
+	for (i = 64; i < 128; i++) {
+		((char *)mmap_shared)[i] = 0 + i;
+		((char *)mmap_private)[i] = 64 + i;
+	}
+
+	fsync(fd_shared);
+	fsync(fd_private);
+
+	/*
+	 * NOTE(review): only fd_shared is closed; fd_private is synced a
+	 * second time and stays open. If close(fd_private) was intended,
+	 * confirm before changing it: keeping one descriptor open may be
+	 * part of the test scenario.
+	 */
+	close(fd_shared);
+	fsync(fd_private);
+
+	unlink("testee-shared.img");
+	unlink("testee-private.img");
+
+	/* Write the pattern again, now that the backing files have no name. */
+	for (i = 64; i < 128; i++) {
+		((char *)mmap_shared)[i] = 0 + i;
+		((char *)mmap_private)[i] = 64 + i;
+	}
+
+	msync(mmap_shared, 1024, MS_SYNC);
+	msync(mmap_private, 1024, MS_SYNC);
+
+	while (1)
+		sleep(1);
+
+err:
+	/* resources are released by kernel */
+	return 0;
+}
diff --git a/testee.c b/testee.c
new file mode 100644
index 000000000..b65fdbbbe
--- /dev/null
+++ b/testee.c
@@ -0,0 +1,231 @@
+/*
+ * A simple testee program
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <string.h>
+
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#include <sched.h>
+
+/*
+ * Body of the cloned child: announce the pid, create one private
+ * grows-down mapping and one shared anonymous mapping, then sleep forever.
+ * Returns -1 if either mapping fails. @arg is unused.
+ */
+static int do_child(void *arg)
+{
+	const int rw = PROT_READ | PROT_WRITE;
+	void *region;
+
+	printf("do_child pid: %d\n", getpid());
+
+	region = mmap(0, 4 * 4096, rw, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0);
+	if (region == MAP_FAILED)
+		return -1;
+
+	region = mmap(NULL, 1024, rw, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (region == MAP_FAILED)
+		return -1;
+
+	for (;;)
+		sleep(6);
+
+	return 0;
+}
+
+/*
+ * Start two do_child() clones, each on its own freshly mapped 16K stack
+ * and preceded by a new shared anonymous mapping: the first clone shares
+ * only filesystem information (CLONE_FS), the second also shares the
+ * descriptor table and the address space (CLONE_FILES | CLONE_VM).
+ *
+ * Returns the result of the second clone() call, or -1 if a mapping
+ * fails. A failing first clone() is reported but does not stop the second.
+ */
+static int run_clone(void)
+{
+	const size_t stack_size = 4 * 4096;
+	pid_t pid = 0;
+	int ret = 0;
+	void *stack, *mmap_anon;
+
+	stack = mmap(0, stack_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0);
+	if (stack == MAP_FAILED)
+		return -1;
+
+	mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (mmap_anon == MAP_FAILED)
+		return -1;
+
+	/* clone() takes the topmost address of the child's stack. */
+	stack = (char *)stack + stack_size;
+
+	/*
+	 * NOTE(review): &pid is passed in the child-tid slot but no
+	 * CLONE_CHILD_SETTID flag is given, so pid presumably stays 0 in the
+	 * messages below -- confirm against clone(2).
+	 */
+	ret = clone(do_child, stack, CLONE_FS, NULL, NULL, NULL, &pid);
+	if (ret < 0)
+		perror("Failed clone");
+
+	printf("run_clone: %d stack: %p mmap_anon: %p ret %d\n",
+	       pid, stack, mmap_anon, ret);
+
+	/*
+	 * The second child needs a stack of its own. Re-checking and bumping
+	 * the old pointer, as was done before, handed it an address beyond
+	 * the first mapping.
+	 */
+	stack = mmap(0, stack_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0);
+	if (stack == MAP_FAILED)
+		return -1;
+
+	mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (mmap_anon == MAP_FAILED)
+		return -1;
+
+	stack = (char *)stack + stack_size;
+
+	ret = clone(do_child, stack, CLONE_FS | CLONE_FILES | CLONE_VM, NULL, NULL, NULL, &pid);
+	if (ret < 0)
+		perror("Failed clone");
+
+	printf("run_clone: %d stack: %p mmap_anon: %p ret %d\n",
+	       pid, stack, mmap_anon, ret);
+
+	return ret;
+}
+
+int main(int argc, char *argv[])
+{ /*
+ * Testee for the checkpoint tool: sets up two file-backed and two
+ * anonymous mappings, tries to open its own /proc/<pid>/map_files entry
+ * in several modes (after switching uid to argv[1] when an argument is
+ * given), then forks a three-process tree whose members print "ping"
+ * forever. Returns 0 on every exit path.
+ */
+// int pipefd[2];
+ int fd_shared, fd_private;
+ const char data_mark[] = "This is a data_mark marker";
+ void *mmap_shared, *mmap_private, *mmap_anon, *map_unreadable;
+ const char sep[] = "----------";
+ pid_t pid, child;
+ char suided_path[128];
+ int i;
+
+ (void)data_mark;
+
+ printf("%s pid %d\n", argv[0], getpid());
+
+// if (pipe(pipefd)) {
+// perror("Can't create pipe");
+// goto err;
+// }
+
+ fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
+ if (fd_shared < 0) {
+ perror("Can't open fd_shared file");
+ goto err;
+ }
+
+ fd_private = open("testee-private.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
+ if (fd_private < 0) {
+ perror("Can't open fd_private file");
+ goto err;
+ }
+
+ if (lseek(fd_shared, 1024, SEEK_SET) == -1 ||
+ lseek(fd_private, 1024, SEEK_SET) == -1) {
+ perror("Can't llsek");
+ goto err;
+ }
+
+ write(fd_shared, "", 1); /* one byte at offset 1024: the file becomes 1025 bytes long */
+ write(fd_private, "", 1);
+
+ mmap_shared = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0);
+ mmap_private = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0);
+ mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ map_unreadable = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (mmap_shared == MAP_FAILED ||
+ mmap_private == MAP_FAILED ||
+ mmap_anon == MAP_FAILED ||
+ map_unreadable == MAP_FAILED) {
+
+ perror("mmap failed");
+ goto err;
+ }
+
+ snprintf(suided_path, sizeof(suided_path),
+ "/proc/%d/map_files/%lx-%lx",
+ getpid(), (long)mmap_shared,
+ (long)mmap_shared + 0x1000); /* names the shared mapping as <start>-<start + 0x1000>; assumes the kernel reports it as one 4K area */
+
+ strcpy((char *)mmap_shared, sep);
+ strcpy((char *)mmap_private, sep);
+ strcpy((char *)mmap_anon, sep);
+ strcpy((char *)map_unreadable, sep);
+
+ for (i = 64; i < 128; i++) {
+ ((char *)mmap_shared)[i] = 0 + i;
+ ((char *)mmap_private)[i] = 64 + i;
+ ((char *)mmap_anon)[i] = 128 + i;
+ ((char *)map_unreadable)[i] = 190 + i;
+ }
+
+ if (mprotect(map_unreadable, 1024, PROT_NONE)) { /* the data written above is inaccessible from here on */
+ perror("mprotect failed");
+ goto err;
+ }
+
+ asm volatile("" ::: "memory");
+
+ fsync(fd_shared);
+ fsync(fd_private);
+
+ close(fd_shared); /* the mapping itself stays; the variable is reused for the map_files probes below */
+
+ if (argc > 1) {
+
+ printf("my-uid: %d\n", getuid());
+ setuid(atoi(argv[1])); /* result is not checked; the uid printed next shows the outcome */
+ printf("my-uid: %d\n", getuid());
+ }
+
+ fd_shared = open(suided_path, O_RDWR, 0600);
+ printf("fd_shared for O_RDWR: %d\n", fd_shared);
+ if (fd_shared >= 0) {
+ write(fd_shared, "aaaa", sizeof("aaaa")); /* sizeof counts the terminating NUL: 5 bytes */
+ close(fd_shared);
+ }
+
+ fd_shared = open(suided_path, O_TRUNC, 0600);
+ printf("fd_shared for O_TRUNC: %d\n", fd_shared);
+ if (fd_shared >= 0) {
+ printf("tunc: %d\n", ftruncate(fd_shared, 512));
+ close(fd_shared);
+ }
+
+ fd_shared = open(suided_path, O_RDONLY, 0600);
+ printf("fd_shared for O_RDONLY: %d\n", fd_shared);
+ if (fd_shared >= 0)
+ close(fd_shared);
+
+ sync();
+ asm volatile("" ::: "memory");
+
+ pid = fork();
+ if (pid == -1)
+ goto err;
+
+ if (pid == 0) {
+ long buf; /* unused: only the commented-out pipe code refers to it */
+ child = fork();
+ if (child == -1)
+ goto err;
+ if (child == 0) {
+ printf("first child pid: %d\n", getpid());
+// while (read(pipefd[0], &buf, sizeof(buf)) > 0)
+// sleep(3);
+ while (1) {
+ printf("ping: %d\n", getpid());
+ sleep(8);
+ }
+ } else {
+ printf("first parent pid: %d\n", getpid());
+// run_clone();
+ while (1) {
+ printf("ping: %d\n", getpid());
+ sleep(9);
+ }
+ }
+ } else {
+ long buf = 0xdeadbeef; /* unused: only the commented-out pipe code refers to it */
+ while (1) {
+ printf("ping: %d\n", getpid());
+// write(pipefd[1], &buf, sizeof(buf));
+ sleep(10);
+ }
+ }
+
+err:
+ /* resources are released by kernel */
+ return 0;
+}
diff --git a/util.c b/util.c
new file mode 100644
index 000000000..243d10743
--- /dev/null
+++ b/util.c
@@ -0,0 +1,412 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <limits.h>
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <dirent.h>
+
+#include <fcntl.h>
+
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+
+#include "compiler.h"
+#include "types.h"
+#include "list.h"
+#include "util.h"
+
+#include "crtools.h"
+
+static char big_buffer[PATH_MAX];
+
+/* printf()-style output helper: formats its arguments onto stdout. */
+void printk(const char *format, ...)
+{
+	va_list ap;
+
+	va_start(ap, format);
+	vprintf(format, ap);
+	va_end(ap);
+}
+
+/*
+ * Hex-dump @bytes bytes of tracee @pid's memory starting at @addr, one
+ * long-sized word at a time, printing the bytes of every word from the
+ * highest address down to the lowest.
+ *
+ * @bytes must be a multiple of sizeof(long). Returns 0 on success, -1 for
+ * a misaligned size and -2 when a PTRACE_PEEKDATA request fails (words
+ * read before the failure have already been printed).
+ */
+int ptrace_show_area_r(pid_t pid, void *addr, long bytes)
+{
+	unsigned long *a = addr;
+	unsigned long w, i;
+
+	if (bytes & (sizeof(long) - 1))
+		return -1;
+
+	for (w = 0; w < bytes / sizeof(long); w++) {
+		unsigned char *c;
+		long v;
+
+		/*
+		 * -1 is also a legal peeked value, so errno is the only
+		 * reliable failure indicator and has to be cleared first.
+		 * The old test compared against -1U, which never matches
+		 * a 64-bit -1.
+		 */
+		errno = 0;
+		v = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
+		if (v == -1 && errno)
+			return -2;
+
+		c = (unsigned char *)&v;
+		for (i = sizeof(v); i > 0; i--)
+			printk("%02x ", c[i - 1]);
+		printk(" ");
+	}
+	printk("\n");
+	return 0;
+}
+
+/*
+ * Hex-dump @bytes bytes of tracee @pid's memory starting at @addr on one
+ * line, prefixed with the address; the bytes of every long-sized word are
+ * printed in memory order.
+ *
+ * @bytes must be a multiple of sizeof(long). Returns 0 on success, -1 for
+ * a misaligned size and -2 when a PTRACE_PEEKDATA request fails (output
+ * produced before the failure is left unterminated).
+ */
+int ptrace_show_area(pid_t pid, void *addr, long bytes)
+{
+	unsigned long *a = addr;
+	unsigned long w, i;
+
+	if (bytes & (sizeof(long) - 1))
+		return -1;
+
+	printk("%016lx: ", (unsigned long)addr);
+	for (w = 0; w < bytes / sizeof(long); w++) {
+		unsigned char *c;
+		long v;
+
+		/*
+		 * -1 is also a legal peeked value, so errno is the only
+		 * reliable failure indicator and has to be cleared first.
+		 * The old test compared against -1U, which never matches
+		 * a 64-bit -1.
+		 */
+		errno = 0;
+		v = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
+		if (v == -1 && errno)
+			return -2;
+
+		c = (unsigned char *)&v;
+		for (i = 0; i < sizeof(v); i++)
+			printk("%02x ", c[i]);
+		printk(" ");
+	}
+	printk("\n");
+	return 0;
+}
+
+/*
+ * Copy @bytes bytes from address @addr in tracee @pid into the local
+ * buffer @dst, one long-sized word per PTRACE_PEEKDATA request.
+ *
+ * @bytes must be a multiple of sizeof(long). Returns 0 on success, -1 for
+ * a misaligned size and -2 when a request fails; in that case the words
+ * before the failing one have been stored in @dst.
+ */
+int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes)
+{
+	unsigned long *d = dst, *a = addr;
+	unsigned long w;
+
+	if (bytes & (sizeof(long) - 1))
+		return -1;
+
+	for (w = 0; w < bytes / sizeof(long); w++) {
+		long v;
+
+		/*
+		 * -1 is also a legal peeked value, so errno is the only
+		 * reliable failure indicator and has to be cleared first.
+		 * The old test compared against -1U, which never matches
+		 * a 64-bit -1.
+		 */
+		errno = 0;
+		v = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
+		if (v == -1 && errno)
+			return -2;
+
+		d[w] = v;
+	}
+	return 0;
+}
+
+/*
+ * Write @bytes bytes from the local buffer @src to address @addr in
+ * tracee @pid, one long-sized word per PTRACE_POKEDATA request.
+ *
+ * @bytes must be a multiple of sizeof(long). Returns 0 on success, -1 for
+ * a misaligned size and -2 as soon as a request fails.
+ */
+int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes)
+{
+	unsigned long *from = src, *to = addr;
+	unsigned long idx;
+
+	if (bytes & (sizeof(long) - 1))
+		return -1;
+
+	for (idx = 0; idx < bytes / sizeof(long); idx++)
+		if (ptrace(PTRACE_POKEDATA, pid, to + idx, from[idx]))
+			return -2;
+
+	return 0;
+}
+
+void printk_registers(user_regs_struct_t *regs)
+{ /* Print all 27 fields of @regs as right-aligned hex, three per line, followed by an empty line. */
+ printk("ip : %16lx cs : %16lx ds : %16lx\n"
+ "es : %16lx fs : %16lx gs : %16lx\n"
+ "sp : %16lx ss : %16lx flags : %16lx\n"
+ "ax : %16lx cx : %16lx dx : %16lx\n"
+ "si : %16lx di : %16lx bp : %16lx\n"
+ "bx : %16lx r8 : %16lx r9 : %16lx\n"
+ "r10 : %16lx r11 : %16lx r12 : %16lx\n"
+ "r13 : %16lx r14 : %16lx r15 : %16lx\n"
+ "orig_ax: %16lx fs_base: %16lx gs_base: %16lx\n\n",
+ regs->ip, regs->cs, regs->ds,
+ regs->es, regs->fs, regs->gs,
+ regs->sp, regs->ss, regs->flags,
+ regs->ax, regs->cx, regs->dx,
+ regs->si, regs->di, regs->bp,
+ regs->bx, regs->r8, regs->r9,
+ regs->r10, regs->r11, regs->r12,
+ regs->r13, regs->r14, regs->r15,
+ regs->orig_ax, regs->fs_base, regs->gs_base);
+}
+
+void printk_siginfo(siginfo_t *siginfo)
+{ /* Print the signal number, errno value and signal code of @siginfo on one line. */
+ printk("si_signo %d si_errno %d si_code %d\n",
+ siginfo->si_signo, siginfo->si_errno, siginfo->si_code);
+}
+
+void printk_vma(struct vma_area *vma_area)
+{ /* Print a one-line summary of @vma_area; does nothing when it is NULL. */
+ if (!vma_area)
+ return;
+
+ printk("s: %16lx e: %16lx l: %4liK p: %4x f: %4x fd: %4d pid: %4d dev:%02x:%02x:%04lx vf: %s st: %s spc: %s\n",
+ vma_area->vma.start, vma_area->vma.end,
+ (vma_area->vma.end - vma_area->vma.start) >> 10, /* "l:" is the area length in KiB */
+ vma_area->vma.prot,
+ vma_area->vma.flags,
+ vma_area->vma.fd,
+ vma_area->vma.pid,
+ vma_area->vma.dev_maj,
+ vma_area->vma.dev_min,
+ vma_area->vma.ino,
+ vma_area->vm_file_fd < 0 ? "n" : "y", /* "vf:" tells whether a map_files descriptor is held */
+ !vma_area->vma.status ? "--" : /* "st:" is the first match of file-private, file-shared, anon-shared, anon-private */
+ ((vma_area->vma.status & VMA_FILE_PRIVATE) ? "FP" :
+ ((vma_area->vma.status & VMA_FILE_SHARED) ? "FS" :
+ ((vma_area->vma.status & VMA_ANON_SHARED) ? "AS" :
+ ((vma_area->vma.status & VMA_ANON_PRIVATE) ? "AP" : "--")))),
+ !vma_area->vma.status ? "--" : /* "spc:" names a special area, or "n" for none (heap is not reported here) */
+ ((vma_area->vma.status & VMA_AREA_STACK) ? "stack" :
+ ((vma_area->vma.status & VMA_AREA_VSYSCALL) ? "vsyscall" :
+ ((vma_area->vma.status & VMA_AREA_VDSO) ? "vdso" : "n"))));
+}
+
+int unseize_task(pid_t pid)
+{ /* Detach from tracee @pid; returns the ptrace(PTRACE_DETACH) result unchanged. */
+ return ptrace(PTRACE_DETACH, pid, NULL, NULL);
+}
+
+int seize_task(pid_t pid)
+{ /*
+ * Attach to @pid with PTRACE_SEIZE, interrupt it, wait until it reports
+ * a ptrace event stop and then enable PTRACE_O_TRACEEXIT.
+ *
+ * Returns the final value of ret: -10 when wait4() does not report @pid,
+ * -20 when the task is not stopped, -30 when the stop is not an event
+ * stop. NOTE(review): jerr_rc() is defined outside this file; it
+ * presumably stores the call result in its second argument and jumps to
+ * the label given as third argument on failure -- confirm there, since
+ * the success return value depends on it.
+ */
+ siginfo_t si;
+ int status;
+ int ret = 0;
+
+ jerr_rc(ptrace(PTRACE_SEIZE, pid, NULL,
+ (void *)(unsigned long)PTRACE_SEIZE_DEVEL), ret, err);
+ jerr_rc(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), ret, err);
+
+ ret = -10;
+ if (wait4(pid, &status, __WALL, NULL) != pid)
+ goto err;
+
+ ret = -20;
+ if (!WIFSTOPPED(status))
+ goto err;
+
+ jerr_rc(ptrace(PTRACE_GETSIGINFO, pid, NULL, &si), ret, err_cont);
+
+ ret = -30;
+ if ((si.si_code >> 8) != PTRACE_EVENT_STOP)
+ goto err_cont;
+
+ jerr_rc(ptrace(PTRACE_SETOPTIONS, pid, NULL,
+ (void *)(unsigned long)PTRACE_O_TRACEEXIT), ret, err_cont);
+
+err: /* also reached by falling through on success */
+ return ret;
+
+err_cont: /* failures after the task has stopped: resume it before returning */
+ continue_task(pid);
+ goto err;
+}
+
+/*
+ * Make @new_fd refer to the file currently open as @old_fd and close
+ * @old_fd afterwards. Nothing is done when both numbers are equal.
+ *
+ * Returns @new_fd on success or the negative dup2() result on failure, in
+ * which case @old_fd is left open.
+ */
+int reopen_fd_as(int new_fd, int old_fd)
+{
+	int rc;
+
+	if (old_fd == new_fd)
+		return new_fd;
+
+	rc = dup2(old_fd, new_fd);
+	if (rc < 0)
+		return rc;
+
+	close(old_fd);
+	return new_fd;
+}
+
+/*
+ * Parse /proc/<pid>/maps and append one vma_area per line to
+ * @vma_area_list.
+ *
+ * For every area the address range, page offset, device, inode and
+ * protection/sharing bits are taken from the maps line. When
+ * /proc/<pid>/map_files can be opened, the matching entry is opened as
+ * well and kept in vm_file_fd; it decides whether the area is classified
+ * as file-backed, anonymous shared or anonymous private.
+ *
+ * Returns 0 on success and -1 on any failure. Areas queued before a
+ * failure stay on the list for the caller to release.
+ */
+int parse_maps(pid_t pid, struct list_head *vma_area_list)
+{
+	struct vma_area *vma_area = NULL;
+	u64 start, end, pgoff;
+	char map_files_path[64];
+	char maps_path[64];
+	unsigned long ino;
+	char r,w,x,s;
+	int dev_maj, dev_min;
+	int ret = -1;
+
+	DIR *map_files_dir = NULL;
+	FILE *maps = NULL;
+
+	snprintf(maps_path, sizeof(maps_path), "/proc/%d/maps", pid);
+	maps = fopen(maps_path, "r");
+	if (!maps) {
+		pr_perror("Can't open: %s\n", maps_path);
+		goto err;
+	}
+
+	snprintf(map_files_path, sizeof(map_files_path),
+		 "/proc/%d/map_files", pid);
+
+	/*
+	 * A missing map_files directory is not fatal: it is either a
+	 * kernel problem or simply a kernel without this interface.
+	 */
+	map_files_dir = opendir(map_files_path);
+	if (!map_files_dir)
+		pr_warning("Crap, can't open %s, old kernel?\n",
+			   map_files_path);
+
+	while (fgets(big_buffer, sizeof(big_buffer), maps)) {
+		char vma_file_path[16+16+2];
+		struct stat st_buf;
+		int nr_fields;
+
+		/*
+		 * The field count gets its own variable: storing it in ret
+		 * made every later "goto err" return 10 instead of -1.
+		 */
+		nr_fields = sscanf(big_buffer, "%lx-%lx %c%c%c%c %lx %02x:%02x %lu",
+				   &start, &end, &r, &w, &x, &s, &pgoff, &dev_maj,
+				   &dev_min, &ino);
+		if (nr_fields != 10) {
+			pr_error("Can't parse: %s", big_buffer);
+			goto err;
+		}
+
+		vma_area = alloc_vma_area();
+		if (!vma_area)
+			goto err;
+
+		/* Figure out if it's file mapping */
+		snprintf(vma_file_path, sizeof(vma_file_path), "%lx-%lx", start, end);
+
+		/*
+		 * NOTE(review): without map_files_dir, vm_file_fd keeps
+		 * whatever alloc_vma_area() put there; the tests below
+		 * assume that is a negative value -- confirm.
+		 */
+		if (map_files_dir) {
+			/*
+			 * Note that we "open" it in dumper process space
+			 * so later we might refer to it via /proc/self/fd/vm_file_fd
+			 * if needed.
+			 */
+			vma_area->vm_file_fd = openat(dirfd(map_files_dir),
+						      vma_file_path, O_RDONLY);
+			if (vma_area->vm_file_fd < 0) {
+				if (errno != ENOENT) {
+					pr_perror("Failed opening %s/%s\n",
+						  map_files_path,
+						  vma_file_path);
+					goto err;
+				}
+			}
+		}
+
+		vma_area->vma.pid	= pid;
+		vma_area->vma.start	= start;
+		vma_area->vma.end	= end;
+		vma_area->vma.pgoff	= pgoff;
+
+		vma_area->vma.ino	= ino;
+		vma_area->vma.dev_maj	= dev_maj;
+		vma_area->vma.dev_min	= dev_min;
+
+		vma_area->vma.prot	= PROT_NONE;
+
+		if (r == 'r')
+			vma_area->vma.prot |= PROT_READ;
+		if (w == 'w')
+			vma_area->vma.prot |= PROT_WRITE;
+		if (x == 'x')
+			vma_area->vma.prot |= PROT_EXEC;
+
+		if (s == 's')
+			vma_area->vma.flags = MAP_SHARED;
+		else if (s == 'p')
+			vma_area->vma.flags = MAP_PRIVATE;
+
+		vma_area->vma.status = 0;
+
+		if (strstr(big_buffer, "[stack]"))
+			vma_area->vma.status |= VMA_AREA_REGULAR | VMA_AREA_STACK;
+		else if (strstr(big_buffer, "[vsyscall]"))
+			vma_area->vma.status |= VMA_AREA_VSYSCALL;
+		else if (strstr(big_buffer, "[vdso]"))
+			vma_area->vma.status |= VMA_AREA_VDSO;
+		else if (strstr(big_buffer, "[heap]"))
+			vma_area->vma.status |= VMA_AREA_REGULAR | VMA_AREA_HEAP;
+		else
+			vma_area->vma.status = VMA_AREA_REGULAR;
+
+		/*
+		 * Some mapping hints for restore, we save this on
+		 * disk and restore might need to analyze it.
+		 */
+		if (vma_area->vm_file_fd >= 0) {
+
+			if (fstat(vma_area->vm_file_fd, &st_buf) < 0) {
+				pr_perror("Failed fstat on %s%s\n",
+					  map_files_path,
+					  vma_file_path);
+				goto err;
+			}
+			if (!S_ISREG(st_buf.st_mode)) {
+				pr_error("Can't handle non-regular "
+					 "mapping on %s%s\n",
+					 map_files_path,
+					 vma_file_path);
+				goto err;
+			}
+
+			/*
+			 * /dev/zero stands for anon-shared mapping
+			 * otherwise it's some file mapping.
+			 */
+			if (MAJOR(st_buf.st_dev) == 0) {
+				if (!(vma_area->vma.flags & MAP_SHARED))
+					goto err_bogus_mapping;
+				vma_area->vma.status |= VMA_ANON_SHARED;
+				vma_area->shmid = st_buf.st_ino;
+			} else {
+				if (vma_area->vma.flags & MAP_PRIVATE)
+					vma_area->vma.status |= VMA_FILE_PRIVATE;
+				else
+					vma_area->vma.status |= VMA_FILE_SHARED;
+			}
+		} else {
+			/*
+			 * No file but mapping -- anonymous one.
+			 */
+			if (vma_area->vma.flags & MAP_SHARED)
+				goto err_bogus_mapping;
+			else
+				vma_area->vma.status |= VMA_ANON_PRIVATE;
+		}
+
+		list_add_tail(&vma_area->list, vma_area_list);
+
+		/*
+		 * The list owns the area now; the error path must not free
+		 * it when a later line fails.
+		 */
+		vma_area = NULL;
+	}
+
+	ret = 0;
+
+err:
+	if (maps)
+		fclose(maps);
+
+	if (map_files_dir)
+		closedir(map_files_dir);
+
+	/* Only an area that never reached the list is still set here. */
+	xfree(vma_area);
+	return ret;
+
+err_bogus_mapping:
+	pr_error("Bogus mapping %lx-%lx\n",
+		 vma_area->vma.start,
+		 vma_area->vma.end);
+	goto err;
+}
diff --git a/xemul/0003-Image-dumping-via-proc-file.patch b/xemul/0003-Image-dumping-via-proc-file.patch
new file mode 100644
index 000000000..8e40b874c
--- /dev/null
+++ b/xemul/0003-Image-dumping-via-proc-file.patch
@@ -0,0 +1,562 @@
+From f7e9d28188e7e2fd0f13f2696f29f20d784cb8fd Mon Sep 17 00:00:00 2001
+From: root <root@ovzept.sw.ru>
+Date: Fri, 3 Jun 2011 18:16:10 +0400
+Subject: [PATCH] Image dumping via proc file
+
+---
+ fs/proc/Kconfig | 8
+ fs/proc/Makefile | 1
+ fs/proc/base.c | 3
+ fs/proc/img_dump.c | 397 +++++++++++++++++++++++++++++++++++++++++++++
+ include/linux/binfmt_img.h | 87 +++++++++
+ include/linux/proc_fs.h | 2
+ 6 files changed, 498 insertions(+)
+ create mode 100644 fs/proc/img_dump.c
+ create mode 100644 include/linux/binfmt_img.h
+
+Index: linux-2.6.git/fs/proc/Kconfig
+===================================================================
+--- linux-2.6.git.orig/fs/proc/Kconfig
++++ linux-2.6.git/fs/proc/Kconfig
+@@ -67,3 +67,11 @@ config PROC_PAGE_MONITOR
+ /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
+ /proc/kpagecount, and /proc/kpageflags. Disabling these
+ interfaces will reduce the size of the kernel by approximately 4kb.
++
++config PROC_IMG
++ default y
++ depends on PROC_FS
++ bool "Enable /proc/<pid>/dump file"
++ help
++ Say Y here if you want to be able to produce checkpoint-restore images
++ for tasks via proc
+Index: linux-2.6.git/fs/proc/Makefile
+===================================================================
+--- linux-2.6.git.orig/fs/proc/Makefile
++++ linux-2.6.git/fs/proc/Makefile
+@@ -28,3 +28,4 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o
+ proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
+ proc-$(CONFIG_PRINTK) += kmsg.o
+ proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
++proc-$(CONFIG_PROC_IMG) += img_dump.o
+Index: linux-2.6.git/fs/proc/base.c
+===================================================================
+--- linux-2.6.git.orig/fs/proc/base.c
++++ linux-2.6.git/fs/proc/base.c
+@@ -2983,6 +2983,9 @@ static const struct pid_entry tgid_base_
+ #endif
+ INF("cmdline", S_IRUGO, proc_pid_cmdline),
+ ONE("stat", S_IRUGO, proc_tgid_stat),
++#ifdef CONFIG_PROC_IMG
++ REG("dump", S_IRUSR|S_IWUSR, proc_pid_dump_operations),
++#endif
+ ONE("statm", S_IRUGO, proc_pid_statm),
+ REG("maps", S_IRUGO, proc_maps_operations),
+ #ifdef CONFIG_NUMA
+Index: linux-2.6.git/fs/proc/img_dump.c
+===================================================================
+--- /dev/null
++++ linux-2.6.git/fs/proc/img_dump.c
+@@ -0,0 +1,397 @@
++#include <linux/proc_fs.h>
++#include <linux/sched.h>
++#include <linux/uaccess.h>
++#include <linux/binfmt_img.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/highmem.h>
++#include <linux/types.h>
++#include "internal.h"
++
++/*
++ * Copy the tail of the @len byte kernel object at @buf, starting at offset
++ * @pos, to the user buffer @ubuf, limited to @size bytes.
++ *
++ * Returns the number of bytes copied, -EINVAL when @pos lies outside the
++ * object and -EFAULT when the user buffer cannot be written.
++ */
++static int img_dump_buffer(char __user *ubuf, size_t size, void *buf, int len, int pos)
++{
++	/*
++	 * A negative remainder would be promoted to a huge size_t in the
++	 * comparison below and end up copying memory past the object.
++	 */
++	if (pos < 0 || pos > len)
++		return -EINVAL;
++
++	len -= pos;
++	if (len > size)
++		len = size;
++
++	if (copy_to_user(ubuf, buf + pos, len))
++		return -EFAULT;
++
++	return len;
++}
++
++static int img_dump_header(char __user *buf, size_t size, int pos)
++{ /* Emit the image header (magic and format version) from offset @pos on; returns what img_dump_buffer() returns. */
++ struct binfmt_img_header hdr;
++
++ hdr.magic = BINFMT_IMG_MAGIC;
++ hdr.version = BINFMT_IMG_VERS_0;
++
++ return img_dump_buffer(buf, size, &hdr, sizeof(hdr), pos);
++}
++
++static __u16 encode_segment(unsigned short seg)
++{ /*
++ * Translate the segment selector @seg into its CKPT_X86_SEG_* image
++ * encoding. Any non-null selector must have both low bits set, otherwise
++ * the kernel BUG()s; a selector matching none of the cases below BUG()s
++ * as well.
++ */
++ if (seg == 0)
++ return CKPT_X86_SEG_NULL;
++ BUG_ON((seg & 3) != 3);
++
++ if (seg == __USER_CS)
++ return CKPT_X86_SEG_USER64_CS;
++ if (seg == __USER_DS)
++ return CKPT_X86_SEG_USER64_DS;
++#ifdef CONFIG_COMPAT
++ if (seg == __USER32_CS)
++ return CKPT_X86_SEG_USER32_CS;
++ if (seg == __USER32_DS)
++ return CKPT_X86_SEG_USER32_DS;
++#endif
++
++ if (seg & 4) /* bit 2 set: LDT selector, encoded together with its table index */
++ return CKPT_X86_SEG_LDT | (seg >> 3);
++
++ seg >>= 3; /* from here on seg is the GDT index, not the selector */
++ if (GDT_ENTRY_TLS_MIN <= seg && seg <= GDT_ENTRY_TLS_MAX)
++ return CKPT_X86_SEG_TLS | (seg - GDT_ENTRY_TLS_MIN);
++
++ printk(KERN_ERR "c/r: (decode) bad segment %#hx\n", seg); /* NOTE(review): prints the shifted index and says "decode" although this is the encoder */
++ BUG();
++}
++
++static __u64 encode_tls(struct desc_struct *d)
++{ /* Pack descriptor @d into one 64-bit value: word a shifted into the upper half, word b added into the lower half. */
++ return ((__u64)d->a << 32) + d->b;
++}
++
++/*
++ * Emit the register record of task @p from offset @pos on: general
++ * purpose registers, encoded segment selectors, the TLS descriptors and
++ * the fs/gs values.
++ *
++ * Returns what img_dump_buffer() returns.
++ */
++static int img_dump_regs(struct task_struct *p, char __user *buf, size_t size, int pos)
++{
++	struct binfmt_regs_image regi;
++	struct pt_regs *regs;
++	int i;
++
++	/*
++	 * The structure ends in six __u16 fields, so the compiler pads it;
++	 * clear everything so no kernel stack bytes reach user space.
++	 */
++	memset(&regi, 0, sizeof(regi));
++
++	regs = task_pt_regs(p);
++
++	regi.r15 = regs->r15;
++	regi.r14 = regs->r14;
++	regi.r13 = regs->r13;
++	regi.r12 = regs->r12;
++	regi.r11 = regs->r11;
++	regi.r10 = regs->r10;
++	regi.r9 = regs->r9;
++	regi.r8 = regs->r8;
++	regi.ax = regs->ax;
++	regi.orig_ax = regs->orig_ax;
++	regi.bx = regs->bx;
++	regi.cx = regs->cx;
++	regi.dx = regs->dx;
++	regi.si = regs->si;
++	regi.di = regs->di;
++	regi.ip = regs->ip;
++	regi.flags = regs->flags;
++	regi.bp = regs->bp;
++	regi.sp = regs->sp;
++
++	/* segments */
++	regi.gsindex = encode_segment(p->thread.gsindex);
++	regi.fsindex = encode_segment(p->thread.fsindex);
++	regi.cs = encode_segment(regs->cs);
++	regi.ss = encode_segment(regs->ss);
++	regi.ds = encode_segment(p->thread.ds);
++	regi.es = encode_segment(p->thread.es);
++
++	BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES);
++	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
++		regi.tls[i] = encode_tls(&p->thread.tls_array[i]);
++
++	/* The gs/fs values are only recorded when no selector index is in use. */
++	regi.gs = p->thread.gsindex ? 0 : p->thread.gs;
++	regi.fs = p->thread.fsindex ? 0 : p->thread.fs;
++
++	return img_dump_buffer(buf, size, &regi, sizeof(regi), pos);
++}
++
++/*
++ * Emit the memory-descriptor record of @mm from offset @pos on: flags and
++ * the code/data/brk/stack/arg/env boundaries. exe_fd is always written
++ * as 0.
++ *
++ * Returns what img_dump_buffer() returns.
++ */
++static int img_dump_mm(struct mm_struct *mm, char __user *buf, size_t size, int pos)
++{
++	struct binfmt_mm_image mmi;
++
++	/*
++	 * The trailing __u32 leaves padding behind it; clear everything so
++	 * no kernel stack bytes reach user space.
++	 */
++	memset(&mmi, 0, sizeof(mmi));
++
++	mmi.flags = mm->flags;
++	mmi.def_flags = mm->def_flags;
++	mmi.start_code = mm->start_code;
++	mmi.end_code = mm->end_code;
++	mmi.start_data = mm->start_data;
++	mmi.end_data = mm->end_data;
++	mmi.start_brk = mm->start_brk;
++	mmi.brk = mm->brk;
++	mmi.start_stack = mm->start_stack;
++	mmi.arg_start = mm->arg_start;
++	mmi.arg_end = mm->arg_end;
++	mmi.env_start = mm->env_start;
++	mmi.env_end = mm->env_end;
++	mmi.exe_fd = 0;
++
++	return img_dump_buffer(buf, size, &mmi, sizeof(mmi), pos);
++}
++
++/*
++ * Emit one VMA record from offset @pos on. A NULL @vma produces the
++ * all-zero record that terminates the VMA list.
++ *
++ * Returns what img_dump_buffer() returns.
++ */
++static int img_dump_vma(struct vm_area_struct *vma, char __user *buf, size_t size, int pos)
++{
++	struct binfmt_vma_image vmai;
++
++	/*
++	 * Clear the record in every case: the pad field used to go out
++	 * uninitialised for real VMAs, leaking kernel stack bytes.
++	 */
++	memset(&vmai, 0, sizeof(vmai));
++
++	if (vma != NULL) {
++		printk("Dumping vma %016lx-%016lx %p/%p\n", vma->vm_start, vma->vm_end, vma, vma->vm_mm);
++
++		if (vma->vm_flags & VM_READ)
++			vmai.prot |= PROT_READ;
++		if (vma->vm_flags & VM_WRITE)
++			vmai.prot |= PROT_WRITE;
++		if (vma->vm_flags & VM_EXEC)
++			vmai.prot |= PROT_EXEC;
++
++		if (vma->vm_file == NULL)
++			vmai.flags |= MAP_ANONYMOUS;
++		if (vma->vm_flags & VM_MAYSHARE)
++			vmai.flags |= MAP_SHARED;
++		else
++			vmai.flags |= MAP_PRIVATE;
++
++		vmai.start = vma->vm_start;
++		vmai.end = vma->vm_end;
++		vmai.pgoff = vma->vm_pgoff;
++	}
++
++	return img_dump_buffer(buf, size, &vmai, sizeof(vmai), pos);
++}
++
++static int img_dump_page(unsigned long addr, void *data, char __user *buf, size_t size, int pos)
++{ /*
++ * Emit one page record from offset @pos on: a binfmt_page_image header
++ * carrying @addr, followed by PAGE_SIZE bytes taken from @data. Returns
++ * the total number of bytes copied or a negative img_dump_buffer() error.
++ */
++ struct binfmt_page_image pgi;
++ int ret = 0, tmp;
++
++ pgi.vaddr = addr;
++
++ if (pos < sizeof(pgi)) { /* the read position is still inside the header */
++ tmp = img_dump_buffer(buf, size, &pgi, sizeof(pgi), pos);
++ if (tmp < 0)
++ return tmp;
++
++ ret = tmp;
++ if (size <= ret) /* user buffer is full; the page data waits for the next read */
++ return ret;
++
++ buf += ret;
++ size -= ret;
++ pos = 0;
++ } else
++ pos -= sizeof(pgi); /* make pos relative to the start of the page data */
++
++ tmp = img_dump_buffer(buf, size, data, PAGE_SIZE, pos);
++ if (tmp < 0)
++ return tmp;
++
++ return ret + tmp;
++}
++
++/*
++ * A VMA counts as private when it has no backing file or when VM_SHARED
++ * is not set. Returns 1 for private areas and 0 otherwise.
++ */
++static inline int is_private_vma(struct vm_area_struct *vma)
++{
++	return vma->vm_file == NULL || !(vma->vm_flags & VM_SHARED);
++}
++
++/*
++ * Produce the part of task @p's image that starts at file offset *@ppos,
++ * writing at most @size bytes to @buf.
++ *
++ * The image is laid out as: header, registers, mm record, one record per
++ * VMA followed by an all-zero VMA record, then for every present page of
++ * each private VMA a page record, and finally an all-zero page header.
++ * Sections that lie entirely before the requested offset are skipped by
++ * position bookkeeping only.
++ *
++ * Returns the number of bytes produced and advances *@ppos, or a negative
++ * error code (-EACCES when the task has no mm).
++ *
++ * NOTE(review): copy_to_user() runs with mmap_sem held for reading; a
++ * fault on the destination buffer may need the same semaphore -- confirm
++ * this cannot deadlock.
++ */
++static ssize_t do_produce_dump(struct task_struct *p, char __user *buf,
++		size_t size, loff_t *ppos)
++{
++	size_t img_pos = 0, img_ppos;
++	size_t produced = 0;
++	int len;
++	loff_t pos = *ppos;
++	struct mm_struct *mm;
++	struct vm_area_struct *vma;
++
++/* Account for the @len bytes just copied to the user buffer. */
++#define move_pos() do {		\
++		buf += len;	\
++		produced += len;\
++		size -= len;	\
++		pos += len;	\
++	} while (0)
++
++/* Enter the next section: img_ppos is its start, img_pos its end. */
++#define seek_pos(__size) do {		\
++		img_ppos = img_pos;	\
++		img_pos += (__size);	\
++	} while (0)
++
++	/* header */
++	seek_pos(sizeof(struct binfmt_img_header));
++	if (pos < img_pos) {
++		len = img_dump_header(buf, size, pos - img_ppos);
++		if (len < 0)
++			goto err;
++
++		move_pos();
++		if (size == 0)
++			goto out;
++	}
++
++	/* registers */
++	seek_pos(sizeof(struct binfmt_regs_image));
++	if (pos < img_pos) {
++		len = img_dump_regs(p, buf, size, pos - img_ppos);
++		if (len < 0)
++			goto err;
++
++		move_pos();
++		if (size == 0)
++			goto out;
++	}
++
++	/* memory */
++	mm = get_task_mm(p);
++	if (mm == NULL)
++		return -EACCES;
++
++	down_read(&mm->mmap_sem);
++
++	seek_pos(sizeof(struct binfmt_mm_image));
++	if (pos < img_pos) {
++		len = img_dump_mm(mm, buf, size, pos - img_ppos);
++		if (len < 0)
++			goto err_mm;
++
++		move_pos();
++		if (size == 0)
++			goto out_mm;
++	}
++
++	/* One record per VMA; the final pass with vma == NULL emits the terminator. */
++	vma = mm->mmap;
++	while (1) {
++		seek_pos(sizeof(struct binfmt_vma_image));
++		if (pos < img_pos) {
++			len = img_dump_vma(vma, buf, size, pos - img_ppos);
++			if (len < 0)
++				goto err_mm;
++
++			move_pos();
++			if (size == 0)
++				goto out_mm;
++		}
++
++		if (vma == NULL)
++			break;
++
++		vma = vma->vm_next;
++	}
++
++	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
++		/* slow and stupid */
++		unsigned long addr;
++		struct page *page;
++		void *pg_data;
++
++		if (!is_private_vma(vma))
++			continue;
++
++		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
++			page = follow_page(vma, addr, FOLL_FORCE | FOLL_DUMP | FOLL_GET);
++			if (page == NULL)
++				continue;
++			if (IS_ERR(page)) /* huh? */
++				continue;
++
++			seek_pos(sizeof(struct binfmt_page_image) + PAGE_SIZE);
++			if (pos < img_pos) {
++				pg_data = kmap(page);
++				len = img_dump_page(addr, pg_data, buf, size, pos - img_ppos);
++				kunmap(page);
++
++				if (len < 0) {
++					put_page(page);
++					goto err_mm;
++				}
++
++				move_pos();
++				if (size == 0) {
++					put_page(page);
++					goto out_mm;
++				}
++			}
++
++			/* drop the reference taken by FOLL_GET */
++			put_page(page);
++		}
++	}
++
++	/* An all-zero page header terminates the page list. */
++	seek_pos(sizeof(struct binfmt_page_image));
++	if (pos < img_pos) {
++		struct binfmt_page_image zero;
++
++		memset(&zero, 0, sizeof(zero));
++		len = img_dump_buffer(buf, size, &zero, sizeof(zero), pos - img_ppos);
++		/*
++		 * mmap_sem and the mm reference are still held here, so the
++		 * failure path has to go through err_mm, not err.
++		 */
++		if (len < 0)
++			goto err_mm;
++
++		move_pos();
++	}
++
++out_mm:
++	up_read(&mm->mmap_sem);
++	mmput(mm);
++out:
++	*ppos = pos;
++	return produced;
++
++err_mm:
++	up_read(&mm->mmap_sem);
++	mmput(mm);
++err:
++	return len;
++}
++
++/*
++ * read() handler of /proc/<pid>/dump: produces the image of the task
++ * behind @file, which must be stopped.
++ *
++ * Returns the number of bytes produced, -ESRCH when the task is gone,
++ * -EINVAL when it is not in TASK_STOPPED state, or the error reported by
++ * do_produce_dump().
++ */
++static ssize_t img_dump_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
++{
++	struct task_struct *p;
++	ssize_t ret;
++
++	p = get_proc_task(file->f_dentry->d_inode);
++	if (p == NULL)
++		return -ESRCH;
++
++	if (p->state & TASK_STOPPED)
++		ret = do_produce_dump(p, buf, size, ppos);
++	else
++		ret = -EINVAL;
++
++	/*
++	 * Drop the reference taken by get_proc_task() on every path; it
++	 * used to be released only when the task was not stopped.
++	 */
++	put_task_struct(p);
++	return ret;
++}
++
++static int img_dump_open(struct inode *inode, struct file *filp)
++{ /* open() handler: nothing to set up, always succeeds. */
++ return 0;
++}
++
++static int img_dump_release(struct inode *inode, struct file *filp)
++{ /* release() handler: nothing to tear down, always succeeds. */
++ return 0;
++}
++
++const struct file_operations proc_pid_dump_operations = { /* file operations behind the "dump" entry: open, read and release only, no write handler */
++ .open = img_dump_open,
++ .read = img_dump_read,
++ .release = img_dump_release,
++};
+Index: linux-2.6.git/include/linux/binfmt_img.h
+===================================================================
+--- /dev/null
++++ linux-2.6.git/include/linux/binfmt_img.h
+@@ -0,0 +1,87 @@
++#ifndef __BINFMT_IMG_H__
++#define __BINFMT_IMG_H__
++
++#include <linux/types.h>
++
++struct binfmt_img_header { /* first record of an image */
++ __u32 magic; /* BINFMT_IMG_MAGIC */
++ __u32 version; /* BINFMT_IMG_VERS_0 */
++};
++
++#define CKPT_TLS_ENTRIES 3
++
++struct binfmt_regs_image { /* register record: general purpose registers, fs/gs values, TLS descriptors and encoded selectors */
++ __u64 r15;
++ __u64 r14;
++ __u64 r13;
++ __u64 r12;
++ __u64 r11;
++ __u64 r10;
++ __u64 r9;
++ __u64 r8;
++ __u64 ax;
++ __u64 orig_ax;
++ __u64 bx;
++ __u64 cx;
++ __u64 dx;
++ __u64 si;
++ __u64 di;
++ __u64 ip;
++ __u64 flags;
++ __u64 bp;
++ __u64 sp;
++
++ __u64 gs;
++ __u64 fs;
++ __u64 tls[CKPT_TLS_ENTRIES];
++ __u16 gsindex; /* this and the following selectors hold CKPT_X86_SEG_* encoded values */
++ __u16 fsindex;
++ __u16 cs;
++ __u16 ss;
++ __u16 ds;
++ __u16 es; /* six __u16 after __u64 members: the compiler pads the end of the structure */
++};
++
++#define CKPT_X86_SEG_NULL 0
++#define CKPT_X86_SEG_USER32_CS 1
++#define CKPT_X86_SEG_USER32_DS 2
++#define CKPT_X86_SEG_USER64_CS 3
++#define CKPT_X86_SEG_USER64_DS 4
++#define CKPT_X86_SEG_TLS 0x4000
++#define CKPT_X86_SEG_LDT 0x8000
++
++struct binfmt_mm_image { /* memory-descriptor record: flags plus the code/data/brk/stack/arg/env boundaries of the address space */
++ __u64 flags;
++ __u64 def_flags;
++ __u64 start_code;
++ __u64 end_code;
++ __u64 start_data;
++ __u64 end_data;
++ __u64 start_brk;
++ __u64 brk;
++ __u64 start_stack;
++ __u64 arg_start;
++ __u64 arg_end;
++ __u64 env_start;
++ __u64 env_end;
++ __u32 exe_fd; /* a lone __u32 after __u64 members: the compiler pads the end of the structure */
++};
++
++struct binfmt_vma_image { /* one record per VMA; an all-zero record terminates the list */
++ __u32 prot; /* PROT_* bits */
++ __u32 flags; /* MAP_* bits */
++ __u32 pad; /* filler that keeps the __u64 members aligned */
++ __u32 fd;
++ __u64 start;
++ __u64 end;
++ __u64 pgoff;
++};
++
++struct binfmt_page_image { /* header in front of each page's data; an all-zero header ends the page list */
++ __u64 vaddr; /* virtual address of the page that follows */
++};
++
++#define BINFMT_IMG_MAGIC 0xa75b8d43
++#define BINFMT_IMG_VERS_0 0x00000100
++
++#endif
+Index: linux-2.6.git/include/linux/proc_fs.h
+===================================================================
+--- linux-2.6.git.orig/include/linux/proc_fs.h
++++ linux-2.6.git/include/linux/proc_fs.h
+@@ -102,6 +102,8 @@ struct vmcore {
+
+ #ifdef CONFIG_PROC_FS
+
++extern const struct file_operations proc_pid_dump_operations;
++
+ extern void proc_root_init(void);
+
+ void proc_flush_task(struct task_struct *task);
diff --git a/xemul/0004-Images-execution-binfmt-handler.patch b/xemul/0004-Images-execution-binfmt-handler.patch
new file mode 100644
index 000000000..4e6c69e3b
--- /dev/null
+++ b/xemul/0004-Images-execution-binfmt-handler.patch
@@ -0,0 +1,371 @@
+From 0f8e07457aa91e9461665440ca258eb9f93bf2f9 Mon Sep 17 00:00:00 2001
+From: root <root@ovzept.sw.ru>
+Date: Fri, 3 Jun 2011 18:16:43 +0400
+Subject: [PATCH] Images execution binfmt handler
+
+---
+ fs/Kconfig.binfmt | 6 +
+ fs/Makefile | 1 +
+ fs/binfmt_img.c | 324 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 331 insertions(+), 0 deletions(-)
+ create mode 100644 fs/binfmt_img.c
+
+diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
+index 79e2ca7..0b2f48e 100644
+--- a/fs/Kconfig.binfmt
++++ b/fs/Kconfig.binfmt
+@@ -161,3 +161,9 @@ config BINFMT_MISC
+ You may say M here for module support and later load the module when
+ you have use for it; the module is called binfmt_misc. If you
+ don't know what to answer at this point, say Y.
++
++config BINFMT_IMG
++ tristate "Kernel support for IMG binaries"
++ depends on X86
++ help
++ Say M/Y here to enable support for checkpoint-restore images execution
+diff --git a/fs/Makefile b/fs/Makefile
+index fb68c2b..8221719 100644
+--- a/fs/Makefile
++++ b/fs/Makefile
+@@ -33,6 +33,7 @@ obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o
+ obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
+ obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
+ obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
++obj-$(CONFIG_BINFMT_IMG) += binfmt_img.o
+
+ # binfmt_script is always there
+ obj-y += binfmt_script.o
+diff --git a/fs/binfmt_img.c b/fs/binfmt_img.c
+new file mode 100644
+index 0000000..9b09797
+--- /dev/null
++++ b/fs/binfmt_img.c
+@@ -0,0 +1,324 @@
++#include <linux/binfmt_img.h>
++#include <linux/module.h>
++#include <linux/binfmts.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/highmem.h>
++#include <asm/tlbflush.h>
++#include <asm/desc.h>
++
++/*
++ * The binary handler to save and restore a single task state
++ */
++
++/*
++ * Validate the image header stored at @buf.
++ *
++ * Returns the size of the header so the caller can step over it,
++ * -ENOEXEC when the magic does not match and -EINVAL when the version
++ * is not BINFMT_IMG_VERS_0.
++ */
++static int img_check_header(void *buf)
++{
++	const struct binfmt_img_header *h = buf;
++
++	if (h->magic != BINFMT_IMG_MAGIC)
++		return -ENOEXEC;
++
++	return h->version == BINFMT_IMG_VERS_0 ? (int)sizeof(*h) : -EINVAL;
++}
++
++/*
++ * Translate a CKPT_X86_SEG_* encoded value from the image into a
++ * segment selector.
++ *
++ * CKPT_X86_SEG_NULL gives 0, the USER64/USER32 constants give the
++ * matching __USER* selector (the 32-bit ones only with CONFIG_COMPAT),
++ * and values carrying the TLS or LDT bit are turned into a GDT TLS or
++ * LDT selector built from the remaining bits.
++ *
++ * NOTE(review): any other value ends in BUG(). The callers visible in
++ * this file pass values read from the image file, so a malformed image
++ * reaches BUG() - confirm this is acceptable.
++ */
++static unsigned short decode_segment(__u16 seg)
++{
++ if (seg == CKPT_X86_SEG_NULL)
++ return 0;
++
++ if (seg == CKPT_X86_SEG_USER64_CS)
++ return __USER_CS;
++ if (seg == CKPT_X86_SEG_USER64_DS)
++ return __USER_DS;
++#ifdef CONFIG_COMPAT
++ if (seg == CKPT_X86_SEG_USER32_CS)
++ return __USER32_CS;
++ if (seg == CKPT_X86_SEG_USER32_DS)
++ return __USER32_DS;
++#endif
++
++ if (seg & CKPT_X86_SEG_TLS) {
++ /* Remaining bits are an index relative to GDT_ENTRY_TLS_MIN. */
++ seg &= ~CKPT_X86_SEG_TLS;
++ return ((GDT_ENTRY_TLS_MIN + seg) << 3) | 3;
++ }
++ if (seg & CKPT_X86_SEG_LDT) {
++ /* Remaining bits are the entry index; low bits 7 are OR-ed in. */
++ seg &= ~CKPT_X86_SEG_LDT;
++ return (seg << 3) | 7;
++ }
++ BUG();
++}
++
++/*
++ * Split the 64-bit image value @val into the two 32-bit words of @d:
++ * the upper half goes to ->a, the lower half to ->b.
++ */
++static void decode_tls(struct desc_struct *d, __u64 val)
++{
++	const unsigned int upper = (unsigned int)(val >> 32);
++	const unsigned int lower = (unsigned int)(val & 0xFFFFFFFF);
++
++	d->a = upper;
++	d->b = lower;
++}
++
++/*
++ * Read the binfmt_regs_image record at @off of the image file and apply
++ * it to @regs and to current->thread.
++ *
++ * Returns sizeof(struct binfmt_regs_image) on success, -EIO when the
++ * record cannot be read in full and -EFAULT when the fs selector read
++ * back after loading differs from the requested one.
++ */
++static int img_restore_regs(struct linux_binprm *bprm, loff_t off, struct pt_regs *regs)
++{
++ int ret, i;
++ struct binfmt_regs_image regi;
++ struct thread_struct *th = &current->thread;
++ unsigned short seg;
++
++ ret = kernel_read(bprm->file, off, (char *)&regi, sizeof(regi));
++ if (ret != sizeof(regi))
++ return -EIO;
++
++ /* Plain copies of the saved general purpose state. */
++ regs->r15 = regi.r15;
++ regs->r14 = regi.r14;
++ regs->r13 = regi.r13;
++ regs->r12 = regi.r12;
++ regs->r11 = regi.r11;
++ regs->r10 = regi.r10;
++ regs->r9 = regi.r9;
++ regs->r8 = regi.r8;
++ regs->ax = regi.ax;
++ regs->orig_ax = regi.orig_ax;
++ regs->bx = regi.bx;
++ regs->cx = regi.cx;
++ regs->dx = regi.dx;
++ regs->si = regi.si;
++ regs->di = regi.di;
++ regs->ip = regi.ip;
++ regs->flags = regi.flags;
++ regs->bp = regi.bp;
++ regs->sp = regi.sp;
++
++ /*
++ * Segment values are stored encoded, see decode_segment().
++ * NOTE(review): they come straight from the image, and
++ * decode_segment() hits BUG() on values it does not know.
++ */
++ regs->cs = decode_segment(regi.cs);
++ regs->ss = decode_segment(regi.ss);
++
++ th->usersp = regi.sp;
++ th->ds = decode_segment(regi.ds);
++ th->es = decode_segment(regi.es);
++ th->fsindex = decode_segment(regi.fsindex);
++ th->gsindex = decode_segment(regi.gsindex);
++
++ th->fs = regi.fs;
++ th->gs = regi.gs;
++
++ /* The image format and the kernel must agree on the TLS slot count. */
++ BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES);
++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
++ decode_tls(&th->tls_array[i], regi.tls[i]);
++
++ load_TLS(th, smp_processor_id());
++
++ /* Load fs and read it back to check that the selector was kept. */
++ seg = th->fsindex;
++ loadsegment(fs, seg);
++ savesegment(fs, seg);
++ if (seg != th->fsindex) {
++ printk("ERROR saving fs selector want %x, has %x\n",
++ (unsigned int)th->fsindex, (unsigned int)seg);
++ return -EFAULT;
++ }
++
++ /* The base MSRs are only written when the image has a non-zero value. */
++ if (th->fs)
++ wrmsrl(MSR_FS_BASE, th->fs);
++ load_gs_index(th->gsindex);
++ if (th->gs)
++ wrmsrl(MSR_KERNEL_GS_BASE, th->gs);
++
++ return sizeof(regi);
++}
++
++/*
++ * Restore the mm_struct bookkeeping fields from the binfmt_mm_image
++ * record at @off and, when the image names one, switch mm->exe_file to
++ * the file behind descriptor exe_fd of the current task.
++ *
++ * Returns the number of bytes consumed, -EIO on a short read or -EBADF
++ * when exe_fd does not refer to an open file.
++ */
++static int img_restore_mm(struct linux_binprm *bprm, loff_t off)
++{
++	int ret;
++	struct binfmt_mm_image mmi;
++	struct mm_struct *mm = current->mm;
++
++	ret = kernel_read(bprm->file, off, (char *)&mmi, sizeof(mmi));
++	if (ret != sizeof(mmi))
++		return -EIO;
++
++	mm->flags = mmi.flags;
++	mm->def_flags = mmi.def_flags;
++	mm->start_code = mmi.start_code;
++	mm->end_code = mmi.end_code;
++	mm->start_data = mmi.start_data;
++	mm->end_data = mmi.end_data;
++	mm->start_brk = mmi.start_brk;
++	mm->brk = mmi.brk;
++	mm->start_stack = mmi.start_stack;
++	mm->arg_start = mmi.arg_start;
++	mm->arg_end = mmi.arg_end;
++	mm->env_start = mmi.env_start;
++	mm->env_end = mmi.env_end;
++
++	/* exe_fd == 0 means the image carries no executable reference. */
++	if (mmi.exe_fd != 0) {
++		struct file *f;
++
++		f = fget(mmi.exe_fd);
++		if (f == NULL)
++			return -EBADF;
++
++		/* An mm may have no exe_file at all; fput() must not see NULL. */
++		if (mm->exe_file)
++			fput(mm->exe_file);
++		mm->exe_file = f;
++	}
++
++	return sizeof(mmi);
++}
++
++/*
++ * Replace the address space of the current task with the mappings
++ * described by the binfmt_vma_image records starting at @off.
++ *
++ * Everything in [0, TASK_SIZE) is unmapped first; records are then read
++ * until one with start == 0 and end == 0 is found.
++ *
++ * Returns the number of bytes consumed (terminating record included),
++ * -EIO on a short read, -EBADF when a record names a descriptor that is
++ * not open and -ENXIO when a mapping cannot be placed at its address.
++ *
++ * NOTE(review): no mmap_sem locking is visible around do_munmap() and
++ * do_mmap_pgoff() here - confirm what the callers are expected to hold.
++ */
++static int img_restore_vmas(struct linux_binprm *bprm, loff_t off)
++{
++ int ret;
++ struct mm_struct *mm = current->mm;
++ int len = 0;
++
++ do_munmap(mm, 0, TASK_SIZE);
++
++ while (1) {
++ struct binfmt_vma_image vmai;
++ unsigned long addr;
++ struct file *file = NULL;
++
++ /* Counted up front so the terminating record is included. */
++ len += sizeof(vmai);
++
++ ret = kernel_read(bprm->file, off, (char *)&vmai, sizeof(vmai));
++ if (ret != sizeof(vmai))
++ return -EIO;
++
++ if (vmai.start == 0 && vmai.end == 0)
++ break;
++
++ /* fd 0 stands for "no file": such mappings become anonymous. */
++ if (vmai.fd != 0) {
++ file = fget(vmai.fd);
++ if (file == NULL)
++ return -EBADF;
++ } else
++ vmai.flags |= MAP_ANONYMOUS;
++
++ /* The mapping that contains mm->start_stack is made growable. */
++ if (vmai.start <= mm->start_stack && vmai.end >= mm->start_stack)
++ vmai.flags |= MAP_GROWSDOWN;
++
++ addr = do_mmap_pgoff(file, vmai.start, vmai.end - vmai.start,
++ vmai.prot, vmai.flags | MAP_FIXED, vmai.pgoff);
++
++ /* Drop our reference and close the descriptor named by the image. */
++ if (vmai.fd) {
++ fput(file);
++ do_close(vmai.fd);
++ }
++
++ if ((long)addr < 0 || (addr != vmai.start))
++ return -ENXIO;
++
++ off += sizeof(vmai);
++ }
++
++ return len;
++}
++
++/*
++ * Fill the restored mappings with page contents. The image holds a
++ * sequence of binfmt_page_image headers, each followed by PAGE_SIZE
++ * bytes of data, terminated by a header whose vaddr is 0.
++ *
++ * Returns the number of bytes consumed (terminating header included),
++ * -EIO when a header cannot be read, -ESRCH when find_vma() returns
++ * NULL for the address and -EFAULT when the page cannot be obtained or
++ * its data cannot be read in full.
++ *
++ * NOTE(review): only a NULL result of find_vma() is rejected; whether
++ * the returned vma actually contains vaddr is not checked here.
++ * NOTE(review): get_user_pages() is called without visible mmap_sem
++ * locking - confirm the expected calling context.
++ */
++static int img_restore_pages(struct linux_binprm *bprm, loff_t off)
++{
++ int ret;
++ struct mm_struct *mm = current->mm;
++ int len = 0;
++
++ while (1) {
++ struct binfmt_page_image pgi;
++ struct vm_area_struct *vma;
++ struct page *page;
++ void *pg_data;
++
++ ret = kernel_read(bprm->file, off, (char *)&pgi, sizeof(pgi));
++ if (ret != sizeof(pgi))
++ return -EIO;
++
++ len += sizeof(pgi);
++ if (pgi.vaddr == 0)
++ break;
++
++ vma = find_vma(mm, pgi.vaddr);
++ if (vma == NULL)
++ return -ESRCH;
++
++ /* Request exactly one page at vaddr, with the write and force arguments set. */
++ ret = get_user_pages(current, current->mm, (unsigned long)pgi.vaddr,
++ 1, 1, 1, &page, NULL);
++ if (ret != 1)
++ return -EFAULT;
++
++ /* Read the page data from the image straight into the mapped page. */
++ pg_data = kmap(page);
++ ret = kernel_read(bprm->file, off + sizeof(pgi), pg_data, PAGE_SIZE);
++ kunmap(page);
++ put_page(page);
++
++ if (ret != PAGE_SIZE)
++ return -EFAULT;
++
++ len += PAGE_SIZE;
++ off += sizeof(pgi) + PAGE_SIZE;
++ }
++
++ return len;
++}
++
++/*
++ * Restore the memory part of the image: the mm record, then the vma
++ * records, then the page records, each starting where the previous one
++ * ended.
++ *
++ * Returns the file offset just past the last consumed byte, or the
++ * negative error of the first stage that failed.
++ */
++static int img_restore_mem(struct linux_binprm *bprm, loff_t off)
++{
++	loff_t pos = off;
++	int n;
++
++	n = img_restore_mm(bprm, pos);
++	if (n >= 0) {
++		pos += n;
++		n = img_restore_vmas(bprm, pos);
++	}
++	if (n >= 0) {
++		pos += n;
++		n = img_restore_pages(bprm, pos);
++	}
++	if (n < 0)
++		return n;
++
++	return pos + n;
++}
++
++/*
++ * load_binary handler of the image format: checks the header found in
++ * bprm->buf, then restores registers and memory from the image file.
++ *
++ * Returns 0 on success or the negative error of the failing step.
++ */
++static int img_load_binary(struct linux_binprm * bprm, struct pt_regs * regs)
++{
++	loff_t pos;
++	int rc;
++
++	rc = img_check_header(bprm->buf);
++	if (rc < 0)
++		return rc;
++	pos = rc;
++
++	rc = img_restore_regs(bprm, pos, regs);
++	if (rc < 0)
++		return rc;
++	pos += rc;
++
++	rc = img_restore_mem(bprm, pos);
++	return rc < 0 ? rc : 0;
++}
++
++/* Binary format descriptor; only a load_binary handler is provided. */
++static struct linux_binfmt img_binfmt = {
++ .module = THIS_MODULE,
++ .load_binary = img_load_binary,
++};
++
++/* Module entry point: registers the format and returns register_binfmt()'s result. */
++static __init int img_binfmt_init(void)
++{
++ return register_binfmt(&img_binfmt);
++}
++
++/* Module exit point: unregisters the format again. */
++static __exit void img_binfmt_exit(void)
++{
++ unregister_binfmt(&img_binfmt);
++}
++
++module_init(img_binfmt_init);
++module_exit(img_binfmt_exit);
++MODULE_LICENSE("GPL");
+--
+1.5.5.6
+
diff --git a/xemul/binfmt_img.h b/xemul/binfmt_img.h
new file mode 100644
index 000000000..8775d92ab
--- /dev/null
+++ b/xemul/binfmt_img.h
@@ -0,0 +1,96 @@
+#ifndef __BINFMT_IMG_H__
+#define __BINFMT_IMG_H__
+
+#include <linux/types.h>
+
+#define __packed __attribute__((packed))
+
+/*
+ * Leading record of an image file. Declared packed so the compiler adds
+ * no padding to the layout.
+ * presumably magic/version hold BINFMT_IMG_MAGIC / BINFMT_IMG_VERS_0 - verify against the writer
+ */
+struct binfmt_img_header {
+ __u32 magic;
+ __u32 version;
+ __u16 arch;
+ __u16 flags;
+} __packed;
+
+#define CKPT_TLS_ENTRIES 3
+
+/*
+ * Register state record. The named registers live in member r; the
+ * union with dummy[32] makes the record 32 * 8 bytes large regardless
+ * of how many bytes the named fields occupy.
+ */
+struct binfmt_regs_image {
+ union {
+ struct {
+ __u64 r15;
+ __u64 r14;
+ __u64 r13;
+ __u64 r12;
+ __u64 r11;
+ __u64 r10;
+ __u64 r9;
+ __u64 r8;
+ __u64 ax;
+ __u64 orig_ax;
+ __u64 bx;
+ __u64 cx;
+ __u64 dx;
+ __u64 si;
+ __u64 di;
+ __u64 ip;
+ __u64 flags;
+ __u64 bp;
+ __u64 sp;
+
+ __u64 gs;
+ __u64 fs;
+ /* One 64-bit value per TLS slot. */
+ __u64 tls[CKPT_TLS_ENTRIES];
+ /* 16-bit segment values; presumably CKPT_X86_SEG_* encoded - verify. */
+ __u16 gsindex;
+ __u16 fsindex;
+ __u16 cs;
+ __u16 ss;
+ __u16 ds;
+ __u16 es;
+ } r;
+ /* Size reservation only; no code in view reads it. */
+ __u64 dummy[32];
+ };
+} __packed;
+
+#define CKPT_X86_SEG_NULL 0
+#define CKPT_X86_SEG_USER32_CS 1
+#define CKPT_X86_SEG_USER32_DS 2
+#define CKPT_X86_SEG_USER64_CS 3
+#define CKPT_X86_SEG_USER64_DS 4
+#define CKPT_X86_SEG_TLS 0x4000
+#define CKPT_X86_SEG_LDT 0x8000
+
+/*
+ * Memory descriptor record (packed layout).
+ * NOTE(review): field names follow the kernel's mm_struct members;
+ * exe_fd is presumably a descriptor number of the executable - confirm.
+ */
+struct binfmt_mm_image {
+ __u64 flags;
+ __u64 def_flags;
+ __u64 start_code;
+ __u64 end_code;
+ __u64 start_data;
+ __u64 end_data;
+ __u64 start_brk;
+ __u64 brk;
+ __u64 start_stack;
+ __u64 arg_start;
+ __u64 arg_end;
+ __u64 env_start;
+ __u64 env_end;
+ __u32 exe_fd;
+} __packed;
+
+/*
+ * One mapping record (packed layout): protection and flags, a file
+ * descriptor number, the [start, end) address range and the file offset.
+ * NOTE(review): "[start, end)" and the meaning of fd are assumed from
+ * the field names - confirm against the code that fills this in.
+ */
+struct binfmt_vma_image {
+ __u32 prot;
+ __u32 flags;
+ __u32 pad;
+ __u32 fd;
+ __u64 start;
+ __u64 end;
+ __u64 pgoff;
+} __packed;
+
+/* Header of one dumped page: the virtual address the page belongs to. */
+struct binfmt_page_image {
+ __u64 vaddr;
+} __packed;
+
+#define BINFMT_IMG_MAGIC 0xa75b8d43
+#define BINFMT_IMG_VERS_0 0x00000100
+
+#endif
diff --git a/xemul/cr-dump.c b/xemul/cr-dump.c
new file mode 100644
index 000000000..01154e9f8
--- /dev/null
+++ b/xemul/cr-dump.c
@@ -0,0 +1,781 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <dirent.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <linux/kdev_t.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+
+#include <linux/types.h>
+#include "img_structs.h"
+
+static int fdinfo_img;
+static int pages_img;
+static int core_img;
+static int shmem_img;
+static int pipes_img;
+
+#define PIPEFS_MAGIC 0x50495045
+
+/*
+ * Create the image file "<kind>-<pid>.img" (it must not exist yet) and,
+ * when @has_magic is set, start it with the 4-byte @magic.
+ *
+ * Returns the open descriptor, or -1 after printing a diagnostic.
+ */
+static int create_img_file(const char *kind, int pid, int has_magic, __u32 magic)
+{
+	char name[64];
+	int fd;
+
+	snprintf(name, sizeof(name), "%s-%d.img", kind, pid);
+	fd = open(name, O_WRONLY | O_CREAT | O_EXCL, 0600);
+	if (fd < 0) {
+		fprintf(stderr, "Can't open %s: %s\n", kind, strerror(errno));
+		return -1;
+	}
+
+	if (has_magic && write(fd, &magic, sizeof(magic)) != (ssize_t)sizeof(magic)) {
+		fprintf(stderr, "Can't write %s magic: %s\n", kind, strerror(errno));
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Create the five per-task image files (fdinfo, pages, core, shmem and
+ * pipes) for @pid and store their descriptors in the file-scope
+ * *_img variables. Every file except the core one starts with its magic.
+ *
+ * Returns 0 on success and 1 on failure; on failure the files opened so
+ * far are closed again (they are not removed from disk).
+ */
+static int prep_img_files(int pid)
+{
+	fdinfo_img = create_img_file("fdinfo", pid, 1, FDINFO_MAGIC);
+	if (fdinfo_img < 0)
+		return 1;
+
+	pages_img = create_img_file("pages", pid, 1, PAGES_MAGIC);
+	if (pages_img < 0)
+		goto err_fdinfo;
+
+	/* No magic is written to the core image here. */
+	core_img = create_img_file("core", pid, 0, 0);
+	if (core_img < 0)
+		goto err_pages;
+
+	shmem_img = create_img_file("shmem", pid, 1, SHMEM_MAGIC);
+	if (shmem_img < 0)
+		goto err_core;
+
+	pipes_img = create_img_file("pipes", pid, 1, PIPES_MAGIC);
+	if (pipes_img < 0)
+		goto err_shmem;
+
+	return 0;
+
+err_shmem:
+	close(shmem_img);
+err_core:
+	close(core_img);
+err_pages:
+	close(pages_img);
+err_fdinfo:
+	close(fdinfo_img);
+	return 1;
+}
+
+/*
+ * Called on the failure paths of dump_one_task(). Currently a stub, so
+ * image files of a failed dump stay on disk.
+ */
+static void kill_imgfiles(int pid)
+{
+ /* FIXME: not implemented yet */
+}
+
+/* Send SIGSTOP to @pid; returns what kill() returned. */
+static int stop_task(int pid)
+{
+	int rc = kill(pid, SIGSTOP);
+
+	return rc;
+}
+
+/* Send SIGCONT to @pid; a failure is only reported, not returned. */
+static void continue_task(int pid)
+{
+	if (kill(pid, SIGCONT) == 0)
+		return;
+
+	perror("Can't cont task");
+}
+
+static char big_tmp_str[PATH_MAX];
+
+/*
+ * Read the position and the open flags of descriptor @fd of task @pid
+ * from /proc/<pid>/fdinfo/<fd> into @pos and @flags.
+ *
+ * Returns 0 on success and 1 when the file cannot be opened, read or
+ * parsed. Uses the shared big_tmp_str buffer as scratch space.
+ */
+static int read_fd_params(int pid, char *fd, unsigned long *pos, unsigned int *flags)
+{
+	char fd_str[128];
+	long long fpos;
+	ssize_t len;
+	int ifd;
+
+	snprintf(fd_str, sizeof(fd_str), "/proc/%d/fdinfo/%s", pid, fd);
+
+	printf("\tGetting fdinfo for fd %s\n", fd);
+	ifd = open(fd_str, O_RDONLY);
+	if (ifd < 0) {
+		perror("Can't open fdinfo");
+		return 1;
+	}
+
+	/* Leave room for the terminator: the buffer is reused between calls. */
+	len = read(ifd, big_tmp_str, sizeof(big_tmp_str) - 1);
+	if (len < 0) {
+		perror("Can't read fdinfo");
+		close(ifd);
+		return 1;
+	}
+	close(ifd);
+	big_tmp_str[len] = '\0';
+
+	/* %lli needs a long long target, so parse into one and copy out. */
+	if (sscanf(big_tmp_str, "pos:\t%lli\nflags:\t%o\n", &fpos, flags) != 2) {
+		fprintf(stderr, "Can't parse fdinfo for fd %s\n", fd);
+		return 1;
+	}
+
+	*pos = fpos;
+	return 0;
+}
+
+/*
+ * Write one fdinfo entry for the regular file open as local descriptor
+ * @lfd: the entry header followed by the path the descriptor points to
+ * (obtained via /proc/self/fd).
+ *
+ * @type and @fd_name end up in the type and addr fields, @pos and
+ * @flags are stored as given. When @lclose is set, @lfd is closed by
+ * this function on every path.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int dump_one_reg_file(int type, unsigned long fd_name, int lfd,
+		int lclose, unsigned long pos, unsigned int flags)
+{
+	char fd_str[128];
+	int len;
+	struct fdinfo_entry e;
+
+	snprintf(fd_str, sizeof(fd_str), "/proc/self/fd/%d", lfd);
+	len = readlink(fd_str, big_tmp_str, sizeof(big_tmp_str) - 1);
+	if (len < 0) {
+		perror("Can't readlink fd");
+		if (lclose)
+			close(lfd);
+		return 1;
+	}
+
+	big_tmp_str[len] = '\0';
+	printf("\tDumping path for %lx fd via self %d [%s]\n", fd_name, lfd, big_tmp_str);
+
+	if (lclose)
+		close(lfd);
+
+	/* Do not leak uninitialised stack bytes into the image. */
+	memset(&e, 0, sizeof(e));
+	e.type = type;
+	e.addr = fd_name;
+	e.len = len;
+	e.pos = pos;
+	e.flags = flags;
+
+	if (write(fdinfo_img, &e, sizeof(e)) != (ssize_t)sizeof(e) ||
+	    write(fdinfo_img, big_tmp_str, len) != len) {
+		perror("Can't write fdinfo entry");
+		return 1;
+	}
+
+	return 0;
+}
+
+#define MAX_PIPE_BUF_SIZE 1024 /* FIXME - this is not so */
+#define SPLICE_F_NONBLOCK 0x2
+
+/*
+ * Write the pipe entry @e and the data currently queued in the pipe
+ * open as @lfd to the pipes image.
+ *
+ * The data is duplicated with tee() into a temporary pipe, so the
+ * original pipe keeps its contents, and then spliced into the image.
+ * At most MAX_PIPE_BUF_SIZE bytes are picked up. e->bytes is set to the
+ * number of bytes that follow the entry.
+ *
+ * Returns 0 on success and 1 on failure; the temporary pipe is closed
+ * on every path.
+ */
+static int dump_pipe_and_data(int lfd, struct pipes_entry *e)
+{
+	int steal_pipe[2];
+	int ret, left;
+	int err = 1;
+
+	printf("\tDumping data from pipe %x\n", e->pipeid);
+	if (pipe(steal_pipe) < 0) {
+		perror("Can't create pipe for stealing data");
+		return 1;
+	}
+
+	ret = tee(lfd, steal_pipe[1], MAX_PIPE_BUF_SIZE, SPLICE_F_NONBLOCK);
+	if (ret < 0) {
+		if (errno != EAGAIN) {
+			perror("Can't pick pipe data");
+			goto out;
+		}
+
+		/* EAGAIN: nothing is queued right now, dump an empty pipe. */
+		ret = 0;
+	}
+
+	e->bytes = ret;
+	if (write(pipes_img, e, sizeof(*e)) != (ssize_t)sizeof(*e)) {
+		perror("Can't write pipe entry");
+		goto out;
+	}
+
+	/* splice() may move less than asked for, so loop until all is out. */
+	for (left = ret; left > 0; left -= ret) {
+		ret = splice(steal_pipe[0], NULL, pipes_img, NULL, left, 0);
+		if (ret <= 0) {
+			perror("Can't push pipe data");
+			goto out;
+		}
+	}
+
+	err = 0;
+out:
+	close(steal_pipe[0]);
+	close(steal_pipe[1]);
+	return err;
+}
+
+/*
+ * Dump descriptor @fd of the task, which is a pipe with id @id open
+ * locally as @lfd. A descriptor with O_WRONLY set gets an entry without
+ * data; otherwise the entry and the queued data are written by
+ * dump_pipe_and_data().
+ */
+static int dump_one_pipe(int fd, int lfd, unsigned int id, unsigned int flags)
+{
+	struct pipes_entry e;
+
+	printf("\tDumping pipe %d/%x flags %x\n", fd, id, flags);
+
+	e.fd = fd;
+	e.pipeid = id;
+	e.flags = flags;
+
+	if (!(flags & O_WRONLY))
+		return dump_pipe_and_data(lfd, &e);
+
+	e.bytes = 0;
+	write(pipes_img, &e, sizeof(e));
+	return 0;
+}
+
+/*
+ * Dump the descriptor called @fd_name found in the /proc/<pid>/fd
+ * directory open as @dir. Regular files and pipes are written to the
+ * images; descriptors 0-3 of any other type are skipped with a message;
+ * everything else is an error.
+ *
+ * Returns 0 on success (or skip) and 1 on failure. The local descriptor
+ * opened here is released on every path; for regular files that is done
+ * by dump_one_reg_file(), which is called with lclose set.
+ */
+static int dump_one_fd(int dir, char *fd_name, unsigned long pos, unsigned int flags)
+{
+	static const struct {
+		const char *name;
+		const char *what;
+	} skipped[] = {
+		{ "0", "stdin" },
+		{ "1", "stdout" },
+		{ "2", "stderr" },
+		{ "3", "tty" },
+	};
+	int fd, ret;
+	unsigned int i;
+	struct stat st_buf;
+	struct statfs stfs_buf;
+
+	printf("\tDumping fd %s\n", fd_name);
+	fd = openat(dir, fd_name, O_RDONLY);
+	if (fd == -1) {
+		printf("Tried to openat %d/%d %s\n", getpid(), dir, fd_name);
+		perror("Can't open fd");
+		return 1;
+	}
+
+	if (fstat(fd, &st_buf) < 0) {
+		perror("Can't stat one");
+		close(fd);
+		return 1;
+	}
+
+	if (S_ISREG(st_buf.st_mode))
+		return dump_one_reg_file(FDINFO_FD, atoi(fd_name), fd, 1, pos, flags);
+
+	if (S_ISFIFO(st_buf.st_mode)) {
+		if (fstatfs(fd, &stfs_buf) < 0) {
+			perror("Can't statfs one");
+			close(fd);
+			return 1;
+		}
+
+		if (stfs_buf.f_type == PIPEFS_MAGIC) {
+			ret = dump_one_pipe(atoi(fd_name), fd, st_buf.st_ino, flags);
+			close(fd);
+			return ret;
+		}
+	}
+
+	/* Nothing below looks at the descriptor any more. */
+	close(fd);
+
+	for (i = 0; i < sizeof(skipped) / sizeof(skipped[0]); i++) {
+		if (!strcmp(fd_name, skipped[i].name)) {
+			printf("\tSkipping %s\n", skipped[i].what);
+			return 0;
+		}
+	}
+
+	fprintf(stderr, "Can't dump file %s of that type [%x]\n", fd_name, st_buf.st_mode);
+	return 1;
+}
+
+/*
+ * Dump every open descriptor of task @pid by walking /proc/<pid>/fd.
+ *
+ * Returns 0 on success, -1 when the directory cannot be opened and 1
+ * when dumping one of the descriptors fails. The directory stream is
+ * closed on every path.
+ */
+static int dump_task_files(int pid)
+{
+	char pid_fd_dir[64];
+	DIR *fd_dir;
+	struct dirent *de;
+	unsigned long pos;
+	unsigned int flags;
+	int ret = 1;
+
+	printf("Dumping open files for %d\n", pid);
+
+	snprintf(pid_fd_dir, sizeof(pid_fd_dir), "/proc/%d/fd", pid);
+	fd_dir = opendir(pid_fd_dir);
+	if (fd_dir == NULL) {
+		perror("Can't open fd dir");
+		return -1;
+	}
+
+	while ((de = readdir(fd_dir)) != NULL) {
+		/* Skips "." and ".."; descriptor names are plain numbers. */
+		if (de->d_name[0] == '.')
+			continue;
+
+		if (read_fd_params(pid, de->d_name, &pos, &flags))
+			goto out;
+
+		if (dump_one_fd(dirfd(fd_dir), de->d_name, pos, flags))
+			goto out;
+	}
+
+	ret = 0;
+out:
+	closedir(fd_dir);
+	return ret;
+}
+
+#define PAGE_SIZE 4096
+#define PAGE_RSS 0x1
+
+/*
+ * Parse an unprefixed hexadecimal number (upper or lower case digits)
+ * at the start of @str. Parsing stops at the first character that is
+ * not a hex digit; when @end is not NULL it is set to point at that
+ * character. A string without leading hex digits yields 0.
+ */
+static unsigned long rawhex(char *str, char **end)
+{
+	unsigned long val = 0;
+	char *p;
+
+	for (p = str; ; p++) {
+		unsigned int digit;
+		char c = *p;
+
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'a' && c <= 'f')
+			digit = c - 'a' + 0xA;
+		else if (c >= 'A' && c <= 'F')
+			digit = c - 'A' + 0xA;
+		else
+			break;
+
+		val = (val << 4) + digit;
+	}
+
+	if (end)
+		*end = p;
+	return val;
+}
+
+/*
+ * Extract the length and the file offset of a mapping from one line of
+ * /proc/<pid>/maps ("start-end perms offset ...").
+ *
+ * @len receives end - start and @pgoff the offset field; both must be
+ * multiples of PAGE_SIZE. Any line that does not have the expected
+ * shape is treated as a bug: "BUG" is printed and the program exits.
+ */
+static void map_desc_parm(char *desc, unsigned long *pgoff, unsigned long *len)
+{
+	char *s;
+	unsigned long start, end;
+
+	start = rawhex(desc, &s);
+	if (*s != '-')
+		goto bug;
+
+	end = rawhex(s + 1, &s);
+	if (*s != ' ')
+		goto bug;
+
+	/* Step over the permissions field to reach the offset. */
+	s = strchr(s + 1, ' ');
+	if (s == NULL)
+		goto bug;
+
+	*pgoff = rawhex(s + 1, &s);
+	if (*s != ' ')
+		goto bug;
+
+	if (start > end)
+		goto bug;
+
+	*len = end - start;
+
+	if (*len % PAGE_SIZE)
+		goto bug;
+	if (*pgoff % PAGE_SIZE)
+		goto bug;
+
+	return;
+bug:
+	fprintf(stderr, "BUG\n");
+	exit(1);
+}
+
+/*
+ * Dump the resident pages of a file mapping to the pages image.
+ *
+ * The range [@pgoff, @pgoff + @len) of @lfd is mapped privately and
+ * read-only, mincore() tells which pages are resident, and each of them
+ * is written as its 8-byte virtual address (@start plus the offset of
+ * the page in the mapping) followed by PAGE_SIZE bytes of contents.
+ *
+ * Returns 0 on success and 1 on failure; the temporary mapping and the
+ * mincore vector are released on every path.
+ */
+static int dump_map_pages(int lfd, unsigned long start, unsigned long pgoff, unsigned long len)
+{
+	unsigned long nrpages, pfn;
+	unsigned char *mem, *mc;
+	int ret = 1;
+
+	printf("\t\tDumping pages start %lx len %lx off %lx\n", start, len, pgoff);
+	mem = mmap(NULL, len, PROT_READ, MAP_FILE | MAP_PRIVATE, lfd, pgoff);
+	if (mem == MAP_FAILED) {
+		perror("Can't map");
+		return 1;
+	}
+
+	nrpages = len / PAGE_SIZE;
+	mc = malloc(nrpages);
+	if (mc == NULL) {
+		fprintf(stderr, "Can't allocate mincore vector\n");
+		goto out_unmap;
+	}
+
+	if (mincore(mem, len, mc)) {
+		perror("Can't mincore mapping");
+		goto out_free;
+	}
+
+	for (pfn = 0; pfn < nrpages; pfn++) {
+		__u64 vaddr;
+
+		if (!(mc[pfn] & PAGE_RSS))
+			continue;
+
+		/* pfn is unsigned long so the offset cannot wrap at 4 GiB. */
+		vaddr = start + pfn * PAGE_SIZE;
+		if (write(pages_img, &vaddr, sizeof(vaddr)) != (ssize_t)sizeof(vaddr) ||
+		    write(pages_img, mem + pfn * PAGE_SIZE, PAGE_SIZE) != PAGE_SIZE) {
+			perror("Can't write page");
+			goto out_free;
+		}
+	}
+
+	ret = 0;
+out_free:
+	free(mc);
+out_unmap:
+	munmap(mem, len);
+	return ret;
+}
+
+/*
+ * Anonymous private mappings are not written by this tool; the mapping
+ * starting at @start is only reported. Always returns 0.
+ */
+static int dump_anon_private_map(char *start)
+{
+ printf("\tSkipping anon private mapping at %s\n", start);
+ return 0;
+}
+
+/*
+ * Dump an anonymous shared mapping: a shmem entry (address range plus
+ * the inode number from @st as id) goes to the shmem image and the
+ * resident pages go to the pages image.
+ *
+ * @_start is the start address as text, @mdesc the whole maps line and
+ * @lfd the descriptor of the mapping, which is closed here on every
+ * path. Returns 0 on success and 1 on failure.
+ */
+static int dump_anon_shared_map(char *_start, char *mdesc, int lfd, struct stat *st)
+{
+	unsigned long pgoff, len;
+	struct shmem_entry e;
+	unsigned long start;
+	int ret;
+
+	map_desc_parm(mdesc, &pgoff, &len);
+
+	start = rawhex(_start, NULL);
+	e.start = start;
+	e.end = start + len;
+	e.shmid = st->st_ino;
+
+	write(shmem_img, &e, sizeof(e));
+
+	ret = dump_map_pages(lfd, start, pgoff, len);
+	close(lfd);
+
+	return ret ? 1 : 0;
+}
+
+/*
+ * File-backed shared mappings are not written to any image; the mapping
+ * is only reported and its descriptor @lfd is closed. Always returns 0.
+ */
+static int dump_file_shared_map(char *start, char *mdesc, int lfd)
+{
+ printf("\tSkipping file shared mapping at %s\n", start);
+ close(lfd);
+ return 0;
+}
+
+/*
+ * Dump a file-backed private mapping as an FDINFO_MAP entry keyed by
+ * the start address of the mapping, with position 0 and O_RDONLY flags.
+ *
+ * @lfd is closed here on every path. Returns 0 on success and 1 on
+ * failure.
+ */
+static int dump_file_private_map(char *_start, char *mdesc, int lfd)
+{
+	unsigned long pgoff, len;
+	unsigned long start;
+	int ret;
+
+	/* The values are not used, but this validates the maps line. */
+	map_desc_parm(mdesc, &pgoff, &len);
+
+	start = rawhex(_start, NULL);
+	ret = dump_one_reg_file(FDINFO_MAP, start, lfd, 0, 0, O_RDONLY);
+	close(lfd);
+
+	return ret ? 1 : 0;
+}
+
+/*
+ * Dump the mapping described by @mdesc, one line of /proc/<pid>/maps.
+ *
+ * The start address is used as file name inside @mfd_dir to get a
+ * descriptor of the mapped object. A missing entry means an anonymous
+ * private mapping; an object on a device with major 0 is handled as
+ * anonymous shared memory; everything else is a file mapping, private
+ * or shared according to the fourth permission character.
+ *
+ * Returns 0 on success and 1 on failure. On the error paths handled
+ * here the descriptor is closed; otherwise it is handed to the dump_*
+ * helper that is called.
+ */
+static int dump_one_mapping(char *mdesc, DIR *mfd_dir)
+{
+	char *flags, *dash;
+	char map_start[32];
+	size_t slen;
+	int lfd;
+	struct stat st_buf;
+
+	dash = strchr(mdesc, '-');
+	flags = strchr(mdesc, ' ');
+	if (dash == NULL || flags == NULL || dash > flags)
+		goto bogus;
+
+	slen = dash - mdesc;
+	if (slen >= sizeof(map_start))
+		goto bogus;
+
+	memcpy(map_start, mdesc, slen);
+	map_start[slen] = '\0';
+
+	/* flags[3] is read below, so the field must have four characters. */
+	flags++;
+	if (strlen(flags) < 4)
+		goto bogus;
+
+	printf("\tDumping %s\n", map_start);
+	lfd = openat(dirfd(mfd_dir), map_start, O_RDONLY);
+	if (lfd == -1) {
+		if (errno != ENOENT) {
+			perror("Can't open mapping");
+			return 1;
+		}
+
+		if (flags[3] != 'p')
+			goto bogus;
+
+		return dump_anon_private_map(map_start);
+	}
+
+	if (fstat(lfd, &st_buf) < 0) {
+		perror("Can't stat mapping!");
+		close(lfd);
+		return 1;
+	}
+
+	if (!S_ISREG(st_buf.st_mode)) {
+		/* Not a system call failure, so errno has nothing to add. */
+		fprintf(stderr, "Can't handle non-regular mapping\n");
+		close(lfd);
+		return 1;
+	}
+
+	if (MAJOR(st_buf.st_dev) == 0) {
+		if (flags[3] != 's') {
+			close(lfd);
+			goto bogus;
+		}
+
+		/* FIXME - this can be tmpfs visible file mapping */
+		return dump_anon_shared_map(map_start, mdesc, lfd, &st_buf);
+	}
+
+	if (flags[3] == 'p')
+		return dump_file_private_map(map_start, mdesc, lfd);
+	else
+		return dump_file_shared_map(map_start, mdesc, lfd);
+
+bogus:
+	fprintf(stderr, "Bogus mapping [%s]\n", mdesc);
+	return 1;
+}
+
+/*
+ * Dump all mappings of task @pid: every line of /proc/<pid>/maps is
+ * passed to dump_one_mapping() together with the /proc/<pid>/mfd
+ * directory.
+ *
+ * Returns 0 on success, -1 when the mfd directory cannot be opened and
+ * 1 on any other failure. Both the directory and the maps file are
+ * closed on every path.
+ */
+static int dump_task_ext_mm(int pid)
+{
+	char path[64];
+	DIR *mfd_dir;
+	FILE *maps;
+	int ret = 1;
+
+	printf("Dumping mappings for %d\n", pid);
+
+	snprintf(path, sizeof(path), "/proc/%d/mfd", pid);
+	mfd_dir = opendir(path);
+	if (mfd_dir == NULL) {
+		perror("Can't open mfd dir");
+		return -1;
+	}
+
+	snprintf(path, sizeof(path), "/proc/%d/maps", pid);
+	maps = fopen(path, "r");
+	if (maps == NULL) {
+		perror("Can't open maps file");
+		goto out_dir;
+	}
+
+	while (fgets(big_tmp_str, sizeof(big_tmp_str), maps) != NULL)
+		if (dump_one_mapping(big_tmp_str, mfd_dir))
+			goto out_maps;
+
+	ret = 0;
+out_maps:
+	fclose(maps);
+out_dir:
+	closedir(mfd_dir);
+	return ret;
+}
+
+/*
+ * Copy /proc/<pid>/kstate_dump into the core image, 4096 bytes at a
+ * time, retrying short writes until each chunk is written completely.
+ *
+ * Returns 0 on success and 1 on failure; the dump file and the copy
+ * buffer are released on every path.
+ */
+static int dump_task_state(int pid)
+{
+	char path[64];
+	char *mem;
+	int dump_fd;
+	int ret = 1;
+
+	printf("Dumping task image for %d\n", pid);
+	snprintf(path, sizeof(path), "/proc/%d/kstate_dump", pid);
+	dump_fd = open(path, O_RDONLY);
+	if (dump_fd < 0) {
+		perror("Can't open dump file");
+		return 1;
+	}
+
+	/* Anonymous mappings take -1 as descriptor. */
+	mem = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+	if (mem == MAP_FAILED) {
+		perror("Can't get mem");
+		goto out_close;
+	}
+
+	while (1) {
+		int r, w;
+
+		r = read(dump_fd, mem, 4096);
+		if (r == 0)
+			break;
+		if (r < 0) {
+			perror("Can't read dump file");
+			goto out_unmap;
+		}
+
+		w = 0;
+		while (w < r) {
+			int n;
+
+			n = write(core_img, mem + w, r - w);
+			if (n <= 0) {
+				perror("Can't write core");
+				goto out_unmap;
+			}
+
+			w += n;
+		}
+	}
+
+	ret = 0;
+out_unmap:
+	munmap(mem, 4096);
+out_close:
+	close(dump_fd);
+	return ret;
+}
+
+/* Close the five image files opened by a successful prep_img_files(). */
+static void close_img_files(void)
+{
+	close(fdinfo_img);
+	close(pages_img);
+	close(core_img);
+	close(shmem_img);
+	close(pipes_img);
+}
+
+/*
+ * Dump task @pid: create its image files, then write its open files,
+ * mappings and kernel state. When @stop is set the task is stopped with
+ * SIGSTOP for the duration of the dump and continued afterwards, also
+ * when the dump fails.
+ *
+ * Returns 0 on success and 1 on failure. The image descriptors are
+ * closed before returning, so dumping many tasks in one run does not
+ * accumulate open files.
+ */
+static int dump_one_task(int pid, int stop)
+{
+	printf("Dumping task %d\n", pid);
+
+	if (prep_img_files(pid))
+		return 1;
+
+	if (stop && stop_task(pid))
+		goto err_task;
+
+	if (dump_task_files(pid))
+		goto err;
+
+	if (dump_task_ext_mm(pid))
+		goto err;
+
+	if (dump_task_state(pid))
+		goto err;
+
+	if (stop)
+		continue_task(pid);
+
+	close_img_files();
+	printf("Dump is complete\n");
+	return 0;
+
+err:
+	if (stop)
+		continue_task(pid);
+err_task:
+	close_img_files();
+	kill_imgfiles(pid);
+	return 1;
+}
+
+static int pstree_fd;
+static char big_tmp_str[4096];
+static int *pids, nr_pids;
+
+/*
+ * Return the list of children of task @pid as found on the "Children:"
+ * line of /proc/<pid>/status.
+ *
+ * The result is a malloc()ed copy of the text after the "Children:"
+ * prefix and its separator, with the last character (the newline of the
+ * line) replaced by a space. The caller frees it. NULL is returned when
+ * the file cannot be opened, has no such line or memory runs out.
+ */
+static char *get_children_pids(int pid)
+{
+	FILE *f;
+	int len;
+	char *ret = NULL, *tmp;
+
+	sprintf(big_tmp_str, "/proc/%d/status", pid);
+	f = fopen(big_tmp_str, "r");
+	if (f == NULL)
+		return NULL;
+
+	while ((fgets(big_tmp_str, sizeof(big_tmp_str), f)) != NULL) {
+		if (strncmp(big_tmp_str, "Children:", 9))
+			continue;
+
+		/* Skip the prefix and, if present, the one separator after it. */
+		tmp = big_tmp_str + 9;
+		if (*tmp != '\0')
+			tmp++;
+
+		len = strlen(tmp);
+		ret = malloc(len + 1);
+		if (ret != NULL) {
+			strcpy(ret, tmp);
+			if (len)
+				ret[len - 1] = ' ';
+		}
+		break;
+	}
+
+	fclose(f);
+	return ret;
+}
+
+/*
+ * Record task @pid and, recursively, all of its descendants.
+ *
+ * For each task a pstree entry (pid and number of children) followed by
+ * the pids of its children is written to the pstree image, and the pid
+ * is appended to the global pids array.
+ *
+ * Returns 0 on success and 1 on failure; the children list is freed on
+ * every path.
+ */
+static int dump_pid_and_children(int pid)
+{
+	struct pstree_entry e;
+	char *chlist, *tmp, *tmp2;
+	int *grown;
+	int ret = 1;
+
+	printf("\tReading %d children list\n", pid);
+	chlist = get_children_pids(pid);
+	if (chlist == NULL)
+		return 1;
+
+	printf("\t%d has children %s\n", pid, chlist);
+
+	e.pid = pid;
+	e.nr_children = 0;
+
+	/* Keep the old array reachable if growing it fails. */
+	grown = realloc(pids, (nr_pids + 1) * sizeof(int));
+	if (grown == NULL) {
+		fprintf(stderr, "Can't grow pids array\n");
+		goto out;
+	}
+	pids = grown;
+	pids[nr_pids++] = e.pid;
+
+	/* Every pid in the list is followed by a space, so count spaces. */
+	tmp = chlist;
+	while ((tmp = strchr(tmp, ' ')) != NULL) {
+		tmp++;
+		e.nr_children++;
+	}
+
+	write(pstree_fd, &e, sizeof(e));
+	tmp = chlist;
+	while (1) {
+		__u32 cpid;
+
+		cpid = strtol(tmp, &tmp, 10);
+		if (cpid == 0)
+			break;
+		if (*tmp != ' ') {
+			fprintf(stderr, "Error in string with children!\n");
+			goto out;
+		}
+
+		write(pstree_fd, &cpid, sizeof(cpid));
+		tmp++;
+	}
+
+	/* Second pass: cut the list into pids and descend into each. */
+	tmp = chlist;
+	while ((tmp2 = strchr(tmp, ' ')) != NULL) {
+		*tmp2 = '\0';
+		if (dump_pid_and_children(atoi(tmp)))
+			goto out;
+		tmp = tmp2 + 1;
+	}
+
+	ret = 0;
+out:
+	free(chlist);
+	return ret;
+}
+
+/*
+ * Dump every task collected in the pids array: stop all of them, dump
+ * them one by one, then continue all of them. As soon as stopping or
+ * dumping a task fails the remaining work is skipped, but all tasks are
+ * still continued.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int __dump_all_tasks(void)
+{
+	int i;
+	int failed = 0;
+
+	printf("Dumping tasks' images for");
+	for (i = 0; i < nr_pids; i++)
+		printf(" %d", pids[i]);
+	printf("\n");
+
+	printf("Stopping tasks\n");
+	for (i = 0; i < nr_pids && !failed; i++)
+		failed = stop_task(pids[i]) != 0;
+
+	for (i = 0; i < nr_pids && !failed; i++)
+		failed = dump_one_task(pids[i], 0) != 0;
+
+	if (!failed)
+		printf("Resuming tasks\n");
+
+	for (i = 0; i < nr_pids; i++)
+		continue_task(pids[i]);
+
+	return failed;
+}
+
+/*
+ * Dump the process tree rooted at @pid: write the tree to
+ * "pstree-<pid>.img" (which must not exist yet) and then dump every
+ * task found in it.
+ *
+ * Returns 0 on success and 1 on failure; the pstree file is closed on
+ * every path.
+ */
+static int dump_all_tasks(int pid)
+{
+	__u32 type;
+	int ret;
+
+	pids = NULL;
+	nr_pids = 0;
+
+	printf("Dumping process tree, start from %d\n", pid);
+
+	sprintf(big_tmp_str, "pstree-%d.img", pid);
+	pstree_fd = open(big_tmp_str, O_WRONLY | O_CREAT | O_EXCL, 0600);
+	if (pstree_fd < 0) {
+		perror("Can't create pstree");
+		return 1;
+	}
+
+	type = PSTREE_MAGIC;
+	if (write(pstree_fd, &type, sizeof(type)) != (ssize_t)sizeof(type)) {
+		perror("Can't write pstree magic");
+		close(pstree_fd);
+		return 1;
+	}
+
+	ret = dump_pid_and_children(pid);
+	close(pstree_fd);
+	if (ret)
+		return 1;
+
+	return __dump_all_tasks();
+}
+
+/*
+ * Entry point. "-p <pid>" dumps a single task (stopping it for the
+ * duration of the dump), "-t <pid>" dumps the whole tree below the
+ * task. Anything else prints the usage line and returns 1.
+ */
+int main(int argc, char **argv)
+{
+	if (argc == 3 && argv[1][0] == '-') {
+		switch (argv[1][1]) {
+		case 'p':
+			return dump_one_task(atoi(argv[2]), 1);
+		case 't':
+			return dump_all_tasks(atoi(argv[2]));
+		}
+	}
+
+	printf("Usage: %s (-p|-t) <pid>\n", argv[0]);
+	return 1;
+}
diff --git a/xemul/cr-restore.c b/xemul/cr-restore.c
new file mode 100644
index 000000000..d8cedb01f
--- /dev/null
+++ b/xemul/cr-restore.c
@@ -0,0 +1,1115 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <dirent.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <linux/kdev_t.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/sendfile.h>
+
+#define PAGE_SIZE 4096
+
+#include <linux/types.h>
+#include "img_structs.h"
+#include "binfmt_img.h"
+
+/*
+ * A file opened for a mapping that is still to be restored: the address
+ * taken from the fdinfo entry and the descriptor of the opened file.
+ * Kept in the singly linked list headed by fmap_fds.
+ */
+struct fmap_fd {
+ unsigned long start;
+ int fd;
+ struct fmap_fd *next;
+};
+
+static struct fmap_fd *fmap_fds;
+
+/*
+ * One shared memory region collected from the shmem images.
+ */
+struct shmem_info {
+ unsigned long start;
+ unsigned long end;
+ unsigned long id;
+ /* Lowest pid (as stored in the images) among the tasks that have the region. */
+ int pid;
+ /* 0 until shmem_update_real_pid() fills it in for that pid. */
+ int real_pid;
+};
+
+static struct shmem_info *shmems;
+static int nr_shmems;
+
+/*
+ * One pipe collected from the pipes images.
+ */
+struct pipes_info {
+ unsigned int id;
+ /* Lowest pid (as stored in the images) among the tasks that use the pipe. */
+ int pid;
+ /* The next three fields start out as 0 when the pipe is first seen. */
+ int real_pid;
+ int read_fd;
+ int write_fd;
+ /* Number of pipe entries seen for this id. */
+ int users;
+};
+
+static struct pipes_info *pipes;
+static int nr_pipes;
+
+static int restore_task_with_children(int my_pid, char *pstree_path);
+
+#define CLONE_CHILD_USEPID 0x02000000
+
+/* Print start, id and pid of every collected shared memory region. */
+static void show_saved_shmems(void)
+{
+	const struct shmem_info *si;
+	int n = 0;
+
+	printf("\tSaved shmems:\n");
+	while (n < nr_shmems) {
+		si = &shmems[n++];
+		printf("\t\t%016lx %lx %d\n", si->start, si->id, si->pid);
+	}
+}
+
+/* Print id and pid of every collected pipe. */
+static void show_saved_pipes(void)
+{
+	const struct pipes_info *pi;
+	int n = 0;
+
+	printf("\tSaved pipes:\n");
+	while (n < nr_pipes) {
+		pi = &pipes[n++];
+		printf("\t\t%x -> %d\n", pi->id, pi->pid);
+	}
+}
+
+/*
+ * Find the collected region with id @id whose range, both ends
+ * included, contains @addr. Returns NULL when there is none.
+ */
+static struct shmem_info *search_shmem(unsigned long addr, unsigned long id)
+{
+	int n = 0;
+
+	while (n < nr_shmems) {
+		struct shmem_info *cur = &shmems[n++];
+
+		if (addr < cur->start || addr > cur->end)
+			continue;
+		if (cur->id == id)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Find the collected pipe with id @pipeid; NULL when there is none. */
+static struct pipes_info *search_pipes(unsigned int pipeid)
+{
+	int n = 0;
+
+	while (n < nr_pipes) {
+		struct pipes_info *cur = &pipes[n++];
+
+		if (cur->id == pipeid)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Store @rpid as real pid in every region whose pid is @vpid. */
+static void shmem_update_real_pid(int vpid, int rpid)
+{
+	int n = 0;
+
+	while (n < nr_shmems) {
+		struct shmem_info *cur = &shmems[n++];
+
+		if (cur->pid == vpid)
+			cur->real_pid = rpid;
+	}
+}
+
+/*
+ * Wait until the task owning region @si has filled in its real pid and
+ * made the region visible as /proc/<real_pid>/mfd/0x<start>, then open
+ * that file read-write.
+ *
+ * Polls with a delay that doubles until it exceeds 20 seconds. Returns
+ * the descriptor, or -1 when open() fails with anything but ENOENT.
+ */
+static int shmem_wait_and_open(struct shmem_info *si)
+{
+	/* FIXME - not good */
+	char path[128];
+	unsigned long time = 1000;
+
+	sleep(1);
+
+	while (si->real_pid == 0)
+		usleep(time);
+
+	sprintf(path, "/proc/%d/mfd/0x%lx", si->real_pid, si->start);
+	while (1) {
+		int ret;
+
+		ret = open(path, O_RDWR);
+		/* 0 is a valid descriptor too. */
+		if (ret >= 0)
+			return ret;
+
+		if (errno != ENOENT) {
+			perror(" Can't stat shmem");
+			return -1;
+		}
+
+		printf("Waiting for [%s] to appear\n", path);
+		if (time < 20000000)
+			time <<= 1;
+		usleep(time);
+	}
+}
+
+/*
+ * Record shmem entry @e read from the image of task @pid.
+ *
+ * A region already known by start and id must have the same end; its
+ * pid is lowered to @pid when @pid is smaller. An unknown region is
+ * appended as long as the table stays below 4096 bytes.
+ *
+ * Returns 0 on success and 1 on a mismatching end or a full table.
+ */
+static int try_to_add_shmem(int pid, struct shmem_entry *e)
+{
+	struct shmem_info *si = NULL;
+	int i;
+
+	for (i = 0; i < nr_shmems; i++) {
+		si = &shmems[i];
+		if (si->start == e->start && si->id == e->shmid)
+			break;
+	}
+
+	if (i < nr_shmems) {
+		if (si->end != e->end) {
+			printf("Bogus shmem\n");
+			return 1;
+		}
+
+		if (pid < si->pid)
+			si->pid = pid;
+
+		return 0;
+	}
+
+	if ((nr_shmems + 1) * sizeof(struct shmem_info) >= 4096) {
+		printf("OOM storing shmems\n");
+		return 1;
+	}
+
+	si = &shmems[nr_shmems];
+	si->start = e->start;
+	si->end = e->end;
+	si->id = e->shmid;
+	si->pid = pid;
+	si->real_pid = 0;
+	nr_shmems++;
+
+	return 0;
+}
+
+/*
+ * Record pipe entry @e read from the image of task @pid.
+ *
+ * For a pipe already known by id the user count is incremented and the
+ * pid is lowered to @pid when @pid is smaller. An unknown pipe is
+ * appended with one user as long as the table stays below 4096 bytes.
+ * @p_fd is not used.
+ *
+ * Returns 0 on success and 1 when the table is full.
+ */
+static int try_to_add_pipe(int pid, struct pipes_entry *e, int p_fd)
+{
+	struct pipes_info *pi = NULL;
+	int i;
+
+	for (i = 0; i < nr_pipes; i++) {
+		pi = &pipes[i];
+		if (pi->id == e->pipeid)
+			break;
+	}
+
+	if (i < nr_pipes) {
+		if (pid < pi->pid)
+			pi->pid = pid;
+		pi->users++;
+
+		return 0;
+	}
+
+	if ((nr_pipes + 1) * sizeof(struct pipes_info) >= 4096) {
+		printf("OOM storing pipes\n");
+		return 1;
+	}
+
+	pi = &pipes[nr_pipes];
+	pi->id = e->pipeid;
+	pi->pid = pid;
+	pi->real_pid = 0;
+	pi->read_fd = 0;
+	pi->write_fd = 0;
+	pi->users = 1;
+	nr_pipes++;
+
+	return 0;
+}
+
+/*
+ * Read "shmem-<pid>.img" and record every entry in it with
+ * try_to_add_shmem().
+ *
+ * Returns 0 on success and 1 when the file cannot be opened, does not
+ * start with SHMEM_MAGIC, ends in the middle of an entry or an entry is
+ * rejected. The image is closed on every path.
+ */
+static int prepare_shmem_pid(int pid)
+{
+	char path[64];
+	int sh_fd;
+	int ret = 1;
+	__u32 type = 0;
+
+	snprintf(path, sizeof(path), "shmem-%d.img", pid);
+	sh_fd = open(path, O_RDONLY);
+	if (sh_fd < 0) {
+		perror("Can't open shmem info");
+		return 1;
+	}
+
+	if (read(sh_fd, &type, sizeof(type)) != (ssize_t)sizeof(type) ||
+	    type != SHMEM_MAGIC) {
+		/* Not a system call failure, so no perror() here. */
+		fprintf(stderr, "Bad shmem magic\n");
+		goto out;
+	}
+
+	while (1) {
+		struct shmem_entry e;
+		int n;
+
+		n = read(sh_fd, &e, sizeof(e));
+		if (n == 0) {
+			ret = 0;
+			break;
+		}
+		if (n < 0) {
+			perror("Can't read shmem entry");
+			break;
+		}
+		if (n != (int)sizeof(e)) {
+			fprintf(stderr, "Can't read shmem entry: truncated image\n");
+			break;
+		}
+
+		if (try_to_add_shmem(pid, &e))
+			break;
+	}
+
+out:
+	close(sh_fd);
+	return ret;
+}
+
+/*
+ * Read "pipes-<pid>.img" and record every entry in it with
+ * try_to_add_pipe(), skipping the e.bytes of pipe data that follow each
+ * entry.
+ *
+ * Returns 0 on success and 1 when the file cannot be opened, does not
+ * start with PIPES_MAGIC, ends in the middle of an entry or an entry is
+ * rejected. The image is closed on every path.
+ */
+static int prepare_pipes_pid(int pid)
+{
+	char path[64];
+	int p_fd;
+	int ret = 1;
+	__u32 type = 0;
+
+	snprintf(path, sizeof(path), "pipes-%d.img", pid);
+	p_fd = open(path, O_RDONLY);
+	if (p_fd < 0) {
+		perror("Can't open pipes image");
+		return 1;
+	}
+
+	if (read(p_fd, &type, sizeof(type)) != (ssize_t)sizeof(type) ||
+	    type != PIPES_MAGIC) {
+		/* Not a system call failure, so no perror() here. */
+		fprintf(stderr, "Bad pipes magic\n");
+		goto out;
+	}
+
+	while (1) {
+		struct pipes_entry e;
+		int n;
+
+		n = read(p_fd, &e, sizeof(e));
+		if (n == 0) {
+			ret = 0;
+			break;
+		}
+		if (n != (int)sizeof(e)) {
+			fprintf(stderr, "Read pipes for %s failed %d of %zu read\n",
+					path, n, sizeof(e));
+			if (n < 0)
+				perror("Can't read pipes entry");
+			break;
+		}
+
+		if (try_to_add_pipe(pid, &e, p_fd))
+			break;
+
+		lseek(p_fd, e.bytes, SEEK_CUR);
+	}
+
+out:
+	close(p_fd);
+	return ret;
+}
+
+/*
+ * Collect the shared memory regions and pipes of all tasks listed in
+ * the pstree image open as @ps_fd.
+ *
+ * Both tables live in one shared anonymous page each. The pstree image
+ * is read entry by entry, stepping over the children pids that follow
+ * each entry, and afterwards repositioned just behind its magic.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int prepare_shared(int ps_fd)
+{
+	struct pstree_entry e;
+	int ret;
+
+	printf("Preparing info about shared resources\n");
+
+	nr_shmems = 0;
+	shmems = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, 0, 0);
+	if (shmems == MAP_FAILED) {
+		perror("Can't map shmems");
+		return 1;
+	}
+
+	pipes = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, 0, 0);
+	if (pipes == MAP_FAILED) {
+		perror("Can't map pipes");
+		return 1;
+	}
+
+	for (;;) {
+		ret = read(ps_fd, &e, sizeof(e));
+		if (ret == 0)
+			break;
+
+		if (ret != sizeof(e)) {
+			perror("Can't read ps");
+			return 1;
+		}
+
+		if (prepare_shmem_pid(e.pid) || prepare_pipes_pid(e.pid))
+			return 1;
+
+		lseek(ps_fd, e.nr_children * sizeof(__u32), SEEK_CUR);
+	}
+
+	lseek(ps_fd, sizeof(__u32), SEEK_SET);
+
+	show_saved_shmems();
+	show_saved_pipes();
+
+	return 0;
+}
+
+/*
+ * Unlink and return the first fmap_fds element whose start is @start;
+ * NULL when there is none. The outcome is printed either way.
+ */
+static struct fmap_fd *pop_fmap_fd(unsigned long start)
+{
+	struct fmap_fd **link = &fmap_fds;
+	struct fmap_fd *cur;
+
+	printf("Looking for %lx : ", start);
+
+	while ((cur = *link) != NULL) {
+		if (cur->start == start) {
+			*link = cur->next;
+			printf("found\n");
+			return cur;
+		}
+
+		link = &cur->next;
+	}
+
+	printf("not found\n");
+	return NULL;
+}
+
+/*
+ * Open the file described by fdinfo entry @fe. The path, fe->len bytes
+ * long, is read from the image open as @fd; the file is opened with
+ * fe->flags and positioned at fe->pos.
+ *
+ * Returns the new descriptor or -1 on failure.
+ */
+static int open_fe_fd(struct fdinfo_entry *fe, int fd)
+{
+	char path[PATH_MAX];
+	int tmp;
+
+	/* The length comes from the image; keep room for the terminator. */
+	if (fe->len >= sizeof(path)) {
+		fprintf(stderr, "Path in image is too long\n");
+		return -1;
+	}
+
+	if (read(fd, path, fe->len) != fe->len) {
+		fprintf(stderr, "Error reading path\n");
+		return -1;
+	}
+
+	path[fe->len] = '\0';
+
+	tmp = open(path, fe->flags);
+	if (tmp < 0) {
+		perror("Can't open file");
+		return -1;
+	}
+
+	lseek(tmp, fe->pos, SEEK_SET);
+
+	return tmp;
+}
+
+/*
+ * Move descriptor @old_fd to number @new_fd: duplicate it with dup2()
+ * and close the original. Nothing is done when both are already equal.
+ *
+ * Returns @new_fd on success or the negative dup2() result on failure.
+ */
+static int reopen_fd(int old_fd, int new_fd)
+{
+	int ret;
+
+	if (old_fd == new_fd)
+		return new_fd;
+
+	ret = dup2(old_fd, new_fd);
+	if (ret < 0)
+		return ret;
+
+	close(old_fd);
+	return new_fd;
+}
+
+/*
+ * Restore one FDINFO_FD entry: open the file described by @fe and place
+ * it on descriptor number fe->addr.
+ *
+ * @cfd points at the descriptor of the fdinfo image being read. When it
+ * occupies the number the restored file needs, it is duplicated first
+ * and *@cfd is updated to the duplicate.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int open_fd(int pid, struct fdinfo_entry *fe, int *cfd)
+{
+	int target = (int)fe->addr;
+	int opened;
+
+	if (*cfd == target) {
+		int moved = dup(*cfd);
+
+		if (moved < 0) {
+			perror("Can't dup file");
+			return 1;
+		}
+
+		printf("%s: Dup for %d\n", __func__, moved);
+
+		*cfd = moved;
+	}
+
+	opened = open_fe_fd(fe, *cfd);
+	if (opened < 0)
+		return 1;
+
+	if (reopen_fd(opened, target) < 0) {
+		perror("Can't dup");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore one FDINFO_MAP entry: open the file described by @fe and
+ * remember the resulting descriptor, keyed by the address fe->addr, at
+ * the head of the fmap_fds list.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int open_fmap(int pid, struct fdinfo_entry *fe, int fd)
+{
+	int tmp;
+	struct fmap_fd *new;
+
+	tmp = open_fe_fd(fe, fd);
+	if (tmp < 0)
+		return 1;
+
+	printf("%d:\t\tWill map %lx to %d\n", pid, (unsigned long)fe->addr, tmp);
+	new = malloc(sizeof(*new));
+	if (new == NULL) {
+		perror("Can't allocate fmap fd");
+		/* Nobody will ever reference the file we just opened. */
+		close(tmp);
+		return 1;
+	}
+
+	new->start = fe->addr;
+	new->fd = tmp;
+	new->next = fmap_fds;
+	fmap_fds = new;
+
+	return 0;
+}
+
+/*
+ * Reopen the files recorded in fdinfo-<pid>.img.
+ *
+ * The image starts with FDINFO_MAGIC followed by fdinfo entries; each
+ * entry is dispatched by type to open_fd() or open_fmap(), which also
+ * consume the path stored after the entry.
+ *
+ * Returns 0 on success, 1 on error. The image descriptor is closed on
+ * every exit path.
+ */
+static int prepare_fds(int pid)
+{
+	__u32 mag = 0;
+	char path[64];
+	int fdinfo_fd;
+
+	printf("%d: Opening files\n", pid);
+
+	sprintf(path, "fdinfo-%d.img", pid);
+	fdinfo_fd = open(path, O_RDONLY);
+	if (fdinfo_fd < 0) {
+		perror("Can't open fdinfo");
+		return 1;
+	}
+
+	/* A short or failed read must not be mistaken for a valid magic. */
+	if (read(fdinfo_fd, &mag, sizeof(mag)) != sizeof(mag) ||
+	    mag != FDINFO_MAGIC) {
+		fprintf(stderr, "Bad file\n");
+		goto err;
+	}
+
+	while (1) {
+		int ret;
+		struct fdinfo_entry fe;
+
+		ret = read(fdinfo_fd, &fe, sizeof(fe));
+		if (ret == 0) {
+			close(fdinfo_fd);
+			return 0;
+		}
+
+		if (ret < 0) {
+			perror("Can't read file");
+			goto err;
+		}
+		if (ret != sizeof(fe)) {
+			fprintf(stderr, "Error reading\n");
+			goto err;
+		}
+
+		printf("\t%d: Got fd for %lx type %d namelen %d\n", pid,
+				(unsigned long)fe.addr, fe.type, fe.len);
+		switch (fe.type) {
+		case FDINFO_FD:
+			/* May move fdinfo_fd to another descriptor number. */
+			if (open_fd(pid, &fe, &fdinfo_fd))
+				goto err;
+
+			break;
+		case FDINFO_MAP:
+			if (open_fmap(pid, &fe, fdinfo_fd))
+				goto err;
+
+			break;
+		default:
+			fprintf(stderr, "Some bullshit in a file\n");
+			goto err;
+		}
+	}
+
+err:
+	close(fdinfo_fd);
+	return 1;
+}
+
+/*
+ * One shared memory region known to the task being restored: an address
+ * range and the id recorded for it. Entries are filled in from
+ * struct shmem_entry by save_shmem_id() and looked up by address in
+ * find_shmem_id().
+ */
+struct shmem_to_id {
+ unsigned long addr;
+ unsigned long end;
+ unsigned long id;
+ struct shmem_to_id *next;
+};
+
+/* Head of the singly linked list of regions; new entries are pushed at the front. */
+static struct shmem_to_id *my_shmem_ids;
+
+/*
+ * Look up the id of the recorded region that contains @addr (both
+ * region bounds are treated as inclusive).
+ *
+ * Returns the id, or 0 when no recorded region contains the address.
+ */
+static unsigned long find_shmem_id(unsigned long addr)
+{
+	struct shmem_to_id *cur = my_shmem_ids;
+
+	while (cur != NULL) {
+		if (cur->addr <= addr && cur->end >= addr)
+			return cur->id;
+
+		cur = cur->next;
+	}
+
+	return 0;
+}
+
+/*
+ * Record the region described by @e (start, end, shmid) at the head of
+ * the my_shmem_ids list.
+ *
+ * The function cannot report failure to its caller, so running out of
+ * memory terminates the process instead of dereferencing NULL.
+ */
+static void save_shmem_id(struct shmem_entry *e)
+{
+	struct shmem_to_id *si;
+
+	si = malloc(sizeof(*si));
+	if (si == NULL) {
+		perror("Can't allocate shmem id");
+		exit(1);
+	}
+
+	si->addr = e->start;
+	si->end = e->end;
+	si->id = e->shmid;
+	si->next = my_shmem_ids;
+	my_shmem_ids = si;
+}
+
+/*
+ * Load shmem-<pid>.img: verify SHMEM_MAGIC and record every shmem entry
+ * with save_shmem_id().
+ *
+ * Returns 0 on success, 1 on error. The image descriptor is closed on
+ * every exit path.
+ */
+static int prepare_shmem(int pid)
+{
+	char path[64];
+	int sh_fd, err = 1;
+	__u32 type = 0;
+
+	sprintf(path, "shmem-%d.img", pid);
+	sh_fd = open(path, O_RDONLY);
+	if (sh_fd < 0) {
+		perror("Can't open shmem info");
+		return 1;
+	}
+
+	/* type stays 0 on a failed read, which never matches the magic. */
+	read(sh_fd, &type, sizeof(type));
+	if (type != SHMEM_MAGIC) {
+		perror("Bad shmem magic");
+		goto out;
+	}
+
+	while (1) {
+		struct shmem_entry e;
+		int ret;
+
+		ret = read(sh_fd, &e, sizeof(e));
+		if (ret == 0)
+			break;
+		if (ret != sizeof(e)) {
+			perror("Can't read shmem entry");
+			goto out;
+		}
+
+		save_shmem_id(&e);
+	}
+
+	err = 0;
+out:
+	close(sh_fd);
+	return err;
+}
+
+/*
+ * If a file was opened for the mapping starting at vi->start (see
+ * open_fmap()), store its descriptor in vi->fd and rewrite the vma
+ * record that was just read from @fd.
+ *
+ * Returns 0 on success or when the vma has no file, 1 on write error.
+ */
+static int try_fixup_file_map(int pid, struct binfmt_vma_image *vi, int fd)
+{
+	struct fmap_fd *fmfd;
+	int ret = 0;
+
+	fmfd = pop_fmap_fd(vi->start);
+	if (fmfd == NULL)
+		return 0;
+
+	printf("%d: Fixing %lx vma to %d fd\n", pid, vi->start, fmfd->fd);
+	/*
+	 * Step back over the record just read. The cast keeps the offset
+	 * signed; negating the bare size_t would yield a huge positive
+	 * value wherever off_t is wider than size_t.
+	 */
+	lseek(fd, -(off_t)sizeof(*vi), SEEK_CUR);
+	printf("%d: Wrote %d\n", pid, fmfd->fd);
+	vi->fd = fmfd->fd;
+	if (write(fd, vi, sizeof(*vi)) != sizeof(*vi)) {
+		perror("Can't write img");
+		ret = 1;
+	}
+
+	/* The list entry is ours after pop_fmap_fd(); release it either way. */
+	free(fmfd);
+	return ret;
+}
+
+/*
+ * If the vma starting at vi->start is a recorded shared memory region
+ * whose shmem_info names another pid, open that region via
+ * shmem_wait_and_open(), store the descriptor in vi->fd and rewrite the
+ * vma record that was just read from @fd.
+ *
+ * Returns 0 on success or when nothing needs fixing, 1 on error.
+ */
+static int try_fixup_shared_map(int pid, struct binfmt_vma_image *vi, int fd)
+{
+	struct shmem_info *si;
+	unsigned long id;
+
+	id = find_shmem_id(vi->start);
+	if (id == 0)
+		return 0;
+
+	si = search_shmem(vi->start, id);
+	printf("%d: Search for %016lx shmem %p/%d\n", pid, vi->start, si, si ? si->pid : -1);
+
+	if (si == NULL) {
+		fprintf(stderr, "Can't find my shmem %016lx\n", vi->start);
+		return 1;
+	}
+
+	if (si->pid != pid) {
+		int sh_fd;
+
+		sh_fd = shmem_wait_and_open(si);
+		printf("%d: Fixing %lx vma to %x/%d shmem -> %d\n", pid, vi->start, si->id, si->pid, sh_fd);
+		/* Check the descriptor just obtained, not the image fd. */
+		if (sh_fd < 0) {
+			perror("Can't open shmem");
+			return 1;
+		}
+
+		/* Signed offset: step back over the record just read. */
+		lseek(fd, -(off_t)sizeof(*vi), SEEK_CUR);
+		vi->fd = sh_fd;
+		if (write(fd, vi, sizeof(*vi)) != sizeof(*vi)) {
+			perror("Can't write img");
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the vma records of the core image @fd and patch the descriptor
+ * of file-backed and shared mappings.
+ *
+ * The records start right after the image header, the registers and the
+ * mm descriptor; a record with start == 0 and end == 0 ends the list.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int fixup_vma_fds(int pid, int fd)
+{
+	off_t offset =
+		sizeof(struct binfmt_img_header) +
+		sizeof(struct binfmt_regs_image) +
+		sizeof(struct binfmt_mm_image);
+
+	/* Cast so the argument really matches the %li conversion. */
+	printf("Seek for: %li bytes\n", (long)offset);
+	lseek(fd, offset, SEEK_SET);
+
+	while (1) {
+		struct binfmt_vma_image vi;
+
+		if (read(fd, &vi, sizeof(vi)) != sizeof(vi)) {
+			perror("Can't read");
+			return 1;
+		}
+
+		if (vi.start == 0 && vi.end == 0)
+			return 0;
+
+		printf("%d: Fixing %016lx-%016lx %016lx vma\n", pid, vi.start, vi.end, vi.pgoff);
+		if (try_fixup_file_map(pid, &vi, fd))
+			return 1;
+
+		if (try_fixup_shared_map(pid, &vi, fd))
+			return 1;
+	}
+}
+
+/*
+ * Decide whether the page at @vaddr has to be copied into the image of
+ * task @pid.
+ *
+ * Pages outside any recorded shared region are always restored. A page
+ * inside a shared region is restored only when the region's shmem_info
+ * names @pid.
+ *
+ * Returns non-zero when the page must be restored.
+ */
+static inline int should_restore_page(int pid, unsigned long vaddr)
+{
+	struct shmem_info *si;
+	unsigned long id;
+
+	id = find_shmem_id(vaddr);
+	if (id == 0)
+		return 1;
+
+	si = search_shmem(vaddr, id);
+	if (si == NULL) {
+		/*
+		 * NOTE(review): no shmem_info for a recorded region. The page
+		 * is kept rather than dropped so no data is lost -- confirm
+		 * this is the desired fallback.
+		 */
+		fprintf(stderr, "Can't find my shmem %016lx\n", vaddr);
+		return 1;
+	}
+
+	return si->pid == pid;
+}
+
+/*
+ * Append the pages stored in pages-<pid>.img to the core image @fd.
+ *
+ * The core image is expected to end with a page record whose address is
+ * zero. That record is overwritten with (address, page) pairs copied
+ * from the pages image, skipping pages should_restore_page() rejects,
+ * and a new zero address is written as the terminator.
+ *
+ * Returns 0 on success, 1 on error. The pages image descriptor is
+ * closed on every exit path.
+ */
+static int fixup_pages_data(int pid, int fd)
+{
+	/* Signed offset of the last page record relative to the file end. */
+	const off_t tail = -(off_t)sizeof(struct binfmt_page_image);
+	char path[128];
+	int shfd, err = 1;
+	__u32 mag = 0;
+	__u64 vaddr = 0;
+
+	sprintf(path, "pages-%d.img", pid);
+	shfd = open(path, O_RDONLY);
+	if (shfd < 0) {
+		perror("Can't open shmem image");
+		return 1;
+	}
+
+	if (read(shfd, &mag, sizeof(mag)) != sizeof(mag) || mag != PAGES_MAGIC) {
+		fprintf(stderr, "Bad shmem image\n");
+		goto out;
+	}
+
+	lseek(fd, tail, SEEK_END);
+	if (read(fd, &vaddr, sizeof(vaddr)) != sizeof(vaddr)) {
+		perror("Can't read vaddr");
+		goto out;
+	}
+	if (vaddr != 0) {
+		printf("SHIT %lx\n", (unsigned long)vaddr);
+		goto out;
+	}
+	/* Position on the terminator again so it gets overwritten. */
+	lseek(fd, tail, SEEK_END);
+
+	while (1) {
+		ssize_t ret;
+
+		ret = read(shfd, &vaddr, sizeof(vaddr));
+		if (ret == 0)
+			break;
+
+		if (ret != sizeof(vaddr)) {
+			perror("Can't read vaddr");
+			goto out;
+		}
+
+		if (vaddr == 0)
+			break;
+
+		if (!should_restore_page(pid, vaddr)) {
+			lseek(shfd, PAGE_SIZE, SEEK_CUR);
+			continue;
+		}
+
+		if (write(fd, &vaddr, sizeof(vaddr)) != sizeof(vaddr)) {
+			perror("Can't write vaddr");
+			goto out;
+		}
+		/* With a NULL offset sendfile() advances shfd past the page. */
+		if (sendfile(fd, shfd, NULL, PAGE_SIZE) != PAGE_SIZE) {
+			perror("Can't copy page");
+			goto out;
+		}
+	}
+
+	vaddr = 0;
+	if (write(fd, &vaddr, sizeof(vaddr)) != sizeof(vaddr)) {
+		perror("Can't write vaddr");
+		goto out;
+	}
+
+	err = 0;
+out:
+	close(shfd);
+	return err;
+}
+
+/*
+ * Finish the executable image @fd for task @pid: first patch the vma
+ * descriptors, then append the page data. The descriptor is closed only
+ * when both steps succeed.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int prepare_image_maps(int fd, int pid)
+{
+	printf("%d: Fixing maps before executing image\n", pid);
+
+	/* Short-circuit keeps the order: vmas first, pages second. */
+	if (fixup_vma_fds(pid, fd) || fixup_pages_data(pid, fd))
+		return 1;
+
+	close(fd);
+	return 0;
+}
+
+/*
+ * Build core-<pid>.img.out as a patched copy of core-<pid>.img and
+ * execute it.
+ *
+ * The copy is created exclusively, filled with sendfile(), fixed up by
+ * prepare_image_maps() and finally passed to execl().
+ *
+ * Returns 1 on a preparation error or the execl() result when the exec
+ * itself fails; does not return on success.
+ */
+static int execute_image(int pid)
+{
+	char path[128];
+	int fd, fd_new;
+	struct stat buf;
+	off_t left;
+
+	sprintf(path, "core-%d.img", pid);
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		perror("Can't open exec image");
+		return 1;
+	}
+
+	if (fstat(fd, &buf)) {
+		perror("Can't stat");
+		close(fd);
+		return 1;
+	}
+
+	sprintf(path, "core-%d.img.out", pid);
+	fd_new = open(path, O_RDWR | O_CREAT | O_EXCL, 0700);
+	if (fd_new < 0) {
+		perror("Can't open new image");
+		close(fd);
+		return 1;
+	}
+
+	printf("%d: Preparing execution image (%li bytes)\n", pid, (long)buf.st_size);
+	/* sendfile() may transfer less than requested; loop until done. */
+	for (left = buf.st_size; left > 0; ) {
+		ssize_t sent = sendfile(fd_new, fd, NULL, left);
+
+		if (sent < 0) {
+			perror("Can't copy exec image");
+			goto err_copy;
+		}
+		if (sent == 0) {
+			fprintf(stderr, "Exec image is shorter than expected\n");
+			goto err_copy;
+		}
+
+		left -= sent;
+	}
+	close(fd);
+
+	if (fchmod(fd_new, 0700)) {
+		perror("Can't prepare exec image");
+		close(fd_new);
+		return 1;
+	}
+
+	/* Closes fd_new on success. */
+	if (prepare_image_maps(fd_new, pid)) {
+		close(fd_new);
+		return 1;
+	}
+
+	sync();
+
+	printf("%d/%d EXEC IMAGE\n", pid, getpid());
+	/* The variadic list must end with a null pointer of pointer type. */
+	return execl(path, path, (char *)NULL);
+
+err_copy:
+	close(fd);
+	close(fd_new);
+	return 1;
+}
+
+/*
+ * Create the pipe described by @e on behalf of the task recorded in @pi.
+ *
+ * The e->bytes bytes of data stored in the pipes image @pipes_fd are
+ * spliced into the new pipe. Both ends and our real pid are then
+ * published in @pi, and the function polls pi->users (with a growing
+ * sleep) until it drops to 1. Finally the end selected by e->flags is
+ * moved to descriptor e->fd and the other end is closed.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int create_pipe(int pid, struct pipes_entry *e, struct pipes_info *pi, int pipes_fd)
+{
+	int pfd[2], tmp;
+	unsigned long time = 1000;
+
+	printf("\t%d: Creating pipe %x\n", pid, e->pipeid);
+
+	if (pipe(pfd) < 0) {
+		perror("Can't create pipe");
+		return 1;
+	}
+
+	if (e->bytes) {
+		printf("\t%d: Splicing data to %d\n", pid, pfd[1]);
+
+		tmp = splice(pipes_fd, NULL, pfd[1], NULL, e->bytes, 0);
+		if (tmp < 0 || (__u32)tmp != e->bytes) {
+			/* Report errno before any other call can change it. */
+			if (tmp < 0)
+				perror("Error splicing data");
+			fprintf(stderr, "Wanted to restore %u bytes, but got %d\n",
+					e->bytes, tmp);
+			close(pfd[0]);
+			close(pfd[1]);
+			return 1;
+		}
+	}
+
+	pi->read_fd = pfd[0];
+	pi->write_fd = pfd[1];
+	/* attach_pipe() waits for real_pid to become non-zero. */
+	pi->real_pid = getpid();
+
+	printf("\t%d: Done, waiting for others on %d pid with r:%d w:%d\n",
+			pid, pi->real_pid, pfd[0], pfd[1]);
+
+	while (1) {
+		if (pi->users == 1) /* only I left */
+			break;
+
+		printf("\t%d: Waiting for %x pipe to attach (%d users left)\n",
+				pid, e->pipeid, pi->users - 1);
+		/* Back off exponentially, capped at a few tens of seconds. */
+		if (time < 20000000)
+			time <<= 1;
+		usleep(time);
+	}
+
+	printf("\t%d: All is ok - reopening pipe for %d\n", pid, (int)e->fd);
+	if (e->flags & O_WRONLY) {
+		close(pfd[0]);
+		tmp = reopen_fd(pfd[1], e->fd);
+	} else {
+		close(pfd[1]);
+		tmp = reopen_fd(pfd[0], e->fd);
+	}
+
+	if (tmp < 0) {
+		perror("Can't dup pipe fd");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Attach to a pipe created by another task.
+ *
+ * Waits until the creator publishes its real pid in @pi, opens the
+ * matching end through /proc/<real_pid>/fd/<n> with e->flags, drops one
+ * user from pi->users and moves the descriptor to e->fd.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int attach_pipe(int pid, struct pipes_entry *e, struct pipes_info *pi)
+{
+	char path[128];
+	int tmp, fd;
+
+	printf("\t%d: Wating for pipe %x to appear\n", pid, e->pipeid);
+
+	while (pi->real_pid == 0)
+		usleep(1000);
+
+	if (e->flags & O_WRONLY)
+		tmp = pi->write_fd;
+	else
+		tmp = pi->read_fd;
+
+	sprintf(path, "/proc/%d/fd/%d", pi->real_pid, tmp);
+	printf("\t%d: Attaching pipe %s\n", pid, path);
+
+	fd = open(path, e->flags);
+	if (fd < 0) {
+		perror("Can't attach pipe");
+		return 1;
+	}
+
+	printf("\t%d: Done, reopening for %d\n", pid, (int)e->fd);
+	/*
+	 * Several tasks may attach at the same time and the counter is
+	 * polled by the creator, so the decrement must not lose updates.
+	 */
+	__sync_fetch_and_sub(&pi->users, 1);
+	tmp = reopen_fd(fd, e->fd);
+	if (tmp < 0) {
+		perror("Can't dup to attach pipe");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore one pipe end described by @e.
+ *
+ * When the target descriptor e->fd collides with the pipes image
+ * descriptor, the image is duplicated first and *@pipes_fd is updated.
+ * The task recorded in the pipe's pipes_info creates the pipe; every
+ * other task attaches to it.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int open_pipe(int pid, struct pipes_entry *e, int *pipes_fd)
+{
+	struct pipes_info *info;
+
+	printf("\t%d: Opening pipe %x on fd %d\n", pid, e->pipeid, e->fd);
+	if (e->fd == *pipes_fd) {
+		int moved = dup(*pipes_fd);
+
+		if (moved < 0) {
+			perror("Can't dup file");
+			return 1;
+		}
+
+		*pipes_fd = moved;
+	}
+
+	info = search_pipes(e->pipeid);
+	if (info == NULL) {
+		fprintf(stderr, "BUG: can't find my pipe %x\n", e->pipeid);
+		return 1;
+	}
+
+	return info->pid == pid ? create_pipe(pid, e, info, *pipes_fd)
+				: attach_pipe(pid, e, info);
+}
+
+/*
+ * Restore the pipes recorded in pipes-<pid>.img: check PIPES_MAGIC and
+ * hand every entry to open_pipe() until the end of the image.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int prepare_pipes(int pid)
+{
+	char img[64];
+	int img_fd, got;
+	__u32 magic = 0;
+	struct pipes_entry ent;
+
+	printf("%d: Opening pipes\n", pid);
+
+	sprintf(img, "pipes-%d.img", pid);
+	img_fd = open(img, O_RDONLY);
+	if (img_fd < 0) {
+		perror("Can't open pipes img");
+		return 1;
+	}
+
+	read(img_fd, &magic, sizeof(magic));
+	if (magic != PIPES_MAGIC) {
+		perror("Bad pipes file");
+		return 1;
+	}
+
+	for (;;) {
+		got = read(img_fd, &ent, sizeof(ent));
+		if (got == 0)
+			break;
+
+		if (got != sizeof(ent)) {
+			perror("Bad pipes entry");
+			return 1;
+		}
+
+		/* open_pipe() may move the image to another descriptor. */
+		if (open_pipe(pid, &ent, &img_fd))
+			return 1;
+	}
+
+	close(img_fd);
+	return 0;
+}
+
+/*
+ * Restore a single task: pipes, then files, then shared memory info,
+ * and finally execute its image. The first failing step aborts the
+ * sequence.
+ *
+ * Returns 1 on a preparation error, otherwise the execute_image()
+ * result.
+ */
+static int restore_one_task(int pid)
+{
+	printf("%d: Restoring resources\n", pid);
+
+	/* Evaluated left to right; stops at the first failure. */
+	if (prepare_pipes(pid) || prepare_fds(pid) || prepare_shmem(pid))
+		return 1;
+
+	return execute_image(pid);
+}
+
+/*
+ * clone() entry point: @arg carries the pstree image path, and the task
+ * to restore is identified by the pid this process runs under.
+ */
+static int do_child(void *arg)
+{
+	char *pstree_path = arg;
+
+	return restore_task_with_children(getpid(), pstree_path);
+}
+
+/*
+ * Start a child running do_child() on a freshly mapped 4-page stack.
+ * @pid is passed by address as the last clone() argument together with
+ * the CLONE_CHILD_USEPID flag; @pstree_path becomes the child's
+ * argument.
+ *
+ * Returns the clone() result, or -1 when the stack cannot be mapped.
+ */
+static inline int fork_with_pid(int pid, char *pstree_path)
+{
+	const size_t stack_size = 4 * 4096;
+	void *stack_base;
+
+	stack_base = mmap(0, stack_size, PROT_READ | PROT_WRITE,
+			  MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0);
+	if (stack_base == MAP_FAILED)
+		return -1;
+
+	/* clone() is given the highest address of the mapping. */
+	return clone(do_child, (char *)stack_base + stack_size,
+		     SIGCHLD | CLONE_CHILD_USEPID, pstree_path, NULL, NULL, &pid);
+}
+
+/*
+ * Restore task @my_pid together with its subtree.
+ *
+ * The pstree image is reopened and scanned for the entry of @my_pid; a
+ * child is started with fork_with_pid() for every pid listed after that
+ * entry. The task then registers its real pid for shared memory and
+ * restores itself with restore_one_task().
+ *
+ * Any failure before restore_one_task() terminates the process with
+ * exit(1). Returns the restore_one_task() result.
+ */
+static int restore_task_with_children(int my_pid, char *pstree_path)
+{
+	int *pids;
+	int fd, ret, i;
+	struct pstree_entry e;
+
+	printf("%d: Starting restore\n", my_pid);
+
+	fd = open(pstree_path, O_RDONLY);
+	if (fd < 0) {
+		perror("Can't reopen pstree image");
+		exit(1);
+	}
+
+	/* Skip the leading __u32 of the image. */
+	lseek(fd, sizeof(__u32), SEEK_SET);
+	while (1) {
+		ret = read(fd, &e, sizeof(e));
+		if (ret != sizeof(e)) {
+			fprintf(stderr, "%d: Read returned %d\n", my_pid, ret);
+			if (ret < 0)
+				perror("Can't read pstree");
+			exit(1);
+		}
+
+		if (e.pid != my_pid) {
+			/* Not ours: jump over this entry's children list. */
+			lseek(fd, e.nr_children * sizeof(__u32), SEEK_CUR);
+			continue;
+		}
+
+		break;
+	}
+
+	if (e.nr_children > 0) {
+		i = e.nr_children * sizeof(int);
+		pids = malloc(i);
+		if (pids == NULL) {
+			perror("Can't allocate children pids");
+			exit(1);
+		}
+
+		ret = read(fd, pids, i);
+		if (ret != i) {
+			perror("Can't read children pids");
+			exit(1);
+		}
+
+		close(fd);
+
+		printf("%d: Restoring %d children:\n", my_pid, e.nr_children);
+		for (i = 0; i < e.nr_children; i++) {
+			printf("\tFork %d from %d\n", pids[i], my_pid);
+			ret = fork_with_pid(pids[i], pstree_path);
+			if (ret < 0) {
+				perror("Can't fork kid");
+				exit(1);
+			}
+		}
+
+		free(pids);
+	} else
+		close(fd);
+
+	shmem_update_real_pid(my_pid, getpid());
+
+	return restore_one_task(my_pid);
+}
+
+/*
+ * Read the next pstree entry from @fd (the root task), close the image
+ * and start the root with fork_with_pid(); then wait for one child to
+ * change state.
+ *
+ * Returns 0 once the wait is over, 1 when the entry cannot be read or
+ * the root cannot be started.
+ */
+static int restore_root_task(char *pstree_path, int fd)
+{
+	struct pstree_entry root;
+	int got;
+
+	got = read(fd, &root, sizeof(root));
+	if (got != sizeof(root)) {
+		perror("Can't read root pstree entry");
+		return 1;
+	}
+
+	close(fd);
+
+	printf("Forking root with %d pid\n", root.pid);
+	if (fork_with_pid(root.pid, pstree_path) < 0) {
+		perror("Can't fork root");
+		return 1;
+	}
+
+	wait(NULL);
+	return 0;
+}
+
+/*
+ * Restore a whole process tree from pstree-<pid>.img: check
+ * PSTREE_MAGIC, gather the shared resources of all tasks and start the
+ * root task.
+ *
+ * Returns 0 on success, 1 on error.
+ */
+static int restore_all_tasks(char *pid)
+{
+	char img[128];
+	int img_fd;
+	__u32 magic = 0;
+
+	sprintf(img, "pstree-%s.img", pid);
+	img_fd = open(img, O_RDONLY);
+	if (img_fd < 0) {
+		perror("Can't open pstree image");
+		return 1;
+	}
+
+	read(img_fd, &magic, sizeof(magic));
+	if (magic != PSTREE_MAGIC) {
+		perror("Bad pstree magic");
+		return 1;
+	}
+
+	return prepare_shared(img_fd) ? 1 : restore_root_task(img, img_fd);
+}
+
+/*
+ * Entry point: "-p <pid>" restores a single task, "-t <pid>" restores
+ * the whole tree rooted at that pid. Anything else prints the usage
+ * line and fails.
+ */
+int main(int argc, char **argv)
+{
+	if (argc == 3 && argv[1][0] == '-') {
+		switch (argv[1][1]) {
+		case 'p':
+			return restore_one_task(atoi(argv[2]));
+		case 't':
+			return restore_all_tasks(argv[2]);
+		}
+	}
+
+	printf("Usage: %s (-t|-p) <pid>\n", argv[0]);
+	return 1;
+}
diff --git a/xemul/img-show.c b/xemul/img-show.c
new file mode 100644
index 000000000..4d1ad22f8
--- /dev/null
+++ b/xemul/img-show.c
@@ -0,0 +1,354 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <linux/types.h>
+#include <string.h>
+#include "img_structs.h"
+#include "binfmt_img.h"
+
+/*
+ * Print every entry of an fdinfo image. Each entry is followed by e.len
+ * bytes of path, which are read and printed along with it.
+ *
+ * Returns 0 at the end of the image, 1 on a read error or an unknown
+ * entry type.
+ */
+static int show_fdinfo(int fd)
+{
+	char data[1024];
+	struct fdinfo_entry e;
+
+	while (1) {
+		int ret;
+
+		ret = read(fd, &e, sizeof(e));
+		if (ret == 0)
+			break;
+		if (ret != sizeof(e)) {
+			perror("Can't read");
+			return 1;
+		}
+
+		ret = read(fd, data, e.len);
+		if (ret != e.len) {
+			perror("Can't read");
+			return 1;
+		}
+
+		/* The path is stored without a terminator. */
+		data[e.len] = '\0';
+		switch (e.type) {
+		case FDINFO_FD:
+			/* pos is 32-bit: print it with a matching conversion. */
+			printf("fd %d [%s] pos %x flags %o\n", (int)e.addr, data,
+					(unsigned int)e.pos, (unsigned int)e.flags);
+			break;
+		case FDINFO_MAP:
+			printf("map %llx [%s] flags %o\n", (unsigned long long)e.addr,
+					data, (unsigned int)e.flags);
+			break;
+		default:
+			fprintf(stderr, "Unknown fdinfo entry type %d\n", e.type);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/* Size of one page of data stored after each address in an image. */
+#define PAGE_SIZE 4096
+
+/*
+ * Print the (address, page) records found at the current position of
+ * @fd. For every page only its first and last 32-bit words are shown.
+ * The list ends at the end of the file or at a zero address.
+ *
+ * Returns 0 on success, 1 on a read error or a truncated record.
+ */
+static int show_mem(int fd)
+{
+	__u64 vaddr;
+	unsigned int data[2];
+
+	while (1) {
+		ssize_t ret;
+
+		ret = read(fd, &vaddr, sizeof(vaddr));
+		if (ret == 0)
+			break;
+		/* Without this check a read error would loop forever. */
+		if (ret != sizeof(vaddr)) {
+			perror("Can't read page address");
+			return 1;
+		}
+		if (vaddr == 0)
+			break;
+
+		if (read(fd, &data[0], sizeof(unsigned int)) != sizeof(unsigned int)) {
+			perror("Can't read page data");
+			return 1;
+		}
+		/* Jump to the last word of the page. */
+		lseek(fd, PAGE_SIZE - 2 * sizeof(unsigned int), SEEK_CUR);
+		if (read(fd, &data[1], sizeof(unsigned int)) != sizeof(unsigned int)) {
+			perror("Can't read page data");
+			return 1;
+		}
+
+		printf("\tpage 0x%lx [%x...%x]\n", (unsigned long)vaddr, data[0], data[1]);
+	}
+
+	return 0;
+}
+
+/* Show a pages image: its body is a plain list of page records. */
+static int show_pages(int fd)
+{
+	int rc = show_mem(fd);
+
+	return rc;
+}
+
+/*
+ * Print every entry of a shmem image as "start-end shmid".
+ *
+ * Returns 0 at the end of the image, 1 on a read error.
+ */
+static int show_shmem(int fd)
+{
+	int r;
+	struct shmem_entry e;
+
+	while (1) {
+		r = read(fd, &e, sizeof(e));
+		if (r == 0)
+			return 0;
+		if (r != sizeof(e)) {
+			perror("Can't read shmem entry");
+			return 1;
+		}
+
+		/* All three fields are 64-bit; convert them explicitly. */
+		printf("%016llx-%016llx %016llx\n",
+				(unsigned long long)e.start,
+				(unsigned long long)e.end,
+				(unsigned long long)e.shmid);
+	}
+}
+
+/*
+ * Translate a segment value from the image into a short printable
+ * name. Exact matches are tried first, then the TLS and LDT bits.
+ */
+static char *segval(__u16 seg)
+{
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+		return "nul";
+	case CKPT_X86_SEG_USER32_CS:
+		return "cs32";
+	case CKPT_X86_SEG_USER32_DS:
+		return "ds32";
+	case CKPT_X86_SEG_USER64_CS:
+		return "cs64";
+	case CKPT_X86_SEG_USER64_DS:
+		return "ds64";
+	default:
+		break;
+	}
+
+	/* TLS takes precedence over LDT when both bits are set. */
+	if (seg & CKPT_X86_SEG_TLS)
+		return "tls";
+
+	return (seg & CKPT_X86_SEG_LDT) ? "ldt" : "[unknown]";
+}
+
+/*
+ * Read the registers block from the current position of @fd and print
+ * it: general purpose registers as hex values, segment values as names
+ * (see segval()), then the three tls slots.
+ *
+ * Returns 0 on success, 1 when the block cannot be read in full.
+ */
+static int show_regs(int fd)
+{
+ struct binfmt_regs_image ri;
+
+ if (read(fd, &ri, sizeof(ri)) != sizeof(ri)) {
+ perror("Can't read registers from image");
+ return 1;
+ }
+
+ printf("Registers:\n");
+
+ printf("\tr15: %016lx\n", ri.r.r15);
+ printf("\tr14: %016lx\n", ri.r.r14);
+ printf("\tr13: %016lx\n", ri.r.r13);
+ printf("\tr12: %016lx\n", ri.r.r12);
+ printf("\tr11: %016lx\n", ri.r.r11);
+ printf("\tr10: %016lx\n", ri.r.r10);
+ printf("\tr9: %016lx\n", ri.r.r9);
+ printf("\tr8: %016lx\n", ri.r.r8);
+ printf("\tax: %016lx\n", ri.r.ax);
+ printf("\torig_ax: %016lx\n", ri.r.orig_ax);
+ printf("\tbx: %016lx\n", ri.r.bx);
+ printf("\tcx: %016lx\n", ri.r.cx);
+ printf("\tdx: %016lx\n", ri.r.dx);
+ printf("\tsi: %016lx\n", ri.r.si);
+ printf("\tdi: %016lx\n", ri.r.di);
+ printf("\tip: %016lx\n", ri.r.ip);
+ printf("\tflags: %016lx\n", ri.r.flags);
+ printf("\tbp: %016lx\n", ri.r.bp);
+ printf("\tsp: %016lx\n", ri.r.sp);
+ printf("\tgs: %016lx\n", ri.r.gs);
+ printf("\tfs: %016lx\n", ri.r.fs);
+ /* Segment values are shown by name rather than as raw numbers. */
+ printf("\tgsindex: %s\n", segval(ri.r.gsindex));
+ printf("\tfsindex: %s\n", segval(ri.r.fsindex));
+ printf("\tcs: %s\n", segval(ri.r.cs));
+ printf("\tss: %s\n", segval(ri.r.ss));
+ printf("\tds: %s\n", segval(ri.r.ds));
+ printf("\tes: %s\n", segval(ri.r.es));
+
+ printf("\ttls0 %016lx\n", ri.r.tls[0]);
+ printf("\ttls1 %016lx\n", ri.r.tls[1]);
+ printf("\ttls2 %016lx\n", ri.r.tls[2]);
+
+ return 0;
+}
+
+/*
+ * Read the mm block from the current position of @fd and print its
+ * fields. The start_stack value is also stored in *@stack for the
+ * caller.
+ *
+ * Returns 0 on success, 1 when the block cannot be read in full (in
+ * which case *@stack is left untouched).
+ */
+static int show_mm(int fd, unsigned long *stack)
+{
+ struct binfmt_mm_image mi;
+
+ if (read(fd, &mi, sizeof(mi)) != sizeof(mi)) {
+ perror("Can't read mm from image");
+ return 1;
+ }
+
+ printf("MM:\n");
+ printf("\tflags: %016lx\n", mi.flags);
+ printf("\tdef_flags: %016lx\n", mi.def_flags);
+ printf("\tstart_code: %016lx\n", mi.start_code);
+ printf("\tend_code: %016lx\n", mi.end_code);
+ printf("\tstart_data: %016lx\n", mi.start_data);
+ printf("\tend_data: %016lx\n", mi.end_data);
+ printf("\tstart_brk: %016lx\n", mi.start_brk);
+ printf("\tbrk: %016lx\n", mi.brk);
+ printf("\tstart_stack: %016lx\n", mi.start_stack);
+ printf("\targ_start: %016lx\n", mi.arg_start);
+ printf("\targ_end: %016lx\n", mi.arg_end);
+ printf("\tenv_start: %016lx\n", mi.env_start);
+ printf("\tenv_end: %016lx\n", mi.env_end);
+
+ /* Handed back so the caller can mark the vma containing the stack. */
+ *stack = mi.start_stack;
+
+ return 0;
+}
+
+/*
+ * Print the vma records found at the current position of @fd, tagging
+ * the one whose range contains @stack. A record with start == 0 and
+ * end == 0 ends the list.
+ *
+ * Returns 0 at the terminating record, 1 on a read error.
+ */
+static int show_vmas(int fd, unsigned long stack)
+{
+	struct binfmt_vma_image vma;
+
+	printf("VMAs:\n");
+	for (;;) {
+		char *tag;
+
+		if (read(fd, &vma, sizeof(vma)) != sizeof(vma)) {
+			perror("Can't read vma from image");
+			return 1;
+		}
+
+		if (vma.start == 0 && vma.end == 0)
+			return 0;
+
+		tag = (vma.start <= stack && vma.end >= stack) ? "[stack]" : "";
+
+		printf("\t%016lx-%016lx file %d %016lx prot %x flags %x %s\n",
+				vma.start, vma.end, vma.fd, vma.pgoff,
+				vma.prot, vma.flags, tag);
+	}
+}
+
+/* Print the "Pages:" heading followed by the page records of a core image. */
+static int show_privmem(int fd)
+{
+	fputs("Pages:\n", stdout);
+
+	return show_mem(fd);
+}
+
+/*
+ * Show a core image: check the version word, skip the pad word, then
+ * print registers, mm, vmas and private pages in that order.
+ *
+ * Returns 0 on success, 1 on an unsupported version or when any section
+ * fails to print.
+ */
+static int show_core(int fd)
+{
+	__u32 word = 0;
+	unsigned long stack_addr;
+
+	read(fd, &word, 4);
+	if (word != BINFMT_IMG_VERS_0) {
+		printf("Unsupported version %d\n", word);
+		return 1;
+	}
+
+	/* the pad */
+	read(fd, &word, 4);
+
+	printf("Showing version 0\n");
+
+	/* Sections are stored back to back; stop at the first failure. */
+	if (show_regs(fd) ||
+	    show_mm(fd, &stack_addr) ||
+	    show_vmas(fd, stack_addr) ||
+	    show_privmem(fd))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Print a pstree image, one line per task: "<pid>:" followed by the
+ * pids of its children. Each entry is followed in the image by
+ * nr_children 32-bit pids.
+ *
+ * Returns 0 at the end of the image, 1 on a read or allocation error.
+ */
+static int show_pstree(int fd)
+{
+	int ret;
+	struct pstree_entry e;
+
+	while (1) {
+		__u32 i;
+		__u32 *ch = NULL;
+		size_t len;
+
+		ret = read(fd, &e, sizeof(e));
+		if (ret == 0)
+			return 0;
+		if (ret != sizeof(e)) {
+			perror("Can't read processes entry");
+			return 1;
+		}
+
+		printf("%d:", e.pid);
+		len = (size_t)e.nr_children * sizeof(__u32);
+		if (len) {
+			ch = malloc(len);
+			if (ch == NULL) {
+				perror("Can't allocate children list");
+				return 1;
+			}
+
+			if (read(fd, ch, len) != (ssize_t)len) {
+				perror("Can't read children list");
+				free(ch);
+				return 1;
+			}
+		}
+
+		for (i = 0; i < e.nr_children; i++)
+			printf(" %d", (int)ch[i]);
+		printf("\n");
+
+		/* The list is per entry; release it before the next one. */
+		free(ch);
+	}
+}
+
+/*
+ * Print every entry of a pipes image: descriptor, pipe id, flags and
+ * byte count, plus up to the first 16 bytes of the data stored after
+ * the entry. Entries with O_WRONLY set must carry no data.
+ *
+ * Returns 0 at the end of the image, 1 on a read error or a bogus
+ * entry.
+ */
+static int show_pipes(int fd)
+{
+	struct pipes_entry e;
+	int ret;
+	char buf[17];
+
+	while (1) {
+		__u32 shown;
+
+		ret = read(fd, &e, sizeof(e));
+		if (ret == 0)
+			break;
+		if (ret != sizeof(e)) {
+			perror("Can't read pipe entry");
+			return 1;
+		}
+
+		/* pipeid is 32-bit: print it with a matching conversion. */
+		printf("%d: %x %o %d ", (int)e.fd, e.pipeid, e.flags, (int)e.bytes);
+		if (e.flags & O_WRONLY) {
+			printf("\n");
+
+			if (e.bytes) {
+				printf("Bogus pipe\n");
+				return 1;
+			}
+
+			continue;
+		}
+
+		/* buf keeps one spare byte, so the preview stays terminated. */
+		memset(buf, 0, sizeof(buf));
+		shown = e.bytes > 16 ? 16 : e.bytes;
+
+		if (read(fd, buf, shown) != (ssize_t)shown) {
+			perror("Can't read pipe data");
+			return 1;
+		}
+		printf("\t[%s", buf);
+		if (shown < e.bytes)
+			printf("...");
+		printf("]\n");
+		/* Skip the part of the data that was not shown. */
+		lseek(fd, e.bytes - shown, SEEK_CUR);
+	}
+
+	return 0;
+}
+
+/*
+ * Entry point: open the image named by the first argument, read its
+ * leading 32-bit magic and dispatch to the matching show_*() routine.
+ *
+ * Returns the result of that routine, or 1 when the argument is
+ * missing, the file cannot be opened or read, or the magic is unknown.
+ */
+int main(int argc, char **argv)
+{
+	__u32 type = 0;
+	int fd;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s <image>\n", argc > 0 ? argv[0] : "img-show");
+		return 1;
+	}
+
+	fd = open(argv[1], O_RDONLY);
+	if (fd < 0) {
+		perror("Can't open");
+		return 1;
+	}
+
+	if (read(fd, &type, sizeof(type)) != sizeof(type)) {
+		fprintf(stderr, "Can't read image magic\n");
+		close(fd);
+		return 1;
+	}
+
+	if (type == FDINFO_MAGIC)
+		return show_fdinfo(fd);
+	if (type == PAGES_MAGIC)
+		return show_pages(fd);
+	if (type == SHMEM_MAGIC)
+		return show_shmem(fd);
+	if (type == PSTREE_MAGIC)
+		return show_pstree(fd);
+	if (type == PIPES_MAGIC)
+		return show_pipes(fd);
+	if (type == BINFMT_IMG_MAGIC)
+		return show_core(fd);
+
+	printf("Unknown file type 0x%x\n", type);
+	return 1;
+}
diff --git a/xemul/img_structs.h b/xemul/img_structs.h
new file mode 100644
index 000000000..9e52d5da6
--- /dev/null
+++ b/xemul/img_structs.h
@@ -0,0 +1,39 @@
+
+/*
+ * Layout of the image files: every image starts with a 32-bit magic
+ * that identifies its type, followed by entries of the matching
+ * structure.
+ */
+#ifndef IMG_STRUCTS_H
+#define IMG_STRUCTS_H
+
+/* Provides the __u8 ... __u64 types used below. */
+#include <linux/types.h>
+
+#define FDINFO_MAGIC	0x01010101
+
+/* One opened file or file mapping; followed by len bytes of path. */
+struct fdinfo_entry {
+	__u8	type;	/* FDINFO_FD or FDINFO_MAP */
+	__u8	len;	/* length of the path stored after the entry */
+	__u16	flags;	/* flags to open the path with */
+	__u32	pos;	/* file position */
+	__u64	addr;	/* descriptor number (FD) or mapping address (MAP) */
+};
+
+#define FDINFO_FD	1
+#define FDINFO_MAP	2
+
+#define PAGES_MAGIC	0x20202020
+
+#define SHMEM_MAGIC	0x03300330
+
+/* One shared memory region: its address range and id. */
+struct shmem_entry {
+	__u64	start;
+	__u64	end;
+	__u64	shmid;
+};
+
+#define PSTREE_MAGIC	0x40044004
+
+/* One task; followed by nr_children 32-bit pids of its children. */
+struct pstree_entry {
+	__u32	pid;
+	__u32	nr_children;
+};
+
+#define PIPES_MAGIC	0x05055050
+
+/* One pipe end; followed by bytes bytes of data held in the pipe. */
+struct pipes_entry {
+	__u32	fd;	/* descriptor number of this end */
+	__u32	pipeid;	/* id shared by all ends of the same pipe */
+	__u32	flags;	/* open flags; O_WRONLY marks the write end */
+	__u32	bytes;	/* amount of data stored after the entry */
+};
+
+#endif /* IMG_STRUCTS_H */
diff --git a/xemul/readme b/xemul/readme
new file mode 100644
index 000000000..7a7c1c3e1
--- /dev/null
+++ b/xemul/readme
@@ -0,0 +1,2 @@
+Previous version of C/R -- uses an in-kernel dumper/restorer.
+It is kept here for reference only and is not used by crtools itself.