Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/compel
diff options
context:
space:
mode:
authorCyrill Gorcunov <gorcunov@openvz.org>2016-10-31 15:06:48 +0300
committerAndrei Vagin <avagin@virtuozzo.com>2017-03-15 00:06:06 +0300
commitcbe24fb9c438f147fdd70c41f2a66844457ffed4 (patch)
tree375b747a15aa530d377e4be25ecfb42374d8dd61 /compel
parent51092282de6daca5cc8927c88b52e58314d40c00 (diff)
compel: Move in parasite engine
This is the final patch in the series. It does a bunch of renames and fixes headers respectively. Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
Diffstat (limited to 'compel')
-rw-r--r--compel/Makefile5
-rw-r--r--compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h (renamed from compel/arch/aarch64/src/lib/include/ptrace.h)4
-rw-r--r--compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h57
-rw-r--r--compel/arch/aarch64/src/lib/infect.c111
-rw-r--r--compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h (renamed from compel/arch/arm/src/lib/include/ptrace.h)4
-rw-r--r--compel/arch/arm/src/lib/include/uapi/asm/infect-types.h91
-rw-r--r--compel/arch/arm/src/lib/include/uapi/asm/sigframe.h2
-rw-r--r--compel/arch/arm/src/lib/infect.c122
-rw-r--r--compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h (renamed from compel/arch/ppc64/src/lib/include/ptrace.h)4
-rw-r--r--compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h110
-rw-r--r--compel/arch/ppc64/src/lib/infect.c318
-rw-r--r--compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h (renamed from compel/arch/x86/src/lib/include/ptrace.h)4
-rw-r--r--compel/arch/x86/src/lib/include/uapi/asm/infect-types.h159
-rw-r--r--compel/arch/x86/src/lib/infect.c351
-rw-r--r--compel/include/infect-priv.h59
-rw-r--r--compel/include/rpc-pie-priv.h48
-rw-r--r--compel/include/uapi/compel.h6
-rw-r--r--compel/include/uapi/infect-rpc.h17
-rw-r--r--compel/include/uapi/infect-util.h5
-rw-r--r--compel/include/uapi/infect.h147
-rw-r--r--compel/include/uapi/ptrace.h76
-rw-r--r--compel/src/lib/infect-rpc.c101
-rw-r--r--compel/src/lib/infect-util.c21
-rw-r--r--compel/src/lib/infect.c1262
-rw-r--r--compel/src/lib/ptrace.c100
25 files changed, 3176 insertions, 8 deletions
diff --git a/compel/Makefile b/compel/Makefile
index d421bc078..ad98e9d6d 100644
--- a/compel/Makefile
+++ b/compel/Makefile
@@ -23,6 +23,11 @@ lib-y += src/lib/log.o
host-lib-y += src/lib/log.o
lib-y += arch/$(ARCH)/src/lib/cpu.o
+lib-y += arch/$(ARCH)/src/lib/infect.o
+lib-y += src/lib/infect-rpc.o
+lib-y += src/lib/infect-util.o
+lib-y += src/lib/infect.o
+lib-y += src/lib/ptrace.o
ifeq ($(ARCH),x86)
lib-y += src/lib/handle-elf-32.o
diff --git a/compel/arch/aarch64/src/lib/include/ptrace.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h
index e18454df2..5f090490d 100644
--- a/compel/arch/aarch64/src/lib/include/ptrace.h
+++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h
@@ -1,5 +1,5 @@
-#ifndef __COMPEL_PTRACE_H__
-#define __COMPEL_PTRACE_H__
+#ifndef __COMPEL_BREAKPOINTS_H__
+#define __COMPEL_BREAKPOINTS_H__
#define ARCH_SI_TRAP TRAP_BRKPT
static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h
new file mode 100644
index 000000000..714881c57
--- /dev/null
+++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h
@@ -0,0 +1,57 @@
+#ifndef UAPI_COMPEL_ASM_TYPES_H__
+#define UAPI_COMPEL_ASM_TYPES_H__
+
+#include <stdint.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <asm/ptrace.h>
+#include "common/page.h"
+
+#define SIGMAX 64
+#define SIGMAX_OLD 31
+
+/*
+ * Copied from the Linux kernel header arch/arm64/include/uapi/asm/ptrace.h
+ *
+ * A thread ARM CPU context
+ */
+
+typedef struct user_pt_regs user_regs_struct_t;
+typedef struct user_fpsimd_state user_fpregs_struct_t;
+
+#define REG_RES(r) ((uint64_t)(r).regs[0])
+#define REG_IP(r) ((uint64_t)(r).pc)
+#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8])
+
+#define user_regs_native(pregs) true
+
+/*
+ * Range for task size calculated from the following Linux kernel files:
+ * arch/arm64/include/asm/memory.h
+ * arch/arm64/Kconfig
+ *
+ * TODO: handle 32 bit tasks
+ */
+#define TASK_SIZE_MIN (1UL << 39)
+#define TASK_SIZE_MAX (1UL << 48)
+
+/*
+ * Probe the size of the task address space.
+ *
+ * Starts at TASK_SIZE_MIN and doubles the candidate until either munmap()
+ * of one page at that address reports an error or TASK_SIZE_MAX is reached.
+ * Returns the candidate value at which the loop stopped.
+ *
+ * NOTE(review): presumably munmap() fails once the address lies outside
+ * the user address space -- confirm against the kernel behaviour.
+ */
+static inline unsigned long task_size(void)
+{
+ unsigned long task_size;
+
+ for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1)
+ if (munmap((void *)task_size, page_size()))
+ break;
+ return task_size;
+}
+
+#define AT_VECTOR_SIZE 40
+
+typedef uint64_t auxv_t;
+typedef uint64_t tls_t;
+
+#define ARCH_SI_TRAP TRAP_BRKPT
+
+#define __NR(syscall, compat) __NR_##syscall
+
+#endif /* UAPI_COMPEL_ASM_TYPES_H__ */
diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c
new file mode 100644
index 000000000..fceea2816
--- /dev/null
+++ b/compel/arch/aarch64/src/lib/infect.c
@@ -0,0 +1,111 @@
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <linux/elf.h>
+#include <compel/plugins/std/syscall-codes.h>
+#include "uapi/compel/asm/infect-types.h"
+#include "log.h"
+#include "errno.h"
+#include "infect.h"
+#include "infect-priv.h"
+
+/*
+ * Injected syscall instruction
+ */
+const char code_syscall[] = {
+ 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */
+ 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */
+};
+
+static const int
+code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long));
+
+static inline void __always_unused __check_code_syscall(void)
+{
+ BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE);
+ BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+}
+
+int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t save, void *arg)
+{
+ struct iovec iov;
+ user_fpregs_struct_t fpsimd;
+ int ret;
+
+ pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(user_regs_struct_t);
+ if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) {
+ pr_perror("Failed to obtain CPU registers for %d", pid);
+ goto err;
+ }
+
+ iov.iov_base = &fpsimd;
+ iov.iov_len = sizeof(fpsimd);
+ if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) {
+ pr_perror("Failed to obtain FPU registers for %d", pid);
+ goto err;
+ }
+
+ ret = save(arg, &regs, &fpsimd);
+err:
+ return ret;
+}
+
+int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6)
+{
+ user_regs_struct_t regs = ctl->orig.regs;
+ int err;
+
+ regs.regs[8] = (unsigned long)nr;
+ regs.regs[0] = arg1;
+ regs.regs[1] = arg2;
+ regs.regs[2] = arg3;
+ regs.regs[3] = arg4;
+ regs.regs[4] = arg5;
+ regs.regs[5] = arg6;
+ regs.regs[6] = 0;
+ regs.regs[7] = 0;
+
+ err = compel_execute_syscall(ctl, &regs, code_syscall);
+
+ *ret = regs.regs[0];
+ return err;
+}
+
+void *remote_mmap(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map;
+ int err;
+
+ err = compel_syscall(ctl, __NR_mmap, &map,
+ (unsigned long)addr, length, prot, flags, fd, offset);
+ if (err < 0 || (long)map < 0)
+ map = 0;
+
+ return (void *)map;
+}
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ regs->pc = new_ip;
+ if (stack)
+ regs->sp = (unsigned long)stack;
+}
+
+bool arch_can_dump_task(struct parasite_ctl *ctl)
+{
+ /*
+ * TODO: Add proper check here
+ */
+ return true;
+}
diff --git a/compel/arch/arm/src/lib/include/ptrace.h b/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h
index e18454df2..5f090490d 100644
--- a/compel/arch/arm/src/lib/include/ptrace.h
+++ b/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h
@@ -1,5 +1,5 @@
-#ifndef __COMPEL_PTRACE_H__
-#define __COMPEL_PTRACE_H__
+#ifndef __COMPEL_BREAKPOINTS_H__
+#define __COMPEL_BREAKPOINTS_H__
#define ARCH_SI_TRAP TRAP_BRKPT
static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
diff --git a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h
new file mode 100644
index 000000000..9c2092e5d
--- /dev/null
+++ b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h
@@ -0,0 +1,91 @@
+#ifndef UAPI_COMPEL_ASM_TYPES_H__
+#define UAPI_COMPEL_ASM_TYPES_H__
+
+#include <stdint.h>
+#include <sys/mman.h>
+#include "common/page.h"
+
+#define SIGMAX 64
+#define SIGMAX_OLD 31
+
+/*
+ * Copied from the Linux kernel header arch/arm/include/asm/ptrace.h
+ *
+ * A thread ARM CPU context
+ */
+
+typedef struct {
+ long uregs[18];
+} user_regs_struct_t;
+
+typedef struct user_vfp user_fpregs_struct_t;
+
+#define ARM_cpsr uregs[16]
+#define ARM_pc uregs[15]
+#define ARM_lr uregs[14]
+#define ARM_sp uregs[13]
+#define ARM_ip uregs[12]
+#define ARM_fp uregs[11]
+#define ARM_r10 uregs[10]
+#define ARM_r9 uregs[9]
+#define ARM_r8 uregs[8]
+#define ARM_r7 uregs[7]
+#define ARM_r6 uregs[6]
+#define ARM_r5 uregs[5]
+#define ARM_r4 uregs[4]
+#define ARM_r3 uregs[3]
+#define ARM_r2 uregs[2]
+#define ARM_r1 uregs[1]
+#define ARM_r0 uregs[0]
+#define ARM_ORIG_r0 uregs[17]
+
+
+/* Copied from arch/arm/include/asm/user.h */
+
+struct user_vfp {
+ unsigned long long fpregs[32];
+ unsigned long fpscr;
+};
+
+struct user_vfp_exc {
+ unsigned long fpexc;
+ unsigned long fpinst;
+ unsigned long fpinst2;
+};
+
+#define REG_RES(regs) ((regs).ARM_r0)
+#define REG_IP(regs) ((regs).ARM_pc)
+#define REG_SYSCALL_NR(regs) ((regs).ARM_r7)
+
+#define user_regs_native(pregs) true
+
+/*
+ * Range for task size calculated from the following Linux kernel files:
+ * arch/arm/include/asm/memory.h
+ * arch/arm/Kconfig (PAGE_OFFSET values in Memory split section)
+ */
+#define TASK_SIZE_MIN 0x3f000000
+#define TASK_SIZE_MAX 0xbf000000
+#define SZ_1G 0x40000000
+
+static inline unsigned long task_size(void)
+{
+ unsigned long task_size;
+
+ for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size += SZ_1G)
+ if (munmap((void *)task_size, page_size()))
+ break;
+
+ return task_size;
+}
+
+#define AT_VECTOR_SIZE 40
+
+typedef uint32_t auxv_t;
+typedef uint32_t tls_t;
+
+#define ARCH_SI_TRAP TRAP_BRKPT
+
+#define __NR(syscall, compat) __NR_##syscall
+
+#endif /* UAPI_COMPEL_ASM_TYPES_H__ */
diff --git a/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h b/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h
index 65ae8a8b9..3e7bc0104 100644
--- a/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h
+++ b/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h
@@ -1,6 +1,8 @@
#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__
#define UAPI_COMPEL_ASM_SIGFRAME_H__
+#include <compel/asm/infect-types.h>
+
/* Copied from the Linux kernel header arch/arm/include/asm/sigcontext.h */
struct rt_sigcontext {
diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c
new file mode 100644
index 000000000..b440ff736
--- /dev/null
+++ b/compel/arch/arm/src/lib/infect.c
@@ -0,0 +1,122 @@
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <compel/plugins/std/syscall-codes.h>
+#include <compel/asm/processor-flags.h>
+#include "uapi/compel/asm/infect-types.h"
+#include "log.h"
+#include "errno.h"
+#include "infect.h"
+#include "infect-priv.h"
+
+/*
+ * Injected syscall instruction
+ */
+const char code_syscall[] = {
+ 0x00, 0x00, 0x00, 0xef, /* SVC #0 */
+ 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */
+};
+
+static const int
+code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long));
+
+static inline __always_unused void __check_code_syscall(void)
+{
+ BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE);
+ BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+}
+
+#define PTRACE_GETVFPREGS 27
+int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t save, void *arg)
+{
+ user_fpregs_struct_t vfp;
+ int ret = -1;
+
+ pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+ if (ptrace(PTRACE_GETVFPREGS, pid, NULL, &vfp)) {
+ pr_perror("Can't obtain FPU registers for %d", pid);
+ goto err;
+ }
+
+ /* Did we come from a system call? */
+ if ((int)regs.ARM_ORIG_r0 >= 0) {
+ /* Restart the system call */
+ switch ((long)(int)regs.ARM_r0) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ regs.ARM_r0 = regs.ARM_ORIG_r0;
+ regs.ARM_pc -= 4;
+ break;
+ case -ERESTART_RESTARTBLOCK:
+ regs.ARM_r0 = __NR_restart_syscall;
+ regs.ARM_pc -= 4;
+ break;
+ }
+ }
+
+ ret = save(arg, &regs, &vfp);
+err:
+ return ret;
+}
+
+int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6)
+{
+ user_regs_struct_t regs = ctl->orig.regs;
+ int err;
+
+ regs.ARM_r7 = (unsigned long)nr;
+ regs.ARM_r0 = arg1;
+ regs.ARM_r1 = arg2;
+ regs.ARM_r2 = arg3;
+ regs.ARM_r3 = arg4;
+ regs.ARM_r4 = arg5;
+ regs.ARM_r5 = arg6;
+
+ err = compel_execute_syscall(ctl, &regs, code_syscall);
+
+ *ret = regs.ARM_r0;
+ return err;
+}
+
+/*
+ * Execute mmap2() inside the task controlled by @ctl.
+ *
+ * Returns the address reported by the remote syscall, or NULL (0) when
+ * @offset is not page aligned, when the syscall could not be executed, or
+ * when the returned value lies above the task size.
+ */
+void *remote_mmap(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map;
+ int err;
+
+ /* Offsets with bits set below the page boundary are rejected. */
+ if (offset & ~PAGE_MASK)
+ return 0;
+
+ /* mmap2() takes the file offset in 4096-byte units, hence the shift. */
+ err = compel_syscall(ctl, __NR_mmap2, &map,
+ (unsigned long)addr, length, prot, flags, fd, offset >> 12);
+ /*
+ * A value beyond the task size cannot be a valid mapping
+ * (presumably a negative errno seen as unsigned -- TODO confirm).
+ */
+ if (err < 0 || map > ctl->ictx.task_size)
+ map = 0;
+
+ return (void *)map;
+}
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ regs->ARM_pc = new_ip;
+ if (stack)
+ regs->ARM_sp = (unsigned long)stack;
+
+ /* Make sure flags are in known state */
+ regs->ARM_cpsr &= PSR_f | PSR_s | PSR_x | MODE32_BIT;
+}
+
+bool arch_can_dump_task(struct parasite_ctl *ctl)
+{
+ /*
+ * TODO: Add proper check here
+ */
+ return true;
+}
diff --git a/compel/arch/ppc64/src/lib/include/ptrace.h b/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h
index 0274c2675..1ab89af76 100644
--- a/compel/arch/ppc64/src/lib/include/ptrace.h
+++ b/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h
@@ -1,5 +1,5 @@
-#ifndef __COMPEL_PTRACE_H__
-#define __COMPEL_PTRACE_H__
+#ifndef __COMPEL_BREAKPOINTS_H__
+#define __COMPEL_BREAKPOINTS_H__
#define ARCH_SI_TRAP TRAP_BRKPT
static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h
new file mode 100644
index 000000000..f243def73
--- /dev/null
+++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h
@@ -0,0 +1,110 @@
+#ifndef UAPI_COMPEL_ASM_TYPES_H__
+#define UAPI_COMPEL_ASM_TYPES_H__
+
+#include <stdbool.h>
+#include <signal.h>
+#include <stdint.h>
+
+#define SIGMAX_OLD 31
+#define SIGMAX 64
+
+/*
+ * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h
+ */
+typedef struct {
+ unsigned long gpr[32];
+ unsigned long nip;
+ unsigned long msr;
+ unsigned long orig_gpr3; /* Used for restarting system calls */
+ unsigned long ctr;
+ unsigned long link;
+ unsigned long xer;
+ unsigned long ccr;
+ unsigned long softe; /* Soft enabled/disabled */
+ unsigned long trap; /* Reason for being here */
+ /*
+ * N.B. for critical exceptions on 4xx, the dar and dsisr
+ * fields are overloaded to hold srr0 and srr1.
+ */
+ unsigned long dar; /* Fault registers */
+ unsigned long dsisr; /* on 4xx/Book-E used for ESR */
+ unsigned long result; /* Result of a system call */
+} user_regs_struct_t;
+
+#define NVSXREG 32
+
+#define USER_FPREGS_FL_FP 0x00001
+#define USER_FPREGS_FL_ALTIVEC 0x00002
+#define USER_FPREGS_FL_VSX 0x00004
+#define USER_FPREGS_FL_TM 0x00010
+
+#ifndef NT_PPC_TM_SPR
+# define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */
+# define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */
+# define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */
+# define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */
+# define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */
+#endif
+
+#define MSR_TMA (1UL<<34) /* bit 29 Trans Mem state: Transactional */
+#define MSR_TMS (1UL<<33) /* bit 30 Trans Mem state: Suspended */
+#define MSR_TM (1UL<<32) /* bit 31 Trans Mem Available */
+#define MSR_VEC (1UL<<25)
+#define MSR_VSX (1UL<<23)
+
+#define MSR_TM_ACTIVE(x) ((((x) & MSR_TM) && ((x)&(MSR_TMA|MSR_TMS))) != 0)
+
+typedef struct {
+ uint64_t fpregs[NFPREG];
+ __vector128 vrregs[NVRREG];
+ uint64_t vsxregs[NVSXREG];
+
+ int flags;
+ struct tm_regs {
+ int flags;
+ struct {
+ uint64_t tfhar, texasr, tfiar;
+ } tm_spr_regs;
+ user_regs_struct_t regs;
+ uint64_t fpregs[NFPREG];
+ __vector128 vrregs[NVRREG];
+ uint64_t vsxregs[NVSXREG];
+ } tm;
+} user_fpregs_struct_t;
+
+#define REG_RES(regs) ((uint64_t)(regs).gpr[3])
+#define REG_IP(regs) ((uint64_t)(regs).nip)
+#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0])
+
+#define user_regs_native(pregs) true
+
+/*
+ * Copied from the following kernel header files :
+ * include/linux/auxvec.h
+ * arch/powerpc/include/uapi/asm/auxvec.h
+ * include/linux/mm_types.h
+ */
+#define AT_VECTOR_SIZE_BASE 20
+#define AT_VECTOR_SIZE_ARCH 6
+#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
+
+typedef uint64_t auxv_t;
+
+/* Not used but the structure parasite_dump_thread needs a tls_t field */
+typedef uint64_t tls_t;
+
+/*
+ * Copied for the Linux kernel arch/powerpc/include/asm/processor.h
+ *
+ * NOTE: 32bit tasks are not supported.
+ */
+#define TASK_SIZE_USER64 (0x0000400000000000UL)
+#define TASK_SIZE TASK_SIZE_USER64
+
+static inline unsigned long task_size(void) { return TASK_SIZE; }
+
+#define ARCH_SI_TRAP TRAP_BRKPT
+
+#define __NR(syscall, compat) __NR_##syscall
+
+#endif /* UAPI_COMPEL_ASM_TYPES_H__ */
diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c
new file mode 100644
index 000000000..959098b8c
--- /dev/null
+++ b/compel/arch/ppc64/src/lib/infect.c
@@ -0,0 +1,318 @@
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <stdint.h>
+#include <errno.h>
+#include <compel/plugins/std/syscall-codes.h>
+#include "uapi/compel/asm/infect-types.h"
+#include "errno.h"
+#include "log.h"
+#include "common/bug.h"
+#include "infect.h"
+#include "infect-priv.h"
+
+#ifndef NT_PPC_TM_SPR
+#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */
+#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */
+#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */
+#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */
+#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */
+#endif
+
+/*
+ * Injected syscall instruction
+ */
+const uint32_t code_syscall[] = {
+ 0x44000002, /* sc */
+ 0x0fe00000 /* twi 31,0,0 */
+};
+
+static inline void __check_code_syscall(void)
+{
+ BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
+ BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+}
+
+/* This is the layout of the POWER7 VSX registers and the way they
+ * overlap with the existing FPR and VMX registers.
+ *
+ * VSR doubleword 0 VSR doubleword 1
+ * ----------------------------------------------------------------
+ * VSR[0] | FPR[0] | |
+ * ----------------------------------------------------------------
+ * VSR[1] | FPR[1] | |
+ * ----------------------------------------------------------------
+ * | ... | |
+ * ----------------------------------------------------------------
+ * VSR[30] | FPR[30] | |
+ * ----------------------------------------------------------------
+ * VSR[31] | FPR[31] | |
+ * ----------------------------------------------------------------
+ * VSR[32] | VR[0] |
+ * ----------------------------------------------------------------
+ * VSR[33] | VR[1] |
+ * ----------------------------------------------------------------
+ * | ... |
+ * ----------------------------------------------------------------
+ * VSR[62] | VR[30] |
+ * ----------------------------------------------------------------
+ * VSR[63] | VR[31] |
+ * ----------------------------------------------------------------
+ *
+ * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR
+ * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE
+ * PTRACE_GETVSRREGS returns VSR[0..31]
+ *
+ * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need
+ * to save FPSCR too.
+ *
+ * There are 32 VSX double word registers to save since the 32 first VSX double
+ * word registers are saved through FPR[0..31] and the remaining registers
+ * are saved when saving the Altivec registers VR[0..31].
+ */
+
+static int get_fpu_regs(pid_t pid, user_fpregs_struct_t *fp)
+{
+ if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fp->fpregs) < 0) {
+ pr_perror("Couldn't get floating-point registers");
+ return -1;
+ }
+ fp->flags |= USER_FPREGS_FL_FP;
+
+ return 0;
+}
+
+static int get_altivec_regs(pid_t pid, user_fpregs_struct_t *fp)
+{
+ if (ptrace(PTRACE_GETVRREGS, pid, 0, (void*)&fp->vrregs) < 0) {
+ /* PTRACE_GETVRREGS returns EIO if Altivec is not supported.
+ * This should not happen if msr_vec is set. */
+ if (errno != EIO) {
+ pr_perror("Couldn't get Altivec registers");
+ return -1;
+ }
+ pr_debug("Altivec not supported\n");
+ }
+ else {
+ pr_debug("Dumping Altivec registers\n");
+ fp->flags |= USER_FPREGS_FL_ALTIVEC;
+ }
+ return 0;
+}
+
+/*
+ * Since the FPR[0-31] is stored in the first double word of VSR[0-31] and
+ * FPR are saved through the FP state, there is no need to save the upper part
+ * of the first 32 VSX registers.
+ * Furthermore, the 32 last VSX registers are also the 32 Altivec registers
+ * already saved, so no need to save them.
+ * As a consequence, only the doubleword 1 of the 32 first VSX registers have
+ * to be saved (the ones are returned by PTRACE_GETVSRREGS).
+ */
+static int get_vsx_regs(pid_t pid, user_fpregs_struct_t *fp)
+{
+ if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void*)fp->vsxregs) < 0) {
+ /*
+ * EIO is returned in the case PTRACE_GETVSRREGS is not
+ * supported. That is not treated as an error: 0 is
+ * returned without setting USER_FPREGS_FL_VSX.
+ */
+ if (errno != EIO) {
+ pr_perror("Couldn't get VSX registers");
+ return -1;
+ }
+ pr_debug("VSX register's dump not supported.\n");
+ }
+ else {
+ pr_debug("Dumping VSX registers\n");
+ /* Mark the vsxregs array as holding valid data. */
+ fp->flags |= USER_FPREGS_FL_VSX;
+ }
+ return 0;
+}
+
+/*
+ * Fetch the Transactional Memory state of task @pid into @fpregs->tm
+ * using PTRACE_GETREGSET.
+ *
+ * The SPR and checkpointed GPR sets are mandatory; the checkpointed
+ * FPR, VMX and VSX sets are optional and only flagged in
+ * @fpregs->tm.flags when the kernel provides them (EIO means "not
+ * supported" for the optional sets).
+ *
+ * Returns 0 on success, -1 if a mandatory set is missing or ptrace
+ * failed with an error other than EIO on an optional set.
+ */
+static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs)
+{
+ struct iovec iov;
+
+ pr_debug("Dumping TM registers\n");
+
+ /*
+ * The flags below are only ever OR-ed in and the caller may hand
+ * us an uninitialized on-stack structure, so start from a known
+ * empty state instead of whatever garbage was there.
+ */
+ fpregs->tm.flags = 0;
+
+#define TM_REQUIRED 0
+#define TM_OPTIONAL 1
+#define PTRACE_GET_TM(s,n,c,u) do { \
+ iov.iov_base = &s; \
+ iov.iov_len = sizeof(s); \
+ if (ptrace(PTRACE_GETREGSET, pid, c, &iov)) { \
+ if (!u || errno != EIO) { \
+ pr_perror("Couldn't get TM "n); \
+ pr_err("Your kernel seems to not support the " \
+ "new TM ptrace API (>= 4.8)\n"); \
+ goto out_free; \
+ } \
+ pr_debug("TM "n" not supported.\n"); \
+ iov.iov_base = NULL; \
+ } \
+} while(0)
+
+ /* Get special registers */
+ PTRACE_GET_TM(fpregs->tm.tm_spr_regs, "SPR", NT_PPC_TM_SPR, TM_REQUIRED);
+
+ /* Get checkpointed regular registers */
+ PTRACE_GET_TM(fpregs->tm.regs, "GPR", NT_PPC_TM_CGPR, TM_REQUIRED);
+
+ /*
+ * For the optional sets the macro resets iov.iov_base to NULL when
+ * the set is unsupported, which is what the checks below rely on.
+ */
+
+ /* Get checkpointed FP registers */
+ PTRACE_GET_TM(fpregs->tm.fpregs, "FPR", NT_PPC_TM_CFPR, TM_OPTIONAL);
+ if (iov.iov_base)
+ fpregs->tm.flags |= USER_FPREGS_FL_FP;
+
+ /* Get checkpointed VMX (Altivec) registers */
+ PTRACE_GET_TM(fpregs->tm.vrregs, "VMX", NT_PPC_TM_CVMX, TM_OPTIONAL);
+ if (iov.iov_base)
+ fpregs->tm.flags |= USER_FPREGS_FL_ALTIVEC;
+
+ /* Get checkpointed VSX registers */
+ PTRACE_GET_TM(fpregs->tm.vsxregs, "VSX", NT_PPC_TM_CVSX, TM_OPTIONAL);
+ if (iov.iov_base)
+ fpregs->tm.flags |= USER_FPREGS_FL_VSX;
+
+ return 0;
+
+out_free:
+ return -1; /* still failing the checkpoint */
+}
+
+static int __get_task_regs(pid_t pid, user_regs_struct_t *regs,
+ user_fpregs_struct_t *fpregs)
+{
+ pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+ /*
+ * This is inspired by kernel function check_syscall_restart in
+ * arch/powerpc/kernel/signal.c
+ */
+#ifndef TRAP
+#define TRAP(r) ((r).trap & ~0xF)
+#endif
+
+ if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) {
+ /* Restart the system call */
+ switch (regs->gpr[3]) {
+ case ERESTARTNOHAND:
+ case ERESTARTSYS:
+ case ERESTARTNOINTR:
+ regs->gpr[3] = regs->orig_gpr3;
+ regs->nip -= 4;
+ break;
+ case ERESTART_RESTARTBLOCK:
+ regs->gpr[0] = __NR_restart_syscall;
+ regs->nip -= 4;
+ break;
+ }
+ }
+
+ /* Resetting trap since we are now coming from user space. */
+ regs->trap = 0;
+
+ fpregs->flags = 0;
+ /*
+ * Check for Transactional Memory operation in progress.
+ * Until we have support of TM register's state through the ptrace API,
+ * we can't checkpoint process with TM operation in progress (almost
+ * impossible) or suspended (easy to get).
+ */
+ if (MSR_TM_ACTIVE(regs->msr)) {
+ pr_debug("Task %d has %s TM operation at 0x%lx\n",
+ pid,
+ (regs->msr & MSR_TMS) ? "a suspended" : "an active",
+ regs->nip);
+ if (get_tm_regs(pid, fpregs))
+ return -1;
+ fpregs->flags = USER_FPREGS_FL_TM;
+ }
+
+ if (get_fpu_regs(pid, fpregs))
+ return -1;
+
+ if (get_altivec_regs(pid, fpregs))
+ return -1;
+
+ if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) {
+ /*
+ * Save the VSX registers if Altivec registers are supported
+ */
+ if (get_vsx_regs(pid, fpregs))
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Collect the register state of task @pid and hand it to @save.
+ *
+ * @regs is received by value, so the adjustments done by
+ * __get_task_regs() affect only this local copy. @save is called with
+ * @arg, the adjusted general registers and the FP/vector state that was
+ * gathered.
+ *
+ * Returns the non-zero result of __get_task_regs() on failure, otherwise
+ * whatever @save returns.
+ */
+int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t save, void *arg)
+{
+ /* Not initialized here; __get_task_regs() resets fpregs.flags itself. */
+ user_fpregs_struct_t fpregs;
+ int ret;
+
+ ret = __get_task_regs(pid, &regs, &fpregs);
+ if (ret)
+ return ret;
+
+ return save(arg, &regs, &fpregs);
+}
+
+int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6)
+{
+ user_regs_struct_t regs = ctl->orig.regs;
+ int err;
+
+ regs.gpr[0] = (unsigned long)nr;
+ regs.gpr[3] = arg1;
+ regs.gpr[4] = arg2;
+ regs.gpr[5] = arg3;
+ regs.gpr[6] = arg4;
+ regs.gpr[7] = arg5;
+ regs.gpr[8] = arg6;
+
+ err = compel_execute_syscall(ctl, &regs, (char*)code_syscall);
+
+ *ret = regs.gpr[3];
+ return err;
+}
+
+void *remote_mmap(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map = 0;
+ int err;
+
+ err = compel_syscall(ctl, __NR_mmap, &map,
+ (unsigned long)addr, length, prot, flags, fd, offset);
+ if (err < 0 || (long)map < 0)
+ map = 0;
+
+ return (void *)map;
+}
+
+/*
+ * Prepare @regs so that execution starts at @new_ip, optionally on a
+ * new stack: @stack, when non-NULL, is loaded into r1.
+ */
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ /*
+ * OpenPOWER ABI requires that r12 is set to the address of the
+ * function being called so that it can compute the TOC pointer.
+ */
+ regs->gpr[12] = new_ip;
+ regs->nip = new_ip;
+ if (stack)
+ regs->gpr[1] = (unsigned long) stack;
+ /* Clear the recorded trap reason. */
+ regs->trap = 0;
+}
+
+bool arch_can_dump_task(struct parasite_ctl *ctl)
+{
+ /*
+ * TODO: We should detect 32bit task when BE support is done.
+ */
+ return true;
+}
diff --git a/compel/arch/x86/src/lib/include/ptrace.h b/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h
index 844ea0efd..980f25d06 100644
--- a/compel/arch/x86/src/lib/include/ptrace.h
+++ b/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h
@@ -1,5 +1,5 @@
-#ifndef __COMPEL_PTRACE_H__
-#define __COMPEL_PTRACE_H__
+#ifndef __COMPEL_BREAKPOINTS_H__
+#define __COMPEL_BREAKPOINTS_H__
#define ARCH_SI_TRAP SI_KERNEL
extern int ptrace_set_breakpoint(pid_t pid, void *addr);
extern int ptrace_flush_breakpoints(pid_t pid);
diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h
new file mode 100644
index 000000000..bbc6bcf22
--- /dev/null
+++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h
@@ -0,0 +1,159 @@
+#ifndef UAPI_COMPEL_ASM_TYPES_H__
+#define UAPI_COMPEL_ASM_TYPES_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include "log.h"
+#include "common/bug.h"
+#include "common/page.h"
+#include <compel/plugins/std/asm/syscall-types.h>
+
+#define SIGMAX 64
+#define SIGMAX_OLD 31
+
+typedef struct {
+ uint64_t r15;
+ uint64_t r14;
+ uint64_t r13;
+ uint64_t r12;
+ uint64_t bp;
+ uint64_t bx;
+ uint64_t r11;
+ uint64_t r10;
+ uint64_t r9;
+ uint64_t r8;
+ uint64_t ax;
+ uint64_t cx;
+ uint64_t dx;
+ uint64_t si;
+ uint64_t di;
+ uint64_t orig_ax;
+ uint64_t ip;
+ uint64_t cs;
+ uint64_t flags;
+ uint64_t sp;
+ uint64_t ss;
+ uint64_t fs_base;
+ uint64_t gs_base;
+ uint64_t ds;
+ uint64_t es;
+ uint64_t fs;
+ uint64_t gs;
+} user_regs_struct64;
+
+typedef struct {
+ uint32_t bx;
+ uint32_t cx;
+ uint32_t dx;
+ uint32_t si;
+ uint32_t di;
+ uint32_t bp;
+ uint32_t ax;
+ uint32_t ds;
+ uint32_t es;
+ uint32_t fs;
+ uint32_t gs;
+ uint32_t orig_ax;
+ uint32_t ip;
+ uint32_t cs;
+ uint32_t flags;
+ uint32_t sp;
+ uint32_t ss;
+} user_regs_struct32;
+
+#ifdef CONFIG_X86_64
+/*
+ * To be sure that we rely on inited reg->__is_native, this member
+ * is (short int) instead of initial (bool). The right way to
+ * check if regs are native or compat is to use user_regs_native() macro.
+ * This should cost nothing, as *usually* sizeof(bool) == sizeof(short)
+ */
+typedef struct {
+ union {
+ user_regs_struct64 native;
+ user_regs_struct32 compat;
+ };
+ short __is_native; /* use user_regs_native macro to check it */
+} user_regs_struct_t;
+
+#define NATIVE_MAGIC 0x0A
+#define COMPAT_MAGIC 0x0C
+static inline bool user_regs_native(user_regs_struct_t *pregs)
+{
+ return pregs->__is_native == NATIVE_MAGIC;
+}
+
+#define get_user_reg(pregs, name) \
+ ((user_regs_native(pregs)) ? \
+ ((pregs)->native.name) : \
+ ((pregs)->compat.name))
+
+#define set_user_reg(pregs, name, val) \
+ ((user_regs_native(pregs)) ? \
+ ((pregs)->native.name = (val)) : \
+ ((pregs)->compat.name = (val)))
+#else
+typedef struct {
+ union {
+ user_regs_struct32 native;
+ };
+} user_regs_struct_t;
+#define user_regs_native(pregs) true
+#define get_user_reg(pregs, name) ((pregs)->native.name)
+/* Parenthesize val so that any expression can be passed, as in the 64-bit variant. */
+#define set_user_reg(pregs, name, val) ((pregs)->native.name = (val))
+#endif
+
+#if 0
+typedef struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* Note this is not the same as
+ the 32bit/x87/FSAVE twd */
+ unsigned short fop;
+ u64 rip;
+ u64 rdp;
+ u32 mxcsr;
+ u32 mxcsr_mask;
+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+ u32 padding[24];
+} user_fpregs_struct_t;
+#endif
+
+typedef struct xsave_struct user_fpregs_struct_t;
+
+#ifdef CONFIG_X86_64
+# define TASK_SIZE ((1UL << 47) - PAGE_SIZE)
+#else
+/*
+ * Task size may be limited to 3G but we need a
+ * higher limit, because it's backward compatible.
+ */
+# define TASK_SIZE (0xffffe000)
+#endif
+
+static inline unsigned long task_size(void) { return TASK_SIZE; }
+
+typedef uint64_t auxv_t;
+
+/*
+ * Linux preserves three TLS segments in GDT.
+ * Offsets in GDT differ between 32-bit and 64-bit machines.
+ * For 64-bit x86 those GDT offsets are the same
+ * for native and compat tasks.
+ */
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
+#define GDT_ENTRY_TLS_NUM 3
+typedef struct {
+ user_desc_t desc[GDT_ENTRY_TLS_NUM];
+} tls_t;
+
+#define REG_RES(regs) get_user_reg(&regs, ax)
+#define REG_IP(regs) get_user_reg(&regs, ip)
+#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax)
+
+#define AT_VECTOR_SIZE 44
+
+#endif /* UAPI_COMPEL_ASM_TYPES_H__ */
diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c
new file mode 100644
index 000000000..53cae1dc5
--- /dev/null
+++ b/compel/arch/x86/src/lib/infect.c
@@ -0,0 +1,351 @@
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+
+#include <compel/asm/fpu.h>
+
+#include "asm/cpu.h"
+
+#include <compel/asm/processor-flags.h>
+#include <compel/cpu.h>
+#include "errno.h"
+#include <compel/plugins/std/syscall-codes.h>
+#include <compel/plugins/std/syscall.h>
+#include "asm/ptrace.h"
+#include "common/err.h"
+#include "asm/infect-types.h"
+#include "uapi/compel/ptrace.h"
+#include "infect.h"
+#include "infect-priv.h"
+#include "log.h"
+
+/*
+ * Injected syscall instruction
+ */
+const char code_syscall[] = {
+ 0x0f, 0x05, /* syscall */
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */
+};
+
+const char code_int_80[] = {
+ 0xcd, 0x80, /* int $0x80 */
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */
+};
+
+/* Sizes of the injected code blobs, rounded up to a whole number of longs. */
+static const int
+code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long));
+static const int
+code_int_80_aligned = round_up(sizeof(code_int_80), sizeof(long));
+
+/*
+ * Holder for compile-time checks only (hence __always_unused): both
+ * blobs must occupy exactly BUILTIN_SYSCALL_SIZE bytes once aligned,
+ * and their raw sizes must be powers of two.
+ */
+static inline __always_unused void __check_code_syscall(void)
+{
+	BUILD_BUG_ON(code_int_80_aligned != BUILTIN_SYSCALL_SIZE);
+	BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE);
+	BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+	BUILD_BUG_ON(!is_log2(sizeof(code_int_80)));
+}
+
+/*
+ * Read register @name as a signed value: cast to int64_t for native
+ * register sets and to int32_t for compat ones, so that negative
+ * values (e.g. -ERESTARTSYS) compare correctly in both modes.
+ */
+#define get_signed_user_reg(pregs, name) \
+ ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : \
+ (int32_t)((pregs)->compat.name))
+
+/*
+ * Adjust the general purpose registers of @pid for an interrupted system
+ * call (if any), fetch its FPU state and pass both to @save.
+ *
+ * @regs is received by value, so the adjustments made here are only
+ * visible to @save. When the CPU has no FPU, @save gets a NULL FPU
+ * pointer.
+ *
+ * Returns whatever @save(@arg, ...) returns, or -1 if the FPU state
+ * can't be read.
+ */
+int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t save, void *arg)
+{
+ user_fpregs_struct_t xsave = { }, *xs = NULL;
+
+ struct iovec iov;
+ int ret = -1;
+
+ pr_info("Dumping general registers for %d in %s mode\n", pid,
+ user_regs_native(&regs) ? "native" : "compat");
+
+ /* Did we come from a system call? */
+ if (get_signed_user_reg(&regs, orig_ax) >= 0) {
+ /* Restart the system call */
+ switch (get_signed_user_reg(&regs, ax)) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ /*
+ * Reload the syscall number and step ip back by 2 bytes --
+ * presumably the length of the syscall instruction (both
+ * code_syscall and code_int_80 above start with a 2-byte one).
+ */
+ set_user_reg(&regs, ax, get_user_reg(&regs, orig_ax));
+ set_user_reg(&regs, ip, get_user_reg(&regs, ip) - 2);
+ break;
+ case -ERESTART_RESTARTBLOCK:
+ pr_warn("Will restore %d with interrupted system call\n", pid);
+ set_user_reg(&regs, ax, -EINTR);
+ break;
+ }
+ }
+
+#ifndef PTRACE_GETREGSET
+# define PTRACE_GETREGSET 0x4204
+#endif
+
+ /* No FPU: hand over the general registers only (xs stays NULL) */
+ if (!cpu_has_feature(X86_FEATURE_FPU))
+ goto out;
+
+ /*
+ * FPU fetched either via fxsave or via xsave,
+ * thus decode it accordingly.
+ */
+
+ pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+ if (cpu_has_feature(X86_FEATURE_OSXSAVE)) {
+ iov.iov_base = &xsave;
+ iov.iov_len = sizeof(xsave);
+
+ if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) {
+ pr_perror("Can't obtain FPU registers for %d", pid);
+ goto err;
+ }
+ } else {
+ if (ptrace(PTRACE_GETFPREGS, pid, NULL, &xsave)) {
+ pr_perror("Can't obtain FPU registers for %d", pid);
+ goto err;
+ }
+ }
+
+ xs = &xsave;
+out:
+ ret = save(arg, &regs, xs);
+err:
+ return ret;
+}
+
+/*
+ * Run system call @nr with up to six arguments through
+ * compel_execute_syscall(), starting from a copy of the registers saved
+ * in ctl->orig.
+ *
+ * Native register sets get the arguments in di/si/dx/r10/r8/r9 and use
+ * the code_syscall blob; compat ones get them in bx/cx/dx/si/di/bp and
+ * use code_int_80.
+ *
+ * @ret receives the value of the ax register afterwards (it is written
+ * even when execution failed). Returns what compel_execute_syscall()
+ * returned.
+ */
+int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6)
+{
+ user_regs_struct_t regs = ctl->orig.regs;
+ int err;
+
+ if (user_regs_native(&regs)) {
+ user_regs_struct64 *r = &regs.native;
+
+ r->ax = (uint64_t)nr;
+ r->di = arg1;
+ r->si = arg2;
+ r->dx = arg3;
+ r->r10 = arg4;
+ r->r8 = arg5;
+ r->r9 = arg6;
+
+ err = compel_execute_syscall(ctl, &regs, code_syscall);
+ } else {
+ user_regs_struct32 *r = &regs.compat;
+
+ r->ax = (uint32_t)nr;
+ r->bx = arg1;
+ r->cx = arg2;
+ r->dx = arg3;
+ r->si = arg4;
+ r->di = arg5;
+ r->bp = arg6;
+
+ err = compel_execute_syscall(ctl, &regs, code_int_80);
+ }
+
+ *ret = get_user_reg(&regs, ax);
+ return err;
+}
+
+/*
+ * Issue mmap() through compel_syscall(), picking the native or compat
+ * syscall number according to the saved registers of the task.
+ *
+ * Returns the resulting address, or NULL on any failure -- both when the
+ * syscall could not be executed and when mmap itself returned an error
+ * value. An -EACCES for a writable+executable mapping gets an extra hint
+ * about the selinux execmem policy.
+ */
+void *remote_mmap(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map;
+ int err;
+ bool compat_task = !user_regs_native(&ctl->orig.regs);
+
+ err = compel_syscall(ctl, __NR(mmap, compat_task), &map,
+ (unsigned long)addr, length, prot, flags, fd, offset);
+ if (err < 0)
+ return NULL;
+
+ if (IS_ERR_VALUE(map)) {
+ if (map == -EACCES && (prot & PROT_WRITE) && (prot & PROT_EXEC))
+ pr_warn("mmap(PROT_WRITE | PROT_EXEC) failed for %d, "
+ "check selinux execmem policy\n", ctl->rpid);
+ return NULL;
+ }
+
+ return (void *)map;
+}
+
+/*
+ * Prepare @regs for running code at @new_ip: set the instruction
+ * pointer, optionally the stack pointer (when @stack is not NULL), set
+ * orig_ax to -1 and clear the TF, DF and IF bits in flags.
+ *
+ * regs must be inited when calling this function from original context
+ */
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ set_user_reg(regs, ip, new_ip);
+ if (stack)
+ set_user_reg(regs, sp, (unsigned long) stack);
+
+ /* Avoid end of syscall processing */
+ set_user_reg(regs, orig_ax, -1);
+
+ /* Make sure flags are in known state */
+ set_user_reg(regs, flags, get_user_reg(regs, flags) &
+ ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF));
+}
+
+#define USER32_CS 0x23
+#define USER_CS 0x33
+
+/*
+ * Tell whether the code segment selector of @pid is something other
+ * than the two known user selectors (USER_CS and USER32_CS).
+ *
+ * Returns true for an unexpected selector and also when the CS register
+ * can't be read at all, so the caller refuses the task in both cases.
+ */
+static bool ldt_task_selectors(pid_t pid)
+{
+	unsigned long cs;
+
+	/*
+	 * PTRACE_PEEKUSER hands the data back as the return value, so a
+	 * failure can only be told apart through errno.
+	 */
+	errno = 0;
+	/*
+	 * Offset of register must be from 64-bit set even for
+	 * compatible tasks. Fix this to support native i386 tasks
+	 */
+	cs = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct64, cs), 0);
+	if (errno != 0) {
+		pr_perror("Can't get CS register for %d", pid);
+		/* used to be "return -1", which a bool silently turns into true */
+		return true;
+	}
+
+	return cs != USER_CS && cs != USER32_CS;
+}
+
+/*
+ * Returns 1 when @pid runs with a compat register set, 0 when it is
+ * native and -1 when its registers can't be fetched.
+ */
+static int arch_task_compatible(pid_t pid)
+{
+	user_regs_struct_t regs;
+
+	if (ptrace_get_regs(pid, &regs) != 0)
+		return -1;
+
+	return user_regs_native(&regs) ? 0 : 1;
+}
+
+/*
+ * Decide whether the task behind @ctl can be handled.
+ *
+ * Returns false when its registers can't be read, when it is a compat
+ * task while INFECT_HAS_COMPAT_SIGRETURN is not set in ctl->ictx.flags,
+ * or when ldt_task_selectors() reports true for it. Returns true
+ * otherwise.
+ */
+bool arch_can_dump_task(struct parasite_ctl *ctl)
+{
+ pid_t pid = ctl->rpid;
+ int ret;
+
+ ret = arch_task_compatible(pid);
+ if (ret < 0)
+ return false;
+
+ /* ret != 0 here means a compat task */
+ if (ret && !(ctl->ictx.flags & INFECT_HAS_COMPAT_SIGRETURN)) {
+ pr_err("Can't dump task %d running in 32-bit mode\n", pid);
+ return false;
+ }
+
+ if (ldt_task_selectors(pid)) {
+ pr_err("Can't dump task %d with LDT descriptors\n", pid);
+ return false;
+ }
+
+ return true;
+}
+
+/* Copied from the gdb header gdb/nat/x86-dregs.h */
+
+/* Debug registers' indices. */
+#define DR_FIRSTADDR 0
+#define DR_LASTADDR 3
+#define DR_NADDR 4 /* The number of debug address registers. */
+#define DR_STATUS 6 /* Index of debug status register (DR6). */
+#define DR_CONTROL 7 /* Index of debug control register (DR7). */
+
+#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */
+#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */
+#define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */
+
+/* Locally enable the break/watchpoint in the I'th debug register. */
+#define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i)))
+
+/*
+ * Put @addr into the first debug address register of @pid, locally
+ * enable it in the debug control register and resume the task with
+ * PTRACE_CONT.
+ *
+ * Returns 1 on success and -1 if any of the ptrace requests failed.
+ */
+int ptrace_set_breakpoint(pid_t pid, void *addr)
+{
+ int ret;
+
+ /* Set a breakpoint */
+ if (ptrace(PTRACE_POKEUSER, pid,
+ offsetof(struct user, u_debugreg[DR_FIRSTADDR]),
+ addr)) {
+ pr_perror("Unable to setup a breakpoint into %d", pid);
+ return -1;
+ }
+
+ /* Enable the breakpoint */
+ if (ptrace(PTRACE_POKEUSER, pid,
+ offsetof(struct user, u_debugreg[DR_CONTROL]),
+ X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) {
+ pr_perror("Unable to enable the breakpoint for %d", pid);
+ return -1;
+ }
+
+ ret = ptrace(PTRACE_CONT, pid, NULL, NULL);
+ if (ret) {
+ pr_perror("Unable to restart the stopped tracee process %d", pid);
+ return -1;
+ }
+
+ return 1;
+}
+
+/*
+ * Write 0 into the debug control register of @pid, which clears every
+ * enable bit there. Returns 0 on success, -1 on failure.
+ */
+int ptrace_flush_breakpoints(pid_t pid)
+{
+ /* Disable the breakpoint */
+ if (ptrace(PTRACE_POKEUSER, pid,
+ offsetof(struct user, u_debugreg[DR_CONTROL]),
+ 0)) {
+ pr_perror("Unable to disable the breakpoint for %d", pid);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Read the general purpose registers of @pid with PTRACE_GETREGSET and
+ * mark @regs as native or compat, judging by the number of bytes
+ * reported back in iov_len.
+ *
+ * Returns the ptrace() result on success and -1 on failure, including
+ * the case when the reported size matches neither layout.
+ */
+int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs)
+{
+	struct iovec iov;
+	int ret;
+
+	/* Offer room for the 64-bit layout; iov_len is re-read below */
+	iov.iov_base = &regs->native;
+	iov.iov_len = sizeof(user_regs_struct64);
+
+	ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov);
+	if (ret == -1) {
+		pr_perror("PTRACE_GETREGSET failed");
+		return -1;
+	}
+
+	if (iov.iov_len == sizeof(regs->native)) {
+		regs->__is_native = NATIVE_MAGIC;
+		return ret;
+	}
+	if (iov.iov_len == sizeof(regs->compat)) {
+		regs->__is_native = COMPAT_MAGIC;
+		return ret;
+	}
+
+	/* the trailing newline was missing here */
+	pr_err("PTRACE_GETREGSET read %zu bytes for pid %d, but native/compat regs sizes are %zu/%zu bytes\n",
+	       iov.iov_len, pid,
+	       sizeof(regs->native), sizeof(regs->compat));
+	return -1;
+}
+
+/*
+ * Write @regs back into @pid with PTRACE_SETREGSET, passing either the
+ * native or the compat layout depending on how @regs is marked.
+ * Returns the ptrace() result.
+ */
+int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs)
+{
+	struct iovec iov;
+
+	if (!user_regs_native(regs)) {
+		iov.iov_len = sizeof(user_regs_struct32);
+		iov.iov_base = &regs->compat;
+	} else {
+		iov.iov_len = sizeof(user_regs_struct64);
+		iov.iov_base = &regs->native;
+	}
+
+	return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov);
+}
diff --git a/compel/include/infect-priv.h b/compel/include/infect-priv.h
new file mode 100644
index 000000000..db5259c2c
--- /dev/null
+++ b/compel/include/infect-priv.h
@@ -0,0 +1,59 @@
+#ifndef __COMPEL_INFECT_PRIV_H__
+#define __COMPEL_INFECT_PRIV_H__
+
+#include <stdbool.h>
+
+#define BUILTIN_SYSCALL_SIZE 8
+
+/* parasite control block */
+struct parasite_ctl {
+ int rpid; /* Real pid of the victim */
+ void *remote_map;
+ void *local_map;
+ void *sigreturn_addr; /* A place for the breakpoint */
+ unsigned long map_length;
+
+ struct infect_ctx ictx;
+
+ /* thread leader data */
+ bool daemonized;
+
+ struct thread_ctx orig;
+
+ void *rstack; /* thread leader stack*/
+ struct rt_sigframe *sigframe;
+ struct rt_sigframe *rsigframe; /* address in a parasite */
+
+ void *r_thread_stack; /* stack for non-leader threads */
+
+ unsigned long parasite_ip; /* service routine start ip */
+
+ unsigned int *addr_cmd; /* addr for command */
+ void *addr_args; /* address for arguments */
+ unsigned long args_size;
+ int tsock; /* transport socket for transferring fds */
+
+ struct parasite_blob_desc pblob;
+};
+
+#define MEMFD_FNAME "CRIUMFD"
+#define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME)
+
+struct ctl_msg;
+int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m);
+
+/* XXX -- remove with cr-exec.c */
+extern int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size);
+extern int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1, unsigned long arg2,
+ unsigned long arg3, unsigned long arg4,
+ unsigned long arg5, unsigned long arg6);
+
+
+extern void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
+extern void *remote_mmap(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset);
+extern bool arch_can_dump_task(struct parasite_ctl *ctl);
+
+#endif
diff --git a/compel/include/rpc-pie-priv.h b/compel/include/rpc-pie-priv.h
new file mode 100644
index 000000000..3d9091159
--- /dev/null
+++ b/compel/include/rpc-pie-priv.h
@@ -0,0 +1,48 @@
+#ifndef __COMPEL_RPC_H__
+#define __COMPEL_RPC_H__
+struct ctl_msg {
+ uint32_t cmd; /* command itself */
+ uint32_t ack; /* ack on command */
+ int32_t err; /* error code on reply */
+};
+
+#define ctl_msg_cmd(_cmd) \
+ (struct ctl_msg){.cmd = _cmd, }
+
+#define ctl_msg_ack(_cmd, _err) \
+ (struct ctl_msg){.cmd = _cmd, .ack = _cmd, .err = _err, }
+
+/*
+ * NOTE: each command's args should have an arch-independent size.
+ * If you want to use one of the standard types, declare
+ * an alternative type for compatible tasks in parasite-compat.h
+ */
+enum {
+ PARASITE_CMD_IDLE = 0,
+ PARASITE_CMD_ACK,
+
+ PARASITE_CMD_INIT_DAEMON,
+ PARASITE_CMD_UNMAP,
+
+ /*
+ * This must be greater than INITs.
+ */
+ PARASITE_CMD_FINI,
+
+ __PARASITE_END_CMDS,
+};
+
+struct parasite_init_args {
+ int32_t h_addr_len;
+ struct sockaddr_un h_addr;
+ int32_t log_level;
+ uint64_t sigreturn_addr;
+ uint64_t sigframe; /* pointer to sigframe */
+ futex_t daemon_connected;
+};
+
+struct parasite_unmap_args {
+ uint64_t parasite_start;
+ uint64_t parasite_len;
+};
+#endif
diff --git a/compel/include/uapi/compel.h b/compel/include/uapi/compel.h
index 278a85455..3554c1599 100644
--- a/compel/include/uapi/compel.h
+++ b/compel/include/uapi/compel.h
@@ -4,6 +4,8 @@
#include <errno.h>
#include <stdarg.h>
+#include <compel/asm/infect-types.h>
+
#define COMPEL_TYPE_INT (1u << 0)
#define COMPEL_TYPE_LONG (1u << 1)
#define COMPEL_TYPE_GOTPCREL (1u << 2)
@@ -22,4 +24,8 @@ typedef void (*compel_log_fn)(unsigned int lvl, const char *fmt, va_list parms);
extern void compel_log_init(compel_log_fn log_fn, unsigned int level);
extern unsigned int compel_log_get_loglevel(void);
+#include <compel/infect-util.h>
+#include <compel/infect-rpc.h>
+#include <compel/infect.h>
+
#endif /* UAPI_COMPEL_H__ */
diff --git a/compel/include/uapi/infect-rpc.h b/compel/include/uapi/infect-rpc.h
new file mode 100644
index 000000000..0176c1142
--- /dev/null
+++ b/compel/include/uapi/infect-rpc.h
@@ -0,0 +1,17 @@
+#ifndef __COMPEL_INFECT_RPC_H__
+#define __COMPEL_INFECT_RPC_H__
+
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdint.h>
+
+struct parasite_ctl;
+extern int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl);
+extern int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl);
+extern int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl);
+extern int compel_rpc_sock(struct parasite_ctl *ctl);
+
+#define PARASITE_USER_CMDS 64
+
+
+#endif
diff --git a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h
new file mode 100644
index 000000000..bd2010c3b
--- /dev/null
+++ b/compel/include/uapi/infect-util.h
@@ -0,0 +1,5 @@
+#ifndef __COMPEL_INFECT_UTIL_H__
+#define __COMPEL_INFECT_UTIL_H__
+struct parasite_ctl;
+extern int compel_util_send_fd(struct parasite_ctl *ctl, int fd);
+#endif
diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h
new file mode 100644
index 000000000..38051f437
--- /dev/null
+++ b/compel/include/uapi/infect.h
@@ -0,0 +1,147 @@
+#ifndef __COMPEL_INFECT_H__
+#define __COMPEL_INFECT_H__
+
+#include <stdbool.h>
+
+#include <compel/asm/sigframe.h>
+#include <compel/asm/infect-types.h>
+#include <compel/ksigset.h>
+#include <compel/compel.h>
+
+#include "common/compiler.h"
+
+#define PARASITE_START_AREA_MIN (4096)
+
+extern int compel_stop_task(int pid);
+
+struct seize_task_status {
+ char state;
+ int ppid;
+ unsigned long long sigpnd;
+ unsigned long long shdpnd;
+ int seccomp_mode;
+};
+
+extern int compel_wait_task(int pid, int ppid,
+ int (*get_status)(int pid, struct seize_task_status *),
+ struct seize_task_status *st);
+extern int compel_unseize_task(pid_t pid, int orig_state, int state);
+
+/*
+ * FIXME -- these should be mapped to pid.h's
+ */
+
+#define TASK_ALIVE 0x1
+#define TASK_DEAD 0x2
+#define TASK_STOPPED 0x3
+#define TASK_ZOMBIE 0x6
+
+struct parasite_ctl;
+struct thread_ctx {
+ k_rtsigset_t sigmask;
+ user_regs_struct_t regs;
+};
+
+extern struct parasite_ctl *compel_prepare(int pid);
+extern int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size);
+extern int compel_prepare_thread(int pid, struct thread_ctx *ctx);
+
+extern int compel_stop_daemon(struct parasite_ctl *ctl);
+extern int compel_cure_remote(struct parasite_ctl *ctl);
+extern int compel_cure_local(struct parasite_ctl *ctl);
+extern int compel_cure(struct parasite_ctl *ctl);
+
+#define PARASITE_ARG_SIZE_MIN ( 1 << 12)
+
+#define compel_parasite_args(ctl, type) \
+ ({ \
+ void *___ret; \
+ BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \
+ ___ret = compel_parasite_args_p(ctl); \
+ ___ret; \
+ })
+
+extern void *compel_parasite_args_p(struct parasite_ctl *ctl);
+extern void *compel_parasite_args_s(struct parasite_ctl *ctl, int args_size);
+
+extern int compel_execute_syscall(struct parasite_ctl *ctl,
+ user_regs_struct_t *regs, const char *code_syscall);
+extern int compel_run_in_thread(pid_t pid, unsigned int cmd,
+ struct parasite_ctl *ctl,
+ struct thread_ctx *octx);
+
+/*
+ * The PTRACE_SYSCALL will trap task twice -- on
+ * enter into and on exit from syscall. If we trace
+ * a single task, we may skip half of all getregs
+ * calls -- on exit we don't need them.
+ */
+enum trace_flags {
+ TRACE_ALL,
+ TRACE_ENTER,
+ TRACE_EXIT,
+};
+
+extern int compel_stop_on_syscall(int tasks, int sys_nr,
+ int sys_nr_compat, enum trace_flags trace);
+
+extern int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp);
+
+extern int compel_unmap(struct parasite_ctl *ctl, unsigned long addr);
+
+extern int compel_mode_native(struct parasite_ctl *ctl);
+
+extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl);
+
+struct rt_sigframe;
+
+typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...)
+ __attribute__ ((__format__ (__printf__, 3, 4)));
+
+/*
+ * Caller-supplied settings and callbacks of an infection; embedded into
+ * struct parasite_ctl as ->ictx.
+ */
+struct infect_ctx {
+ int *p_sock;
+
+ /*
+ * Regs manipulation context.
+ */
+ int (*save_regs)(void *, user_regs_struct_t *, user_fpregs_struct_t *);
+ int (*make_sigframe)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *);
+ void *regs_arg;
+
+ unsigned long task_size;
+ unsigned long syscall_ip; /* entry point of infection */
+ unsigned long flags; /* fine-tune (e.g. faults) */
+
+ void (*child_handler)(int, siginfo_t *, void *); /* handler for SIGCHLD deaths */
+
+ open_proc_fn open_proc;
+
+ int log_fd; /* fd for parasite code to send messages to */
+};
+
+extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *);
+
+#define INFECT_NO_MEMFD 0x1 /* don't use memfd() */
+#define INFECT_FAIL_CONNECT 0x2 /* make parasite connect() fail */
+#define INFECT_NO_BREAKPOINTS 0x4 /* no breakpoints in pie tracking */
+#define INFECT_HAS_COMPAT_SIGRETURN 0x8
+
+struct parasite_blob_desc {
+ const void *mem;
+ size_t bsize; /* size of the blob */
+ size_t size; /* size of the blob with relocs */
+ unsigned long parasite_ip_off;
+ unsigned long addr_cmd_off;
+ unsigned long addr_arg_off;
+ compel_reloc_t *relocs;
+ unsigned int nr_relocs;
+};
+
+extern struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *);
+
+typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *);
+extern int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t, void *);
+
+extern void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs);
+
+#endif
diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h
new file mode 100644
index 000000000..2ab9e1c76
--- /dev/null
+++ b/compel/include/uapi/ptrace.h
@@ -0,0 +1,76 @@
+#ifndef UAPI_COMPEL_PTRACE_H__
+#define UAPI_COMPEL_PTRACE_H__
+
+#include <linux/types.h>
+#include <sys/ptrace.h>
+
+#include <compel/asm/infect-types.h>
+#include <compel/asm/breakpoints.h>
+
+/* some constants for ptrace */
+#ifndef PTRACE_SEIZE
+# define PTRACE_SEIZE 0x4206
+#endif
+
+#ifndef PTRACE_O_SUSPEND_SECCOMP
+# define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
+#endif
+
+#ifndef PTRACE_INTERRUPT
+# define PTRACE_INTERRUPT 0x4207
+#endif
+
+#ifndef PTRACE_LISTEN
+#define PTRACE_LISTEN 0x4208
+#endif
+
+#ifndef PTRACE_PEEKSIGINFO
+#define PTRACE_PEEKSIGINFO 0x4209
+
+/* Read signals from a shared (process wide) queue */
+#define PTRACE_PEEKSIGINFO_SHARED (1 << 0)
+#endif
+
+#ifndef PTRACE_GETREGSET
+# define PTRACE_GETREGSET 0x4204
+# define PTRACE_SETREGSET 0x4205
+#endif
+
+#ifndef PTRACE_GETSIGMASK
+# define PTRACE_GETSIGMASK 0x420a
+# define PTRACE_SETSIGMASK 0x420b
+#endif
+
+#ifndef PTRACE_SECCOMP_GET_FILTER
+#define PTRACE_SECCOMP_GET_FILTER 0x420c
+#endif
+
+#define PTRACE_SEIZE_DEVEL 0x80000000
+
+#define PTRACE_EVENT_FORK 1
+#define PTRACE_EVENT_VFORK 2
+#define PTRACE_EVENT_CLONE 3
+#define PTRACE_EVENT_EXEC 4
+#define PTRACE_EVENT_VFORK_DONE 5
+#define PTRACE_EVENT_EXIT 6
+#define PTRACE_EVENT_STOP 128
+
+#define PTRACE_O_TRACESYSGOOD 0x00000001
+#define PTRACE_O_TRACEFORK 0x00000002
+#define PTRACE_O_TRACEVFORK 0x00000004
+#define PTRACE_O_TRACECLONE 0x00000008
+#define PTRACE_O_TRACEEXEC 0x00000010
+#define PTRACE_O_TRACEVFORKDONE 0x00000020
+#define PTRACE_O_TRACEEXIT 0x00000040
+
+#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
+
+extern int suspend_seccomp(pid_t pid);
+extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
+extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes);
+extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes);
+
+extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs);
+extern int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs);
+
+#endif /* UAPI_COMPEL_PTRACE_H__ */
diff --git a/compel/src/lib/infect-rpc.c b/compel/src/lib/infect-rpc.c
new file mode 100644
index 000000000..265a4ad2f
--- /dev/null
+++ b/compel/src/lib/infect-rpc.c
@@ -0,0 +1,101 @@
+#include "log.h"
+#include "common/bug.h"
+#include "common/xmalloc.h"
+#include "common/lock.h"
+
+#include "infect.h"
+#include "infect-priv.h"
+#include "infect-rpc.h"
+#include "rpc-pie-priv.h"
+
+/*
+ * Send control message @m over @sockfd in one send() call.
+ * Returns 0 when the whole message went out, -1 on a send error or a
+ * short write.
+ */
+static int __parasite_send_cmd(int sockfd, struct ctl_msg *m)
+{
+ int ret;
+
+ /* Compile-time check: built-in commands must fit below PARASITE_USER_CMDS */
+ BUILD_BUG_ON(PARASITE_USER_CMDS < __PARASITE_END_CMDS);
+
+ ret = send(sockfd, m, sizeof(*m), 0);
+ if (ret == -1) {
+ pr_perror("Failed to send command %d to daemon", m->cmd);
+ return -1;
+ } else if (ret != sizeof(*m)) {
+ pr_err("Message to daemon is trimmed (%d/%d)\n",
+ (int)sizeof(*m), ret);
+ return -1;
+ }
+
+ pr_debug("Sent msg to daemon %d %d %d\n", m->cmd, m->ack, m->err);
+ return 0;
+}
+
+/*
+ * Receive one control message from @sockfd into @m and make sure it is
+ * the acknowledgement of @cmd (both ->cmd and ->ack must equal @cmd).
+ *
+ * Returns 0 on a matching ack, -1 on a receive error, a short read or
+ * an unexpected message.
+ */
+int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m)
+{
+	int nread;
+
+	pr_debug("Wait for ack %d on daemon socket\n", cmd);
+
+	memzero(m, sizeof(*m));
+
+	nread = recv(sockfd, m, sizeof(*m), MSG_WAITALL);
+	if (nread == -1) {
+		pr_perror("Failed to read ack");
+		return -1;
+	}
+	if (nread != sizeof(*m)) {
+		pr_err("Message reply from daemon is trimmed (%d/%d)\n",
+		       (int)sizeof(*m), nread);
+		return -1;
+	}
+
+	pr_debug("Fetched ack: %d %d %d\n",
+		 m->cmd, m->ack, m->err);
+
+	if (m->cmd == cmd && m->ack == cmd)
+		return 0;
+
+	pr_err("Communication error, this is not "
+	       "the ack we expected\n");
+	return -1;
+}
+
+/*
+ * Wait for the acknowledgement of @cmd on the socket of @ctl.
+ * Returns 0 when the ack arrived with a zero error code, -1 otherwise.
+ */
+int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl)
+{
+	struct ctl_msg reply;
+
+	if (parasite_wait_ack(ctl->tsock, cmd, &reply) != 0)
+		return -1;
+
+	if (reply.err == 0)
+		return 0;
+
+	pr_err("Command %d for daemon failed with %d\n",
+	       cmd, reply.err);
+	return -1;
+}
+
+/*
+ * Send command @cmd over the socket of @ctl without waiting for a reply.
+ * Returns what __parasite_send_cmd() returns.
+ */
+int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl)
+{
+	struct ctl_msg request = ctl_msg_cmd(cmd);
+
+	return __parasite_send_cmd(ctl->tsock, &request);
+}
+
+/*
+ * Send command @cmd and, if sending worked, wait for its
+ * acknowledgement. Returns the first non-zero result, or 0.
+ */
+int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl)
+{
+	int err = compel_rpc_call(cmd, ctl);
+
+	if (err)
+		return err;
+
+	return compel_rpc_sync(cmd, ctl);
+}
+
+/* Accessor for the tsock field of @ctl. */
+int compel_rpc_sock(struct parasite_ctl *ctl)
+{
+ return ctl->tsock;
+}
diff --git a/compel/src/lib/infect-util.c b/compel/src/lib/infect-util.c
new file mode 100644
index 000000000..99dbee062
--- /dev/null
+++ b/compel/src/lib/infect-util.c
@@ -0,0 +1,21 @@
+#include "log.h"
+#include "common/bug.h"
+#include "common/lock.h"
+
+#include "uapi/compel/plugins/plugin-fds.h"
+
+#include "infect-rpc.h"
+#include "infect-util.h"
+
+/*
+ * Pass file descriptor @fd over the RPC socket of @ctl using send_fd().
+ * Returns 0 on success, -1 on failure.
+ */
+int compel_util_send_fd(struct parasite_ctl *ctl, int fd)
+{
+	int rpc_sk = compel_rpc_sock(ctl);
+
+	if (send_fd(rpc_sk, NULL, 0, fd) >= 0)
+		return 0;
+
+	pr_perror("Can't send file descriptor");
+	return -1;
+}
+
diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c
new file mode 100644
index 000000000..6a3724636
--- /dev/null
+++ b/compel/src/lib/infect.c
@@ -0,0 +1,1262 @@
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <linux/seccomp.h>
+
+#include "log.h"
+#include "common/bug.h"
+#include "common/xmalloc.h"
+#include "common/lock.h"
+#include "common/page.h"
+
+#include <compel/plugins/std/syscall-codes.h>
+#include <compel/plugins/std/asm/syscall-types.h>
+#include "asm/ptrace.h"
+#include "uapi/compel/plugins/std/syscall.h"
+#include "asm/infect-types.h"
+#include "asm/sigframe.h"
+#include "infect.h"
+#include "uapi/compel/ptrace.h"
+#include "infect-rpc.h"
+#include "infect-priv.h"
+#include "infect-util.h"
+#include "rpc-pie-priv.h"
+
+#define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \
+ (size_t)((struct sockaddr_un *) 0)->sun_path)
+
+#define PARASITE_STACK_SIZE (16 << 10)
+
+#define PTRACE_EVENT_STOP 128
+
+#ifndef SECCOMP_MODE_DISABLED
+#define SECCOMP_MODE_DISABLED 0
+#endif
+
+#ifndef PTRACE_O_SUSPEND_SECCOMP
+# define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
+#endif
+
+#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
+
+/* Close *pfd if it holds a non-negative descriptor and reset it to -1. */
+static inline void close_safe(int *pfd)
+{
+	if (*pfd < 0)
+		return;
+
+	close(*pfd);
+	*pfd = -1;
+}
+
+/*
+ * Seize @pid with PTRACE_SEIZE and right away ask it to stop with
+ * PTRACE_INTERRUPT.
+ *
+ * Returns 0 on success, otherwise the result of the failing ptrace()
+ * call. If only the interrupt fails, the task is detached again before
+ * returning.
+ */
+int compel_stop_task(int pid)
+{
+	int ret;
+
+	ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
+	if (ret) {
+		/*
+		 * ptrace API doesn't allow to distinguish
+		 * attaching to zombie from other errors.
+		 * All errors will be handled in compel_wait_task().
+		 */
+		pr_warn("Unable to interrupt task: %d (%s)\n", pid, strerror(errno));
+		return ret;
+	}
+
+	/*
+	 * If we SEIZE-d the task stop it before going
+	 * and reading its stat from proc. Otherwise task
+	 * may die _while_ we're doing it and we'll have
+	 * inconsistent seize/state pair.
+	 *
+	 * If task dies after we seize it but before we
+	 * do this interrupt, we'll notice it via proc.
+	 */
+	ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
+	if (ret < 0) {
+		/* the trailing newline was missing here */
+		pr_warn("SEIZE %d: can't interrupt task: %s\n", pid, strerror(errno));
+		if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
+			pr_perror("Unable to detach from %d", pid);
+	}
+
+	return ret;
+}
+
+/*
+ * Resume @pid with PTRACE_CONT and wait for it to stop again,
+ * @nr_signals times in a row.
+ *
+ * Returns 0 when every round ended in a stop, -1 if resuming or waiting
+ * failed or the task was not reported as stopped.
+ */
+static int skip_sigstop(int pid, int nr_signals)
+{
+ int i, status, ret;
+
+ /*
+ * 1) SIGSTOP is queued, but isn't handled yet:
+ * SIGSTOP can't be blocked, so we need to wait when the kernel
+ * handles this signal.
+ *
+ * Otherwise the process will be stopped immediately after
+ * starting it.
+ *
+ * 2) A seized task was stopped:
+ * PTRACE_SEIZE doesn't affect signal or group stop state.
+ * Currently ptrace reported that task is in stopped state.
+ * We need to start task again, and it will be trapped
+ * immediately, because we sent PTRACE_INTERRUPT to it.
+ */
+ for (i = 0; i < nr_signals; i++) {
+ ret = ptrace(PTRACE_CONT, pid, 0, 0);
+ if (ret) {
+ pr_perror("Unable to start process");
+ return -1;
+ }
+
+ ret = wait4(pid, &status, __WALL, NULL);
+ if (ret < 0) {
+ pr_perror("SEIZE %d: can't wait task", pid);
+ return -1;
+ }
+
+ if (!WIFSTOPPED(status)) {
+ pr_err("SEIZE %d: task not stopped after seize\n", pid);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Set the PTRACE_O_SUSPEND_SECCOMP option on @pid.
+ * Returns 0 on success, -1 on failure.
+ */
+static int do_suspend_seccomp(pid_t pid)
+{
+	if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) >= 0)
+		return 0;
+
+	pr_perror("suspending seccomp failed");
+	return -1;
+}
+
+/*
+ * This routine seizes task putting it into a special
+ * state where we can manipulate the task via ptrace
+ * interface, and finally we can detach ptrace out
+ * of it so the task would not know if it was saddled
+ * up with someone else.
+ *
+ * @get_status fills @ss for @pid; a @ppid of -1 disables the parent
+ * check.
+ *
+ * Returns TASK_ALIVE, TASK_STOPPED, TASK_ZOMBIE or TASK_DEAD on
+ * success and -1 on error. The error paths that go through the err
+ * label also detach from the task.
+ */
+int compel_wait_task(int pid, int ppid,
+ int (*get_status)(int pid, struct seize_task_status *),
+ struct seize_task_status *ss)
+{
+ siginfo_t si;
+ int status, nr_sigstop;
+ int ret = 0, ret2, wait_errno = 0;
+
+ /*
+ * It's ugly, but the ptrace API doesn't allow to distinguish
+ * attaching to zombie from other errors. Thus we have to parse
+ * the target's /proc/pid/stat. Sad, but parse whatever else
+ * we might need at that early point.
+ */
+
+try_again:
+
+ ret = wait4(pid, &status, __WALL, NULL);
+ if (ret < 0) {
+ /*
+ * wait4() can expectedly fail only in a first time
+ * if a task is zombie. If we are here from try_again,
+ * this means that we are tracing this task.
+ *
+ * So here we can be only once in this function.
+ */
+ wait_errno = errno;
+ }
+
+ ret2 = get_status(pid, ss);
+ if (ret2)
+ goto err;
+
+ /* The task could not be waited for, or it has already exited */
+ if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) {
+ if (ss->state != 'Z') {
+ if (pid == getpid())
+ pr_err("The criu itself is within dumped tree.\n");
+ else
+ pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n",
+ pid, ss->state, ret, wait_errno);
+ return -1;
+ }
+
+ if (ret < 0)
+ return TASK_ZOMBIE;
+ else
+ return TASK_DEAD;
+ }
+
+ if ((ppid != -1) && (ss->ppid != ppid)) {
+ pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
+ pid, ppid, ss->ppid);
+ goto err;
+ }
+
+ if (!WIFSTOPPED(status)) {
+ pr_err("SEIZE %d: task not stopped after seize\n", pid);
+ goto err;
+ }
+
+ ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si);
+ if (ret < 0) {
+ pr_perror("SEIZE %d: can't read signfo", pid);
+ goto err;
+ }
+
+ if (SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) {
+ /*
+ * Kernel notifies us about the task being seized received some
+ * event other than the STOP, i.e. -- a signal. Let the task
+ * handle one and repeat.
+ */
+
+ if (ptrace(PTRACE_CONT, pid, NULL,
+ (void *)(unsigned long)si.si_signo)) {
+ pr_perror("Can't continue signal handling, aborting");
+ goto err;
+ }
+
+ ret = 0;
+ goto try_again;
+ }
+
+ if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && do_suspend_seccomp(pid) < 0)
+ goto err;
+
+ /*
+ * Count the SIGSTOPs to get past: one per pending mask (sigpnd,
+ * shdpnd) that has it set, plus one if the stop itself was
+ * reported with SIGSTOP.
+ */
+ nr_sigstop = 0;
+ if (ss->sigpnd & (1 << (SIGSTOP - 1)))
+ nr_sigstop++;
+ if (ss->shdpnd & (1 << (SIGSTOP - 1)))
+ nr_sigstop++;
+ if (si.si_signo == SIGSTOP)
+ nr_sigstop++;
+
+ if (nr_sigstop) {
+ if (skip_sigstop(pid, nr_sigstop))
+ goto err_stop;
+
+ return TASK_STOPPED;
+ }
+
+ if (si.si_signo == SIGTRAP)
+ return TASK_ALIVE;
+ else {
+ pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo);
+ goto err;
+ }
+
+err_stop:
+ /* Send SIGSTOP again before detaching from the task */
+ kill(pid, SIGSTOP);
+err:
+ if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
+ pr_perror("Unable to detach from %d", pid);
+ return -1;
+}
+
+/*
+ * Release @pid into the final state @st, given the state @orig_st it
+ * was originally found in.
+ *
+ * TASK_DEAD kills the task and returns without detaching; the other
+ * states optionally send SIGSTOP and then detach. An unknown @st is
+ * logged as an error, but the task is still detached.
+ *
+ * Returns 0 on success, -1 if detaching failed.
+ */
+int compel_unseize_task(pid_t pid, int orig_st, int st)
+{
+ pr_debug("\tUnseizing %d into %d\n", pid, st);
+
+ if (st == TASK_DEAD) {
+ kill(pid, SIGKILL);
+ return 0;
+ } else if (st == TASK_STOPPED) {
+ /*
+ * Task might have had STOP in queue. We detected such
+ * guy as TASK_STOPPED, but cleared signal to run the
+ * parasite code. Thus after detach the task will become
+ * running. That said -- STOP everyone regardless of
+ * the initial state.
+ */
+ kill(pid, SIGSTOP);
+ } else if (st == TASK_ALIVE) {
+ /*
+ * Same as in the comment above -- there might be a
+ * task with STOP in queue that would get lost after
+ * detach, so stop it again.
+ */
+ if (orig_st == TASK_STOPPED)
+ kill(pid, SIGSTOP);
+ } else
+ pr_err("Unknown final state %d\n", st);
+
+ if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) {
+ pr_perror("Unable to detach from %d", pid);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Fill @saddr with an AF_UNIX address whose name is derived from @key
+ * and return the address length as computed by SUN_LEN().
+ */
+static int gen_parasite_saddr(struct sockaddr_un *saddr, int key)
+{
+ int sun_len;
+
+ saddr->sun_family = AF_UNIX;
+ snprintf(saddr->sun_path, UNIX_PATH_MAX,
+ "X/crtools-pr-%d", key);
+
+ /*
+ * The length is taken while the leading 'X' is still in place; only
+ * then is that byte replaced with '\0' -- presumably to put the name
+ * into the abstract socket namespace (TODO confirm).
+ */
+ sun_len = SUN_LEN(saddr);
+ *saddr->sun_path = '\0';
+
+ return sun_len;
+}
+
+/*
+ * Fill the address part of @args and make sure the listening socket
+ * exists.
+ *
+ * The socket is taken from *ctl->ictx.p_sock (which is then reset to
+ * -1), bound and put into listening state only once; it is kept in a
+ * function-local static and reused by later calls.
+ *
+ * Returns 0 on success, -1 on failure (the socket is closed then).
+ */
+static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid,
+ struct parasite_init_args *args)
+{
+ static int ssock = -1;
+
+ pr_info("Putting tsock into pid %d\n", pid);
+ args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid());
+
+ if (ssock == -1) {
+ ssock = *ctl->ictx.p_sock;
+ if (ssock == -1) {
+ pr_err("No socket in ictx\n");
+ goto err;
+ }
+
+ *ctl->ictx.p_sock = -1;
+
+ if (bind(ssock, (struct sockaddr *)&args->h_addr, args->h_addr_len) < 0) {
+ pr_perror("Can't bind socket");
+ goto err;
+ }
+
+ if (listen(ssock, 1)) {
+ pr_perror("Can't listen on transport socket");
+ goto err;
+ }
+ }
+
+ /* Check a case when parasite can't initialize a command socket */
+ if (ctl->ictx.flags & INFECT_FAIL_CONNECT)
+ args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid() + 1);
+
+ /*
+ * Store the negated listening fd (not a literal -1) to prevent
+ * any accidental misuse. The only valid user of it is
+ * accept_tsock(), which presumably negates it back (not
+ * visible here).
+ */
+ ctl->tsock = -ssock;
+ return 0;
+err:
+ close_safe(&ssock);
+ return -1;
+}
+
+/*
+ * Install ctl->ictx.child_handler as the SIGCHLD handler, with
+ * SA_SIGINFO | SA_RESTART and SIGCHLD added to the handler's mask.
+ * Returns 0 on success, -1 on failure.
+ */
+static int setup_child_handler(struct parasite_ctl *ctl)
+{
+ struct sigaction sa = {
+ .sa_sigaction = ctl->ictx.child_handler,
+ .sa_flags = SA_SIGINFO | SA_RESTART,
+ };
+
+ sigemptyset(&sa.sa_mask);
+ sigaddset(&sa.sa_mask, SIGCHLD);
+ if (sigaction(SIGCHLD, &sa, NULL)) {
+ pr_perror("Unable to setup SIGCHLD handler");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Put the SIGCHLD disposition of the calling process back to SIG_DFL.
+ *
+ * SA_SIGINFO is not requested: with that flag set the sa_sigaction member
+ * is the one consulted, while the default disposition is expressed via
+ * sa_handler.
+ *
+ * Returns 0 on success, -1 if sigaction() fails.
+ */
+static int restore_child_handler(void)
+{
+	struct sigaction sa = {
+		.sa_handler	= SIG_DFL, /* XXX -- should be original? */
+		.sa_flags	= SA_RESTART,
+	};
+
+	sigemptyset(&sa.sa_mask);
+	sigaddset(&sa.sa_mask, SIGCHLD);
+	if (sigaction(SIGCHLD, &sa, NULL)) {
+		pr_perror("Unable to restore SIGCHLD handler");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Resume task @pid at @ip on @stack with all signals blocked.
+ *
+ * @cmd is the ptrace request used to resume the task, @regs is the
+ * register set that gets adjusted by parasite_setup_regs() and loaded
+ * into the task. If something fails after the task state has been
+ * touched, the registers and/or signal mask saved in @octx are put back.
+ *
+ * Returns 0 when the task has been resumed, -1 otherwise.
+ */
+static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack,
+		user_regs_struct_t *regs, struct thread_ctx *octx)
+{
+	k_rtsigset_t all_sigs;
+
+	ksigfillset(&all_sigs);
+	if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &all_sigs)) {
+		pr_perror("Can't block signals for %d", pid);
+		return -1;
+	}
+
+	parasite_setup_regs(ip, stack, regs);
+	if (ptrace_set_regs(pid, regs)) {
+		pr_perror("Can't set registers for %d", pid);
+		goto undo_sigmask;
+	}
+
+	if (ptrace(cmd, pid, NULL, NULL) == 0)
+		return 0;
+
+	pr_perror("Can't run parasite at %d", pid);
+
+	/* Roll back in reverse order: registers first, then the mask */
+	if (ptrace_set_regs(pid, &octx->regs))
+		pr_perror("Can't restore regs for %d", pid);
+undo_sigmask:
+	if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask))
+		pr_perror("Can't restore sigmask for %d", pid);
+
+	return -1;
+}
+
+/*
+ * Load the registers and the signal mask saved in @ctx back into task @pid.
+ *
+ * Both steps are always attempted, so a failure to restore the registers
+ * does not prevent the signal mask from being restored.
+ *
+ * Returns 0 if both succeeded, -1 if at least one of them failed.
+ */
+static int restore_thread_ctx(int pid, struct thread_ctx *ctx)
+{
+	int ret = 0;
+
+	if (ptrace_set_regs(pid, &ctx->regs)) {
+		pr_perror("Can't restore registers (pid: %d)", pid);
+		ret = -1;
+	}
+	if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) {
+		pr_perror("Can't restore sigmask (pid: %d)", pid);
+		ret = -1;
+	}
+
+	return ret;
+}
+
+
+/*
+ * Wait for task @pid, which is expected to be running at @regs->ip, to
+ * stop, and check that it stopped with SIGTRAP and si_code ARCH_SI_TRAP.
+ *
+ * The registers of the stopped task are read into @regs. Whatever the
+ * outcome, the context saved in @octx is loaded back into the task
+ * before returning. @ctl is not used by this function.
+ *
+ * Returns 0 if the expected trap was seen and the context was restored,
+ * -1 otherwise.
+ */
+static int parasite_trap(struct parasite_ctl *ctl, pid_t pid,
+ user_regs_struct_t *regs,
+ struct thread_ctx *octx)
+{
+ siginfo_t siginfo;
+ int status;
+ int ret = -1;
+
+ /*
+ * Most ideas are taken from Tejun Heo's parasite thread
+ * https://code.google.com/p/ptrace-parasite/
+ */
+
+ if (wait4(pid, &status, __WALL, NULL) != pid) {
+ pr_perror("Waited pid mismatch (pid: %d)", pid);
+ goto err;
+ }
+
+ if (!WIFSTOPPED(status)) {
+ pr_err("Task is still running (pid: %d)\n", pid);
+ goto err;
+ }
+
+ if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) {
+ pr_perror("Can't get siginfo (pid: %d)", pid);
+ goto err;
+ }
+
+ /* Fetched before the trap check, so @regs is filled even on a wrong stop */
+ if (ptrace_get_regs(pid, regs)) {
+ pr_perror("Can't obtain registers (pid: %d)", pid);
+ goto err;
+ }
+
+ if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) {
+ pr_debug("** delivering signal %d si_code=%d\n",
+ siginfo.si_signo, siginfo.si_code);
+
+ pr_err("Unexpected %d task interruption, aborting\n", pid);
+ goto err;
+ }
+
+ /*
+ * We've reached this point if int3 is triggered inside our
+ * parasite code. So we're done.
+ */
+ ret = 0;
+err:
+ /* Reached on success too: the original context is always put back */
+ if (restore_thread_ctx(pid, octx))
+ ret = -1;
+
+ return ret;
+}
+
+
+/*
+ * Run the BUILTIN_SYSCALL_SIZE bytes of @code_syscall inside the task
+ * ctl->rpid at address ctl->ictx.syscall_ip.
+ *
+ * The code found at that address is saved, replaced with @code_syscall,
+ * the task is resumed with @regs and waited for until it traps; then the
+ * saved code is written back. On return @regs holds the registers read
+ * at the trap.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int compel_execute_syscall(struct parasite_ctl *ctl,
+ user_regs_struct_t *regs, const char *code_syscall)
+{
+ pid_t pid = ctl->rpid;
+ int err;
+ uint8_t code_orig[BUILTIN_SYSCALL_SIZE];
+
+ /*
+ * Inject syscall instruction and remember original code,
+ * we will need it to restore original program content.
+ */
+ memcpy(code_orig, code_syscall, sizeof(code_orig));
+ /* After the swap code_orig holds what was at syscall_ip in the task */
+ if (ptrace_swap_area(pid, (void *)ctl->ictx.syscall_ip,
+ (void *)code_orig, sizeof(code_orig))) {
+ pr_err("Can't inject syscall blob (pid: %d)\n", pid);
+ return -1;
+ }
+
+ err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig);
+ if (!err)
+ err = parasite_trap(ctl, pid, regs, &ctl->orig);
+
+ /* The original code is written back even if running the blob failed */
+ if (ptrace_poke_area(pid, (void *)code_orig,
+ (void *)ctl->ictx.syscall_ip, sizeof(code_orig))) {
+ pr_err("Can't restore syscall blob (pid: %d)\n", ctl->rpid);
+ err = -1;
+ }
+
+ return err;
+}
+
+/*
+ * Accept the connection on the transport socket and store the connected
+ * descriptor in ctl->tsock.
+ *
+ * On entry ctl->tsock holds the listening socket negated, the way
+ * prepare_tsock() leaves it. If accept() fails the listening socket is
+ * closed.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int accept_tsock(struct parasite_ctl *ctl)
+{
+	int listen_sk = -ctl->tsock;
+	int conn_sk;
+
+	conn_sk = accept(listen_sk, NULL, 0);
+	if (conn_sk >= 0) {
+		ctl->tsock = conn_sk;
+		return 0;
+	}
+
+	pr_perror("Can't accept connection to the transport socket");
+	close(listen_sk);
+	return -1;
+}
+
+/*
+ * Start the parasite in the task ctl->rpid with the INIT_DAEMON command
+ * and wait until it reports that it has switched to daemon mode.
+ *
+ * On success ctl->tsock is the connected transport socket,
+ * ctl->sigreturn_addr is taken from the parasite's init arguments and
+ * ctl->daemonized is set.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int parasite_init_daemon(struct parasite_ctl *ctl)
+{
+ struct parasite_init_args *args;
+ pid_t pid = ctl->rpid;
+ user_regs_struct_t regs;
+ struct ctl_msg m = { };
+
+ *ctl->addr_cmd = PARASITE_CMD_INIT_DAEMON;
+
+ args = compel_parasite_args(ctl, struct parasite_init_args);
+
+ args->sigframe = (uintptr_t)ctl->rsigframe;
+ args->log_level = compel_log_get_loglevel();
+
+ /* Reset before the parasite starts; it is waited on below */
+ futex_set(&args->daemon_connected, 0);
+
+ if (prepare_tsock(ctl, pid, args))
+ goto err;
+
+ /* after this we can catch parasite errors in chld handler */
+ if (setup_child_handler(ctl))
+ goto err;
+
+ /* Work on a copy so that the saved original registers stay intact */
+ regs = ctl->orig.regs;
+ if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, &regs, &ctl->orig))
+ goto err;
+
+ /*
+ * Wait for daemon_connected to change from 0: the value 1 is treated
+ * as success, anything else as a negated errno value.
+ */
+ futex_wait_while_eq(&args->daemon_connected, 0);
+ if (futex_get(&args->daemon_connected) != 1) {
+ errno = -(int)futex_get(&args->daemon_connected);
+ pr_perror("Unable to connect a transport socket");
+ goto err;
+ }
+
+ if (accept_tsock(ctl) < 0)
+ goto err;
+
+ if (compel_util_send_fd(ctl, ctl->ictx.log_fd))
+ goto err;
+
+ pr_info("Wait for parasite being daemonized...\n");
+
+ if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m)) {
+ pr_err("Can't switch parasite %d to daemon mode %d\n",
+ pid, m.err);
+ goto err;
+ }
+
+ ctl->sigreturn_addr = (void*)(uintptr_t)args->sigreturn_addr;
+ ctl->daemonized = true;
+ pr_info("Parasite %d has been switched to daemon mode\n", pid);
+ return 0;
+err:
+ return -1;
+}
+
+/*
+ * Save the task registers, build the sigframe and switch the parasite
+ * to daemon mode.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int parasite_start_daemon(struct parasite_ctl *ctl)
+{
+	struct infect_ctx *ictx = &ctl->ictx;
+	pid_t pid = ctl->rpid;
+
+	/*
+	 * The registers have to be fetched now: compel_get_task_regs
+	 * needs to call ptrace on a _stopped_ task, and once the
+	 * parasite runs as a daemon the task is not stopped any more.
+	 */
+	if (compel_get_task_regs(pid, ctl->orig.regs, ictx->save_regs, ictx->regs_arg)) {
+		pr_err("Can't obtain regs for thread %d\n", pid);
+		return -1;
+	}
+
+	if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask))
+		return -1;
+
+	return parasite_init_daemon(ctl) ? -1 : 0;
+}
+
+/*
+ * Legacy way of setting up the exchange area: create an anonymous shared
+ * mapping of @size bytes inside the task and map the same memory locally
+ * through the task's map_files/ entry.
+ *
+ * Sets ctl->remote_map, ctl->map_length and ctl->local_map.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size)
+{
+	int map_fd;
+
+	ctl->remote_map = remote_mmap(ctl, NULL, size,
+				      PROT_READ | PROT_WRITE | PROT_EXEC,
+				      MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+	if (ctl->remote_map == NULL) {
+		pr_err("Can't allocate memory for parasite blob (pid: %d)\n", ctl->rpid);
+		return -1;
+	}
+
+	ctl->map_length = round_up(size, page_size());
+
+	map_fd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "map_files/%p-%p",
+				     ctl->remote_map, ctl->remote_map + ctl->map_length);
+	if (map_fd < 0)
+		return -1;
+
+	ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			      MAP_SHARED | MAP_FILE, map_fd, 0);
+	/* The descriptor is not needed whether or not the mapping succeeded */
+	close(map_fd);
+
+	if (ctl->local_map != MAP_FAILED)
+		return 0;
+
+	ctl->local_map = NULL;
+	pr_perror("Can't map remote parasite map");
+	return -1;
+}
+
+/*
+ * Set up the exchange area on top of a memfd created inside the task:
+ * the memfd is created remotely, opened locally through the task's fd/
+ * entry, resized and mapped on both sides.
+ *
+ * Sets ctl->map_length, ctl->remote_map and ctl->local_map.
+ *
+ * Returns 0 on success, 1 if memfd can't be used (INFECT_NO_MEMFD is set
+ * or the syscall result is -ENOSYS) so the caller may fall back to
+ * another method, and a negative value on error.
+ */
+static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size)
+{
+ void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE;
+ uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME;
+ pid_t pid = ctl->rpid;
+ unsigned long sret = -ENOSYS;
+ int ret, fd, lfd;
+ bool __maybe_unused compat_task = !compel_mode_native(ctl);
+
+ if (ctl->ictx.flags & INFECT_NO_MEMFD)
+ return 1;
+
+ BUILD_BUG_ON(sizeof(orig_code) < sizeof(long));
+
+ /*
+ * Place the memfd name into the task right after the syscall blob;
+ * orig_code receives the bytes that were there before.
+ */
+ if (ptrace_swap_area(pid, where, (void *)orig_code, sizeof(orig_code))) {
+ pr_err("Can't inject memfd args (pid: %d)\n", pid);
+ return -1;
+ }
+
+ ret = compel_syscall(ctl, __NR(memfd_create, compat_task), &sret,
+ (unsigned long)where, 0, 0, 0, 0, 0);
+
+ /* Put the original bytes back before looking at the syscall result */
+ if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) {
+ fd = (int)(long)sret;
+ if (fd >= 0)
+ compel_syscall(ctl, __NR(close, compat_task), &sret,
+ fd, 0, 0, 0, 0, 0);
+ pr_err("Can't restore memfd args (pid: %d)\n", pid);
+ return -1;
+ }
+
+ if (ret < 0)
+ return ret;
+
+ /* fd is a descriptor number valid inside the task, not locally */
+ fd = (int)(long)sret;
+ if (fd == -ENOSYS)
+ return 1;
+ if (fd < 0)
+ return fd;
+
+ ctl->map_length = round_up(size, page_size());
+ lfd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "fd/%d", fd);
+ if (lfd < 0)
+ goto err_cure;
+
+ if (ftruncate(lfd, ctl->map_length) < 0) {
+ pr_perror("Fail to truncate memfd for parasite");
+ goto err_cure;
+ }
+
+ ctl->remote_map = remote_mmap(ctl, NULL, size,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_FILE | MAP_SHARED, fd, 0);
+ if (!ctl->remote_map) {
+ pr_err("Can't rmap memfd for parasite blob\n");
+ goto err_curef;
+ }
+
+ ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FILE, lfd, 0);
+ if (ctl->local_map == MAP_FAILED) {
+ ctl->local_map = NULL;
+ pr_perror("Can't lmap memfd for parasite blob");
+ goto err_curef;
+ }
+
+ /* Both mappings exist, neither descriptor is needed any longer */
+ compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0);
+ close(lfd);
+
+ pr_info("Set up parasite blob using memfd\n");
+ return 0;
+
+err_curef:
+ close(lfd);
+err_cure:
+ /*
+ * NOTE(review): the ftruncate() failure path jumps here with lfd
+ * still open, and a remote mapping created before a local mmap()
+ * failure is not unmapped here -- confirm whether callers clean up.
+ */
+ compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0);
+ return -1;
+}
+
+/*
+ * Apply @nr_relocs relocations from @elf_relocs to the blob copy at @mem,
+ * assuming the blob will be executed at address @vbase.
+ *
+ * COMPEL_TYPE_LONG and COMPEL_TYPE_INT entries get value + addend + @vbase
+ * written at mem + offset as a long or an int respectively.
+ * COMPEL_TYPE_LONG entries that also carry COMPEL_TYPE_GOTPCREL get a slot
+ * in a table of longs located at mem + @size: the slot receives
+ * @vbase + value and the relocated place receives an int offset to that
+ * slot. Any other type triggers BUG().
+ *
+ * NOTE(review): the slot table is assumed to fit in the memory following
+ * mem + @size -- confirm against how callers size the mapping.
+ */
+void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs)
+{
+ size_t i, j;
+
+ /* j counts the slots handed out so far in the table at mem + size */
+ for (i = 0, j = 0; i < nr_relocs; i++) {
+ if (elf_relocs[i].type & COMPEL_TYPE_LONG) {
+ long *where = mem + elf_relocs[i].offset;
+ long *p = mem + size;
+
+ if (elf_relocs[i].type & COMPEL_TYPE_GOTPCREL) {
+ int *value = (int *)where;
+ int rel;
+
+ p[j] = (long)vbase + elf_relocs[i].value;
+ /* Distance from the relocated place to the slot, plus addend */
+ rel = (unsigned)((void *)&p[j] - (void *)mem) - elf_relocs[i].offset + elf_relocs[i].addend;
+
+ *value = rel;
+ j++;
+ } else
+ *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase;
+ } else if (elf_relocs[i].type & COMPEL_TYPE_INT) {
+ int *where = (mem + elf_relocs[i].offset);
+ *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase;
+ } else
+ BUG();
+ }
+}
+
+/*
+ * Create the memory area of @size bytes shared between us and the task:
+ * try the memfd based method first and fall back to the legacy mmap one
+ * if memfd reports that it can't be used.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size)
+{
+	int ret = parasite_memfd_exchange(ctl, size);
+
+	/* Only the value 1 asks for the fallback */
+	if (ret != 1)
+		return ret;
+
+	pr_info("MemFD parasite doesn't work, goto legacy mmap\n");
+	return parasite_mmap_exchange(ctl, size);
+}
+
+/*
+ * Inject the parasite blob described by ctl->pblob into the task and
+ * start it in daemon mode.
+ *
+ * @nr_threads - when greater than one, room for one more stack is
+ *               reserved and ctl->r_thread_stack is set up
+ * @args_size  - size of the arguments area, rounded up to PAGE_SIZE
+ *
+ * The exchange area is laid out as follows, in order of increasing
+ * offset: blob, arguments, sigframe (RESTORE_STACK_SIGFRAME), stack
+ * (PARASITE_STACK_SIZE) and, optionally, the thread stack
+ * (PARASITE_STACK_SIZE).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size)
+{
+ int ret;
+ unsigned long p, map_exchange_size, parasite_size = 0;
+
+ /* A log descriptor must have been set in the infect context */
+ if (ctl->ictx.log_fd < 0)
+ goto err;
+
+ if (!arch_can_dump_task(ctl))
+ goto err;
+
+ /*
+ * Inject a parasite engine. Ie allocate memory inside alien
+ * space and copy engine code there. Then re-map the engine
+ * locally, so we will get an easy way to access engine memory
+ * without using ptrace at all.
+ */
+
+ parasite_size = ctl->pblob.size;
+
+ ctl->args_size = round_up(args_size, PAGE_SIZE);
+ parasite_size += ctl->args_size;
+
+ map_exchange_size = parasite_size;
+ map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE;
+ if (nr_threads > 1)
+ map_exchange_size += PARASITE_STACK_SIZE;
+
+ ret = compel_map_exchange(ctl, map_exchange_size);
+ if (ret)
+ goto err;
+
+ pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map);
+
+ /* Code address is remote, command and arguments are accessed locally */
+ ctl->parasite_ip = (unsigned long)(ctl->remote_map + ctl->pblob.parasite_ip_off);
+ ctl->addr_cmd = ctl->local_map + ctl->pblob.addr_cmd_off;
+ ctl->addr_args = ctl->local_map + ctl->pblob.addr_arg_off;
+
+ memcpy(ctl->local_map, ctl->pblob.mem, ctl->pblob.size);
+ if (ctl->pblob.nr_relocs)
+ compel_relocs_apply(ctl->local_map, ctl->remote_map, ctl->pblob.bsize,
+ ctl->pblob.relocs, ctl->pblob.nr_relocs);
+
+ p = parasite_size;
+
+ ctl->rsigframe = ctl->remote_map + p;
+ ctl->sigframe = ctl->local_map + p;
+
+ /*
+ * rstack is set to the end of the stack region, not to its start
+ * (presumably because the stack grows down -- confirm per arch).
+ */
+ p += RESTORE_STACK_SIGFRAME;
+ p += PARASITE_STACK_SIZE;
+ ctl->rstack = ctl->remote_map + p;
+
+ if (nr_threads > 1) {
+ p += PARASITE_STACK_SIZE;
+ ctl->r_thread_stack = ctl->remote_map + p;
+ }
+
+ if (parasite_start_daemon(ctl))
+ goto err;
+
+ return 0;
+
+err:
+ return -1;
+}
+
+/*
+ * Save the current signal mask and registers of task @pid into @ctx, so
+ * they can be put back later.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int compel_prepare_thread(int pid, struct thread_ctx *ctx)
+{
+	long err;
+
+	err = ptrace(PTRACE_GETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask);
+	if (err != 0) {
+		pr_perror("can't get signal blocking mask for %d", pid);
+		return -1;
+	}
+
+	if (ptrace_get_regs(pid, &ctx->regs) != 0) {
+		pr_perror("Can't obtain registers (pid: %d)", pid);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a parasite control block for task @pid and save the task's
+ * original registers and signal mask in it.
+ *
+ * Returns the new control block, or NULL on failure.
+ */
+struct parasite_ctl *compel_prepare(int pid)
+{
+	struct parasite_ctl *ctl = xzalloc(sizeof(*ctl));
+
+	BUILD_BUG_ON(PARASITE_START_AREA_MIN < BUILTIN_SYSCALL_SIZE + MEMFD_FNAME_SZ);
+
+	if (ctl == NULL) {
+		pr_err("Parasite control block allocation failed (pid: %d)\n", pid);
+	} else {
+		/* Mark both descriptors as not set yet */
+		ctl->tsock = -1;
+		ctl->ictx.log_fd = -1;
+
+		if (compel_prepare_thread(pid, &ctl->orig) == 0) {
+			ctl->rpid = pid;
+			return ctl;
+		}
+	}
+
+	xfree(ctl);
+	return NULL;
+}
+
+/*
+ * Tell whether the instruction pointer from @regs lies inside the remote
+ * parasite mapping [remote_map, remote_map + map_length).
+ */
+static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs)
+{
+	void *ip = (void *) REG_IP(*regs);
+
+	if (ip < ctl->remote_map)
+		return false;
+
+	return ip < ctl->remote_map + ctl->map_length;
+}
+
+/*
+ * Make the parasite daemon in task ctl->rpid leave daemon mode: interrupt
+ * the task, check that it is inside the parasite code, send it the FINI
+ * command and trace it up to the exit from rt_sigreturn.
+ *
+ * ctl->tsock is closed once the FINI command has been sent.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+static int parasite_fini_seized(struct parasite_ctl *ctl)
+{
+ pid_t pid = ctl->rpid;
+ user_regs_struct_t regs;
+ int status, ret = 0;
+ enum trace_flags flag;
+
+ /* stop getting chld from parasite -- we're about to step-by-step it */
+ if (restore_child_handler())
+ return -1;
+
+ /* Start to trace syscalls for each thread */
+ if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) {
+ pr_perror("Unable to interrupt the process");
+ return -1;
+ }
+
+ pr_debug("Waiting for %d to trap\n", pid);
+ if (wait4(pid, &status, __WALL, NULL) != pid) {
+ pr_perror("Waited pid mismatch (pid: %d)", pid);
+ return -1;
+ }
+
+ pr_debug("Daemon %d exited trapping\n", pid);
+ if (!WIFSTOPPED(status)) {
+ pr_err("Task is still running (pid: %d)\n", pid);
+ return -1;
+ }
+
+ ret = ptrace_get_regs(pid, &regs);
+ if (ret) {
+ pr_perror("Unable to get registers");
+ return -1;
+ }
+
+ if (!task_in_parasite(ctl, &regs)) {
+ pr_err("The task is not in parasite code\n");
+ return -1;
+ }
+
+ /* The socket is closed whether or not the command went through */
+ ret = compel_rpc_call(PARASITE_CMD_FINI, ctl);
+ close_safe(&ctl->tsock);
+ if (ret)
+ return -1;
+
+ /* Get as close to sigreturn as we can */
+ ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag,
+ ctl->ictx.flags & INFECT_NO_BREAKPOINTS);
+ if (ret < 0)
+ return ret;
+
+ if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0),
+ __NR(rt_sigreturn, 1), flag))
+ return -1;
+
+ if (ptrace_flush_breakpoints(pid))
+ return -1;
+
+ /*
+ * All signals are unblocked now. The kernel notifies about leaving
+ * syscall before starting to deliver signals. All parasite code is
+ * executed with blocked signals, so we can safely unmap a parasite blob.
+ */
+
+ return 0;
+}
+
+/*
+ * Stop the parasite daemon if it is running.
+ *
+ * Returns 0 if the daemon was not running or has been stopped, -1 if it
+ * could not be stopped; ctl->daemonized is cleared only in the former
+ * case.
+ */
+int compel_stop_daemon(struct parasite_ctl *ctl)
+{
+	bool running = ctl->daemonized;
+
+	/*
+	 * A daemon without a transport socket means a previous attempt
+	 * failed, we should do nothing in this case. parasite will try
+	 * to cure itself.
+	 */
+	if (running && ctl->tsock < 0)
+		return -1;
+
+	if (running && parasite_fini_seized(ctl)) {
+		close_safe(&ctl->tsock);
+		return -1;
+	}
+
+	ctl->daemonized = false;
+	return 0;
+}
+
+/*
+ * Remove the parasite from the task: stop the daemon and unmap the remote
+ * exchange area.
+ *
+ * When the blob has been set up (ctl->addr_cmd is set) the parasite is
+ * asked to unmap itself; otherwise munmap is executed in the task by
+ * means of compel_syscall().
+ *
+ * Returns 0 on success (or if there is nothing mapped), -1 on failure.
+ */
+int compel_cure_remote(struct parasite_ctl *ctl)
+{
+	if (compel_stop_daemon(ctl))
+		return -1;
+
+	if (!ctl->remote_map)
+		return 0;
+
+	/* Unseizing task with parasite -- it does it himself */
+	if (ctl->addr_cmd) {
+		struct parasite_unmap_args *args;
+
+		*ctl->addr_cmd = PARASITE_CMD_UNMAP;
+
+		args = compel_parasite_args(ctl, struct parasite_unmap_args);
+		args->parasite_start = ctl->remote_map;
+		args->parasite_len = ctl->map_length;
+		if (compel_unmap(ctl, ctl->parasite_ip))
+			return -1;
+	} else {
+		unsigned long ret = 0;
+
+		/*
+		 * The syscall result in ret is meaningful only if the
+		 * syscall has actually been executed in the task, so
+		 * check that first.
+		 */
+		if (compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret,
+				(unsigned long)ctl->remote_map, ctl->map_length,
+				0, 0, 0, 0)) {
+			pr_err("Can't execute munmap for remote map %p, %lu\n",
+					ctl->remote_map, ctl->map_length);
+			return -1;
+		}
+
+		if (ret) {
+			pr_err("munmap for remote map %p, %lu returned %lu\n",
+					ctl->remote_map, ctl->map_length, ret);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release local resources: unmap the local view of the exchange area, if
+ * any, and free the control block. @ctl must not be used afterwards.
+ *
+ * Returns 0 on success, -1 if munmap() failed (the block is freed anyway).
+ */
+int compel_cure_local(struct parasite_ctl *ctl)
+{
+	int ret = 0;
+
+	if (ctl->local_map && munmap(ctl->local_map, ctl->map_length)) {
+		pr_err("munmap failed (pid: %d)\n", ctl->rpid);
+		ret = -1;
+	}
+
+	free(ctl);
+	return ret;
+}
+
+/*
+ * Remove the parasite from the task and then release local resources.
+ * The local part is skipped if the remote part fails.
+ *
+ * Returns 0 on success and a non-zero value on failure.
+ */
+int compel_cure(struct parasite_ctl *ctl)
+{
+	int err = compel_cure_remote(ctl);
+
+	return err ? err : compel_cure_local(ctl);
+}
+
+/* Return the local address of the parasite arguments area. */
+void *compel_parasite_args_p(struct parasite_ctl *ctl)
+{
+	void *args = ctl->addr_args;
+
+	return args;
+}
+
+/*
+ * Return the local address of the parasite arguments area after checking
+ * that @args_size bytes fit into it; BUG_ON() fires if they do not.
+ */
+void *compel_parasite_args_s(struct parasite_ctl *ctl, int args_size)
+{
+	BUG_ON(args_size > ctl->args_size);
+
+	return ctl->addr_args;
+}
+
+/*
+ * Execute parasite command @cmd in thread @pid on the dedicated thread
+ * stack and wait for it to trap.
+ *
+ * @octx holds the thread's original registers and signal mask; the
+ * command runs on a copy of those registers.
+ *
+ * Returns 0 on success; otherwise either the error of running/trapping
+ * the thread or the non-zero result left by the parasite.
+ */
+int compel_run_in_thread(pid_t pid, unsigned int cmd,
+		struct parasite_ctl *ctl,
+		struct thread_ctx *octx)
+{
+	user_regs_struct_t regs = octx->regs;
+	void *thread_stack = ctl->r_thread_stack;
+	int rc;
+
+	*ctl->addr_cmd = cmd;
+
+	rc = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, thread_stack, &regs, octx);
+	if (!rc) {
+		rc = parasite_trap(ctl, pid, &regs, octx);
+		if (!rc)
+			rc = (int)REG_RES(regs);
+	}
+
+	if (rc)
+		pr_err("Parasite exited with %d\n", rc);
+
+	return rc;
+}
+
+/*
+ * compel_unmap() is used for unmapping parasite and restorer blobs.
+ * A blob can contain code for unmapping itself, so the process is
+ * trapped on the exit from the munmap syscall.
+ */
+int compel_unmap(struct parasite_ctl *ctl, unsigned long addr)
+{
+	user_regs_struct_t regs = ctl->orig.regs;
+	pid_t pid = ctl->rpid;
+	int ret;
+
+	/* Run the code at @addr, stopping at syscall boundaries */
+	ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, &regs, &ctl->orig);
+	if (ret)
+		return ret;
+
+	ret = compel_stop_on_syscall(1, __NR(munmap, 0),
+			__NR(munmap, 1), TRACE_ENTER);
+
+	/* The original context is put back whatever the tracing result is */
+	if (restore_thread_ctx(pid, &ctl->orig))
+		ret = -1;
+
+	return ret;
+}
+
+/*
+ * Arrange for task @pid to be stopped near @addr.
+ *
+ * Unless @no_bp is set, a breakpoint at @addr is tried first. If no
+ * breakpoint has been set, the task is resumed with PTRACE_SYSCALL.
+ * *@tf receives the trace state to pass to compel_stop_on_syscall().
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp)
+{
+	int bp = 0;
+
+	if (no_bp)
+		pr_debug("Force no-breakpoints restore\n");
+	else
+		bp = ptrace_set_breakpoint(pid, addr);
+
+	if (bp < 0)
+		return bp;
+
+	if (bp > 0) {
+		/*
+		 * PIE will stop on a breakpoint, next
+		 * stop after that will be syscall enter.
+		 */
+		*tf = TRACE_EXIT;
+		return 0;
+	}
+
+	/*
+	 * No breakpoints available -- start tracing it
+	 * in a per-syscall manner.
+	 */
+	if (ptrace(PTRACE_SYSCALL, pid, NULL, NULL)) {
+		pr_perror("Unable to restart the %d process", pid);
+		return -1;
+	}
+
+	*tf = TRACE_ENTER;
+	return 0;
+}
+
+/*
+ * Check that the wait status @status of task @pid describes a stop with
+ * SIGTRAP. If it does not, log what the status actually says.
+ */
+static bool task_is_trapped(int status, pid_t pid)
+{
+	bool trapped = WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP;
+
+	if (trapped)
+		return true;
+
+	pr_err("Task %d is in unexpected state: %x\n", pid, status);
+
+	if (WIFEXITED(status)) {
+		pr_err("Task exited with %d\n", WEXITSTATUS(status));
+	}
+	if (WIFSIGNALED(status)) {
+		pr_err("Task signaled with %d: %s\n",
+			WTERMSIG(status), strsignal(WTERMSIG(status)));
+	}
+	if (WIFSTOPPED(status)) {
+		pr_err("Task stopped with %d: %s\n",
+			WSTOPSIG(status), strsignal(WSTOPSIG(status)));
+	}
+	if (WIFCONTINUED(status)) {
+		pr_err("Task continued\n");
+	}
+
+	return false;
+}
+
+/*
+ * Tell whether the syscall number found in @regs is the required one:
+ * @sys_nr for a native register set, @sys_nr_compat otherwise.
+ * @pid is used for logging only.
+ */
+static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid,
+		const int sys_nr, const int sys_nr_compat)
+{
+	const char *mode;
+	int req_sysnr;
+
+	if (user_regs_native(regs)) {
+		mode = "native";
+		req_sysnr = sys_nr;
+	} else {
+		mode = "compat";
+		req_sysnr = sys_nr_compat;
+	}
+
+	pr_debug("%d (%s) is going to execute the syscall %lu, required is %d\n",
+		pid, mode, REG_SYSCALL_NR(*regs), req_sysnr);
+
+	return (REG_SYSCALL_NR(*regs) == req_sysnr);
+}
+
+/*
+ * Trap tasks on the exit from the specified syscall
+ *
+ * tasks - number of processes, which should be trapped
+ * sys_nr - the required syscall number
+ * sys_nr_compat - the required compatible syscall number
+ * trace - what the next stop is expected to be: with TRACE_EXIT the
+ *         next stop is skipped, with TRACE_ENTER the registers are
+ *         examined on the next stop; if there is more than one task,
+ *         TRACE_ALL is used instead and every stop is examined
+ *
+ * Any child of the caller is waited for, the tasks are not identified
+ * by pid. Returns 0 when all the tasks are trapped, -1 on failure.
+ */
+int compel_stop_on_syscall(int tasks,
+ const int sys_nr, const int sys_nr_compat,
+ enum trace_flags trace)
+{
+ user_regs_struct_t regs;
+ int status, ret;
+ pid_t pid;
+
+ if (tasks > 1)
+ trace = TRACE_ALL;
+
+ /* Stop all tasks on the exit from the required syscall */
+ while (tasks) {
+ pid = wait4(-1, &status, __WALL, NULL);
+ if (pid == -1) {
+ pr_perror("wait4 failed");
+ return -1;
+ }
+
+ if (!task_is_trapped(status, pid))
+ return -1;
+
+ pr_debug("%d was trapped\n", pid);
+
+ /* The flag flips between ENTER and EXIT on every stop */
+ if (trace == TRACE_EXIT) {
+ trace = TRACE_ENTER;
+ pr_debug("`- Expecting exit\n");
+ goto goon;
+ }
+ if (trace == TRACE_ENTER)
+ trace = TRACE_EXIT;
+
+ ret = ptrace_get_regs(pid, &regs);
+ if (ret) {
+ pr_perror("ptrace");
+ return -1;
+ }
+
+ if (is_required_syscall(&regs, pid, sys_nr, sys_nr_compat)) {
+ /*
+ * The process is going to execute the required syscall,
+ * the next stop will be on the exit from this syscall
+ */
+ ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
+ if (ret) {
+ pr_perror("ptrace");
+ return -1;
+ }
+
+ pid = wait4(pid, &status, __WALL, NULL);
+ if (pid == -1) {
+ pr_perror("wait4 failed");
+ return -1;
+ }
+
+ if (!task_is_trapped(status, pid))
+ return -1;
+
+ /* This task is done and is left stopped */
+ pr_debug("%d was stopped\n", pid);
+ tasks--;
+ continue;
+ }
+goon:
+ /* Not the stop we are looking for, let the task run further */
+ ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
+ if (ret) {
+ pr_perror("ptrace");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/* Result of user_regs_native() for the task's saved original registers. */
+int compel_mode_native(struct parasite_ctl *ctl)
+{
+	user_regs_struct_t *saved = &ctl->orig.regs;
+
+	return user_regs_native(saved);
+}
+
+/* Return a pointer to the signal mask stored in the task's original context. */
+k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl)
+{
+	struct thread_ctx *octx = &ctl->orig;
+
+	return &octx->sigmask;
+}
+
+/* Return a pointer to the infect context embedded in @ctl. */
+struct infect_ctx *compel_infect_ctx(struct parasite_ctl *ctl)
+{
+	struct infect_ctx *ictx = &ctl->ictx;
+
+	return ictx;
+}
+
+/* Return a pointer to the parasite blob descriptor embedded in @ctl. */
+struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *ctl)
+{
+	struct parasite_blob_desc *desc = &ctl->pblob;
+
+	return desc;
+}
diff --git a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c
new file mode 100644
index 000000000..c2991b5d8
--- /dev/null
+++ b/compel/src/lib/ptrace.c
@@ -0,0 +1,100 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <signal.h>
+#include <elf.h>
+#include <sys/uio.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+
+#include "common/compiler.h"
+
+#include "uapi/compel/asm/infect-types.h"
+#include "uapi/compel/ptrace.h"
+
+#include "log.h"
+
+/*
+ * Set the PTRACE_O_SUSPEND_SECCOMP option on the traced task @pid.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int suspend_seccomp(pid_t pid)
+{
+	if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) >= 0)
+		return 0;
+
+	pr_perror("suspending seccomp failed");
+	return -1;
+}
+
+/*
+ * Read @bytes bytes at @addr of the traced task @pid into the local
+ * buffer @dst, word by word.
+ *
+ * @bytes must be a non-negative multiple of sizeof(long).
+ *
+ * Returns 0 on success (errno is left as it was on entry), -1 if @bytes
+ * is invalid, -2 if reading a word failed (errno is set by ptrace).
+ */
+int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes)
+{
+	unsigned long *d = dst, *a = addr;
+	unsigned long w, nr_words;
+	int saved_errno = errno;
+
+	if (bytes < 0 || (bytes & (sizeof(long) - 1)))
+		return -1;
+
+	nr_words = bytes / sizeof(long);
+	for (w = 0; w < nr_words; w++) {
+		/*
+		 * PTRACE_PEEKDATA returns the word itself, so -1 may be
+		 * valid data. The only way to tell an error is to clear
+		 * errno before the call and check it afterwards. The
+		 * comparison has to be done in full word width, otherwise
+		 * a failure is never noticed where long is wider than int.
+		 */
+		errno = 0;
+		d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
+		if (d[w] == (unsigned long)-1 && errno)
+			return -2;
+	}
+
+	errno = saved_errno;
+	return 0;
+}
+
+/*
+ * Write @bytes bytes from the local buffer @src to @addr of the traced
+ * task @pid, word by word.
+ *
+ * Returns 0 on success, -1 if @bytes is not a multiple of sizeof(long),
+ * -2 if writing a word failed.
+ */
+int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes)
+{
+	unsigned long *from = src, *to = addr;
+	unsigned long i, nr_words;
+
+	if (bytes & (sizeof(long) - 1))
+		return -1;
+
+	nr_words = bytes / sizeof(long);
+	for (i = 0; i < nr_words; i++)
+		if (ptrace(PTRACE_POKEDATA, pid, to + i, from[i]))
+			return -2;
+
+	return 0;
+}
+
+/*
+ * Exchange @bytes bytes between the local buffer @src and the memory at
+ * @dst of the traced task @pid: the task gets the contents of @src and
+ * @src gets what the task had at @dst.
+ *
+ * Don't swap big space, it might overflow the stack: the temporary copy
+ * is kept in alloca()-ed memory.
+ *
+ * Returns 0 on success, -1 if nothing has been changed in the task and
+ * -2 if the write failed and the old contents could not be put back.
+ */
+int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes)
+{
+	void *saved = alloca(bytes);
+
+	if (ptrace_peek_area(pid, saved, dst, bytes))
+		return -1;
+
+	if (ptrace_poke_area(pid, src, dst, bytes) == 0) {
+		memcpy(src, saved, bytes);
+		return 0;
+	}
+
+	/* The write may have been partial, try to undo it */
+	return ptrace_poke_area(pid, saved, dst, bytes) ? -2 : -1;
+}
+
+/*
+ * Default (weak) implementation: read the NT_PRSTATUS register set of
+ * task @pid into @regs with PTRACE_GETREGSET.
+ *
+ * Returns what ptrace() returns.
+ */
+int __attribute__((weak)) ptrace_get_regs(int pid, user_regs_struct_t *regs)
+{
+	struct iovec iov = {
+		.iov_base	= regs,
+		.iov_len	= sizeof(user_regs_struct_t),
+	};
+
+	return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov);
+}
+
+/*
+ * Default (weak) implementation: load @regs into task @pid as its
+ * NT_PRSTATUS register set with PTRACE_SETREGSET.
+ *
+ * Returns what ptrace() returns.
+ */
+int __attribute__((weak)) ptrace_set_regs(int pid, user_regs_struct_t *regs)
+{
+	struct iovec iov = {
+		.iov_base	= regs,
+		.iov_len	= sizeof(user_regs_struct_t),
+	};
+
+	return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov);
+}