From cbe24fb9c438f147fdd70c41f2a66844457ffed4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 31 Oct 2016 15:06:48 +0300 Subject: compel: Move in parasite engine This is the final patch in the series. It does a bunch of renames and fixes headers respectively. Signed-off-by: Cyrill Gorcunov Signed-off-by: Andrei Vagin --- compel/Makefile | 5 + compel/arch/aarch64/src/lib/include/ptrace.h | 15 - .../aarch64/src/lib/include/uapi/asm/breakpoints.h | 15 + .../src/lib/include/uapi/asm/infect-types.h | 57 + compel/arch/aarch64/src/lib/infect.c | 111 ++ compel/arch/arm/src/lib/include/ptrace.h | 15 - .../arm/src/lib/include/uapi/asm/breakpoints.h | 15 + .../arm/src/lib/include/uapi/asm/infect-types.h | 91 ++ .../arch/arm/src/lib/include/uapi/asm/sigframe.h | 2 + compel/arch/arm/src/lib/infect.c | 122 ++ compel/arch/ppc64/src/lib/include/ptrace.h | 15 - .../ppc64/src/lib/include/uapi/asm/breakpoints.h | 15 + .../ppc64/src/lib/include/uapi/asm/infect-types.h | 110 ++ compel/arch/ppc64/src/lib/infect.c | 318 +++++ compel/arch/x86/src/lib/include/ptrace.h | 6 - .../x86/src/lib/include/uapi/asm/breakpoints.h | 6 + .../x86/src/lib/include/uapi/asm/infect-types.h | 159 +++ compel/arch/x86/src/lib/infect.c | 351 ++++++ compel/include/infect-priv.h | 59 + compel/include/rpc-pie-priv.h | 48 + compel/include/uapi/compel.h | 6 + compel/include/uapi/infect-rpc.h | 17 + compel/include/uapi/infect-util.h | 5 + compel/include/uapi/infect.h | 147 +++ compel/include/uapi/ptrace.h | 76 ++ compel/src/lib/infect-rpc.c | 101 ++ compel/src/lib/infect-util.c | 21 + compel/src/lib/infect.c | 1262 ++++++++++++++++++++ compel/src/lib/ptrace.c | 100 ++ 29 files changed, 3219 insertions(+), 51 deletions(-) delete mode 100644 compel/arch/aarch64/src/lib/include/ptrace.h create mode 100644 compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h create mode 100644 compel/arch/aarch64/src/lib/infect.c delete mode 
100644 compel/arch/arm/src/lib/include/ptrace.h create mode 100644 compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/arm/src/lib/include/uapi/asm/infect-types.h create mode 100644 compel/arch/arm/src/lib/infect.c delete mode 100644 compel/arch/ppc64/src/lib/include/ptrace.h create mode 100644 compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h create mode 100644 compel/arch/ppc64/src/lib/infect.c delete mode 100644 compel/arch/x86/src/lib/include/ptrace.h create mode 100644 compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/x86/src/lib/include/uapi/asm/infect-types.h create mode 100644 compel/arch/x86/src/lib/infect.c create mode 100644 compel/include/infect-priv.h create mode 100644 compel/include/rpc-pie-priv.h create mode 100644 compel/include/uapi/infect-rpc.h create mode 100644 compel/include/uapi/infect-util.h create mode 100644 compel/include/uapi/infect.h create mode 100644 compel/include/uapi/ptrace.h create mode 100644 compel/src/lib/infect-rpc.c create mode 100644 compel/src/lib/infect-util.c create mode 100644 compel/src/lib/infect.c create mode 100644 compel/src/lib/ptrace.c (limited to 'compel') diff --git a/compel/Makefile b/compel/Makefile index d421bc078..ad98e9d6d 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -23,6 +23,11 @@ lib-y += src/lib/log.o host-lib-y += src/lib/log.o lib-y += arch/$(ARCH)/src/lib/cpu.o +lib-y += arch/$(ARCH)/src/lib/infect.o +lib-y += src/lib/infect-rpc.o +lib-y += src/lib/infect-util.o +lib-y += src/lib/infect.o +lib-y += src/lib/ptrace.o ifeq ($(ARCH),x86) lib-y += src/lib/handle-elf-32.o diff --git a/compel/arch/aarch64/src/lib/include/ptrace.h b/compel/arch/aarch64/src/lib/include/ptrace.h deleted file mode 100644 index e18454df2..000000000 --- a/compel/arch/aarch64/src/lib/include/ptrace.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __COMPEL_PTRACE_H__ -#define 
__COMPEL_PTRACE_H__ -#define ARCH_SI_TRAP TRAP_BRKPT - -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} - -static inline int ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} - -#endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..5f090490d --- /dev/null +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..714881c57 --- /dev/null +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,57 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include +#include "common/page.h" + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/arm64/include/uapi/asm/ptrace.h + * + * A thread ARM CPU context + */ + +typedef struct user_pt_regs user_regs_struct_t; +typedef struct user_fpsimd_state user_fpregs_struct_t; + +#define REG_RES(r) ((uint64_t)(r).regs[0]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) + +#define user_regs_native(pregs) true + +/* + * Range for task size calculated from the following Linux kernel files: + * arch/arm64/include/asm/memory.h + * arch/arm64/Kconfig + * + * TODO: handle 32 bit tasks + */ +#define TASK_SIZE_MIN (1UL << 39) +#define TASK_SIZE_MAX (1UL << 48) + +static inline unsigned long task_size(void) +{ + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; 
task_size < TASK_SIZE_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} + +#define AT_VECTOR_SIZE 40 + +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) __NR_##syscall + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c new file mode 100644 index 000000000..fceea2816 --- /dev/null +++ b/compel/arch/aarch64/src/lib/infect.c @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include "uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */ + 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ +}; + +static const int +code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline void __always_unused __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t save, void *arg) +{ + struct iovec iov; + user_fpregs_struct_t fpsimd; + int ret; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + iov.iov_base = &regs; + iov.iov_len = sizeof(user_regs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { + pr_perror("Failed to obtain CPU registers for %d", pid); + goto err; + } + + iov.iov_base = &fpsimd; + iov.iov_len = sizeof(fpsimd); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + goto err; + } + + ret = save(arg, &regs, &fpsimd); +err: + return ret; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned
long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.regs[8] = (unsigned long)nr; + regs.regs[0] = arg1; + regs.regs[1] = arg2; + regs.regs[2] = arg3; + regs.regs[3] = arg4; + regs.regs[4] = arg5; + regs.regs[5] = arg6; + regs.regs[6] = 0; + regs.regs[7] = 0; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.regs[0]; + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + unsigned long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, + (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->sp = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here + */ + return true; +} diff --git a/compel/arch/arm/src/lib/include/ptrace.h b/compel/arch/arm/src/lib/include/ptrace.h deleted file mode 100644 index e18454df2..000000000 --- a/compel/arch/arm/src/lib/include/ptrace.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __COMPEL_PTRACE_H__ -#define __COMPEL_PTRACE_H__ -#define ARCH_SI_TRAP TRAP_BRKPT - -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} - -static inline int ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} - -#endif diff --git a/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..5f090490d --- /dev/null +++ b/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int
ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif diff --git a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..9c2092e5d --- /dev/null +++ b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,91 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include "common/page.h" + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/arm/include/asm/ptrace.h + * + * A thread ARM CPU context + */ + +typedef struct { + long uregs[18]; +} user_regs_struct_t; + +typedef struct user_vfp user_fpregs_struct_t; + +#define ARM_cpsr uregs[16] +#define ARM_pc uregs[15] +#define ARM_lr uregs[14] +#define ARM_sp uregs[13] +#define ARM_ip uregs[12] +#define ARM_fp uregs[11] +#define ARM_r10 uregs[10] +#define ARM_r9 uregs[9] +#define ARM_r8 uregs[8] +#define ARM_r7 uregs[7] +#define ARM_r6 uregs[6] +#define ARM_r5 uregs[5] +#define ARM_r4 uregs[4] +#define ARM_r3 uregs[3] +#define ARM_r2 uregs[2] +#define ARM_r1 uregs[1] +#define ARM_r0 uregs[0] +#define ARM_ORIG_r0 uregs[17] + + +/* Copied from arch/arm/include/asm/user.h */ + +struct user_vfp { + unsigned long long fpregs[32]; + unsigned long fpscr; +}; + +struct user_vfp_exc { + unsigned long fpexc; + unsigned long fpinst; + unsigned long fpinst2; +}; + +#define REG_RES(regs) ((regs).ARM_r0) +#define REG_IP(regs) ((regs).ARM_pc) +#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) + +#define user_regs_native(pregs) true + +/* + * Range for task size calculated from the following Linux kernel files: + * arch/arm/include/asm/memory.h + * arch/arm/Kconfig (PAGE_OFFSET values in Memory split section) + */ +#define TASK_SIZE_MIN 0x3f000000 +#define TASK_SIZE_MAX 0xbf000000 +#define SZ_1G 0x40000000 + +static inline unsigned long task_size(void) +{ + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < 
TASK_SIZE_MAX; task_size += SZ_1G) + if (munmap((void *)task_size, page_size())) + break; + + return task_size; +} + +#define AT_VECTOR_SIZE 40 + +typedef uint32_t auxv_t; +typedef uint32_t tls_t; + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) __NR_##syscall + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h b/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h index 65ae8a8b9..3e7bc0104 100644 --- a/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h @@ -1,6 +1,8 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ +#include + /* Copied from the Linux kernel header arch/arm/include/asm/sigcontext.h */ struct rt_sigcontext { diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c new file mode 100644 index 000000000..b440ff736 --- /dev/null +++ b/compel/arch/arm/src/lib/infect.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include "uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x00, 0x00, 0x00, 0xef, /* SVC #0 */ + 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ +}; + +static const int +code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline __always_unused void __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +#define PTRACE_GETVFPREGS 27 +int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t save, void *arg) +{ + user_fpregs_struct_t vfp; + int ret = -1; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (ptrace(PTRACE_GETVFPREGS, pid, NULL, &vfp)) { + pr_perror("Can't obtain FPU registers for %d", pid); + goto err; + } + + /* Did we come from a system call? 
*/ + if ((int)regs.ARM_ORIG_r0 >= 0) { + /* Restart the system call */ + switch ((long)(int)regs.ARM_r0) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs.ARM_r0 = regs.ARM_ORIG_r0; + regs.ARM_pc -= 4; + break; + case -ERESTART_RESTARTBLOCK: + regs.ARM_r0 = __NR_restart_syscall; + regs.ARM_pc -= 4; + break; + } + } + + ret = save(arg, &regs, &vfp); +err: + return ret; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.ARM_r7 = (unsigned long)nr; + regs.ARM_r0 = arg1; + regs.ARM_r1 = arg2; + regs.ARM_r2 = arg3; + regs.ARM_r3 = arg4; + regs.ARM_r4 = arg5; + regs.ARM_r5 = arg6; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.ARM_r0; + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + unsigned long map; + int err; + + if (offset & ~PAGE_MASK) + return 0; + + err = compel_syscall(ctl, __NR_mmap2, &map, + (unsigned long)addr, length, prot, flags, fd, offset >> 12); + if (err < 0 || map > ctl->ictx.task_size) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->ARM_pc = new_ip; + if (stack) + regs->ARM_sp = (unsigned long)stack; + + /* Make sure flags are in known state */ + regs->ARM_cpsr &= PSR_f | PSR_s | PSR_x | MODE32_BIT; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here + */ + return true; +} diff --git a/compel/arch/ppc64/src/lib/include/ptrace.h b/compel/arch/ppc64/src/lib/include/ptrace.h deleted file mode 100644 index 0274c2675..000000000 --- a/compel/arch/ppc64/src/lib/include/ptrace.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __COMPEL_PTRACE_H__ -#define
__COMPEL_PTRACE_H__ -#define ARCH_SI_TRAP TRAP_BRKPT - -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} - -static inline int ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} - -#endif diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..1ab89af76 --- /dev/null +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..f243def73 --- /dev/null +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,110 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include + +#define SIGMAX_OLD 31 +#define SIGMAX 64 + +/* + * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h + */ +typedef struct { + unsigned long gpr[32]; + unsigned long nip; + unsigned long msr; + unsigned long orig_gpr3; /* Used for restarting system calls */ + unsigned long ctr; + unsigned long link; + unsigned long xer; + unsigned long ccr; + unsigned long softe; /* Soft enabled/disabled */ + unsigned long trap; /* Reason for being here */ + /* + * N.B. for critical exceptions on 4xx, the dar and dsisr + * fields are overloaded to hold srr0 and srr1. 
+ */ + unsigned long dar; /* Fault registers */ + unsigned long dsisr; /* on 4xx/Book-E used for ESR */ + unsigned long result; /* Result of a system call */ +} user_regs_struct_t; + +#define NVSXREG 32 + +#define USER_FPREGS_FL_FP 0x00001 +#define USER_FPREGS_FL_ALTIVEC 0x00002 +#define USER_FPREGS_FL_VSX 0x00004 +#define USER_FPREGS_FL_TM 0x00010 + +#ifndef NT_PPC_TM_SPR +# define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +# define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +# define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +# define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +# define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#endif + +#define MSR_TMA (1UL<<34) /* bit 29 Trans Mem state: Transactional */ +#define MSR_TMS (1UL<<33) /* bit 30 Trans Mem state: Suspended */ +#define MSR_TM (1UL<<32) /* bit 31 Trans Mem Available */ +#define MSR_VEC (1UL<<25) +#define MSR_VSX (1UL<<23) + +#define MSR_TM_ACTIVE(x) ((((x) & MSR_TM) && ((x)&(MSR_TMA|MSR_TMS))) != 0) + +typedef struct { + uint64_t fpregs[NFPREG]; + __vector128 vrregs[NVRREG]; + uint64_t vsxregs[NVSXREG]; + + int flags; + struct tm_regs { + int flags; + struct { + uint64_t tfhar, texasr, tfiar; + } tm_spr_regs; + user_regs_struct_t regs; + uint64_t fpregs[NFPREG]; + __vector128 vrregs[NVRREG]; + uint64_t vsxregs[NVSXREG]; + } tm; +} user_fpregs_struct_t; + +#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) +#define REG_IP(regs) ((uint64_t)(regs).nip) +#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) + +#define user_regs_native(pregs) true + +/* + * Copied from the following kernel header files : + * include/linux/auxvec.h + * arch/powerpc/include/uapi/asm/auxvec.h + * include/linux/mm_types.h + */ +#define AT_VECTOR_SIZE_BASE 20 +#define AT_VECTOR_SIZE_ARCH 6 +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) + +typedef uint64_t auxv_t; + +/* Not used but the structure parasite_dump_thread needs a tls_t 
field */ +typedef uint64_t tls_t; + +/* + * Copied for the Linux kernel arch/powerpc/include/asm/processor.h + * + * NOTE: 32bit tasks are not supported. + */ +#define TASK_SIZE_USER64 (0x0000400000000000UL) +#define TASK_SIZE TASK_SIZE_USER64 + +static inline unsigned long task_size(void) { return TASK_SIZE; } + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) __NR_##syscall + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c new file mode 100644 index 000000000..959098b8c --- /dev/null +++ b/compel/arch/ppc64/src/lib/infect.c @@ -0,0 +1,318 @@ +#include +#include +#include +#include +#include +#include +#include "uapi/compel/asm/infect-types.h" +#include "errno.h" +#include "log.h" +#include "common/bug.h" +#include "infect.h" +#include "infect-priv.h" + +#ifndef NT_PPC_TM_SPR +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#endif + +/* + * Injected syscall instruction + */ +const uint32_t code_syscall[] = { + 0x44000002, /* sc */ + 0x0fe00000 /* twi 31,0,0 */ +}; + +static inline void __check_code_syscall(void) +{ + BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +/* This is the layout of the POWER7 VSX registers and the way they + * overlap with the existing FPR and VMX registers. + * + * VSR doubleword 0 VSR doubleword 1 + * ---------------------------------------------------------------- + * VSR[0] | FPR[0] | | + * ---------------------------------------------------------------- + * VSR[1] | FPR[1] | | + * ---------------------------------------------------------------- + * | ... 
| | + * ---------------------------------------------------------------- + * VSR[30] | FPR[30] | | + * ---------------------------------------------------------------- + * VSR[31] | FPR[31] | | + * ---------------------------------------------------------------- + * VSR[32] | VR[0] | + * ---------------------------------------------------------------- + * VSR[33] | VR[1] | + * ---------------------------------------------------------------- + * | ... | + * ---------------------------------------------------------------- + * VSR[62] | VR[30] | + * ---------------------------------------------------------------- + * VSR[63] | VR[31] | + * ---------------------------------------------------------------- + * + * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR + * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE + * PTRACE_GETVSRREGS returns VSR[0..31] + * + * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need + * to save FPSCR too. + * + * There 32 VSX double word registers to save since the 32 first VSX double + * word registers are saved through FPR[0..32] and the remaining registers + * are saved when saving the Altivec registers VR[0..32]. + */ + +static int get_fpu_regs(pid_t pid, user_fpregs_struct_t *fp) +{ + if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fp->fpregs) < 0) { + pr_perror("Couldn't get floating-point registers"); + return -1; + } + fp->flags |= USER_FPREGS_FL_FP; + + return 0; +} + +static int get_altivec_regs(pid_t pid, user_fpregs_struct_t *fp) +{ + if (ptrace(PTRACE_GETVRREGS, pid, 0, (void*)&fp->vrregs) < 0) { + /* PTRACE_GETVRREGS returns EIO if Altivec is not supported. + * This should not happen if msr_vec is set. 
*/ + if (errno != EIO) { + pr_perror("Couldn't get Altivec registers"); + return -1; + } + pr_debug("Altivec not supported\n"); + } + else { + pr_debug("Dumping Altivec registers\n"); + fp->flags |= USER_FPREGS_FL_ALTIVEC; + } + return 0; +} + +/* + * Since the FPR[0-31] is stored in the first double word of VSR[0-31] and + * FPR are saved through the FP state, there is no need to save the upper part + * of the first 32 VSX registers. + * Furthermore, the 32 last VSX registers are also the 32 Altivec registers + * already saved, so no need to save them. + * As a consequence, only the doubleword 1 of the 32 first VSX registers have + * to be saved (the ones returned by PTRACE_GETVSRREGS). + */ +static int get_vsx_regs(pid_t pid, user_fpregs_struct_t *fp) +{ + if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void*)fp->vsxregs) < 0) { + /* + * EIO is returned in the case PTRACE_GETVSRREGS is not + * supported. + */ + if (errno != EIO) { + pr_perror("Couldn't get VSX registers"); + return -1; + } + pr_debug("VSX register's dump not supported.\n"); + } + else { + pr_debug("Dumping VSX registers\n"); + fp->flags |= USER_FPREGS_FL_VSX; + } + return 0; +} + +static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) +{ + struct iovec iov; + + pr_debug("Dumping TM registers\n"); + +#define TM_REQUIRED 0 +#define TM_OPTIONAL 1 +#define PTRACE_GET_TM(s,n,c,u) do { \ + iov.iov_base = &s; \ + iov.iov_len = sizeof(s); \ + if (ptrace(PTRACE_GETREGSET, pid, c, &iov)) { \ + if (!u || errno != EIO) { \ + pr_perror("Couldn't get TM "n); \ + pr_err("Your kernel seems to not support the " \ + "new TM ptrace API (>= 4.8)\n"); \ + goto out_free; \ + } \ + pr_debug("TM "n" not supported.\n"); \ + iov.iov_base = NULL; \ + } \ +} while(0) + + /* Get special registers */ + PTRACE_GET_TM(fpregs->tm.tm_spr_regs, "SPR", NT_PPC_TM_SPR, TM_REQUIRED); + + /* Get checkpointed regular registers */ + PTRACE_GET_TM(fpregs->tm.regs, "GPR", NT_PPC_TM_CGPR, TM_REQUIRED); + + /* Get checkpointed FP
registers */ + PTRACE_GET_TM(fpregs->tm.fpregs, "FPR", NT_PPC_TM_CFPR, TM_OPTIONAL); + if (iov.iov_base) + fpregs->tm.flags |= USER_FPREGS_FL_FP; + + /* Get checkpointed VMX (Altivec) registers */ + PTRACE_GET_TM(fpregs->tm.vrregs, "VMX", NT_PPC_TM_CVMX, TM_OPTIONAL); + if (iov.iov_base) + fpregs->tm.flags |= USER_FPREGS_FL_ALTIVEC; + + /* Get checkpointed VSX registers */ + PTRACE_GET_TM(fpregs->tm.vsxregs, "VSX", NT_PPC_TM_CVSX, TM_OPTIONAL); + if (iov.iov_base) + fpregs->tm.flags |= USER_FPREGS_FL_VSX; + + return 0; + +out_free: + return -1; /* still failing the checkpoint */ +} + +static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs) +{ + pr_info("Dumping GP/FPU registers for %d\n", pid); + + /* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ +#ifndef TRAP +#define TRAP(r) ((r).trap & ~0xF) +#endif + + if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { + /* Restart the system call */ + switch (regs->gpr[3]) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + break; + case ERESTART_RESTARTBLOCK: + regs->gpr[0] = __NR_restart_syscall; + regs->nip -= 4; + break; + } + } + + /* Resetting trap since we are now coming from user space. */ + regs->trap = 0; + + fpregs->flags = 0; + /* + * Check for Transactional Memory operation in progress. + * Until we have support of TM register's state through the ptrace API, + * we can't checkpoint process with TM operation in progress (almost + * impossible) or suspended (easy to get). + */ + if (MSR_TM_ACTIVE(regs->msr)) { + pr_debug("Task %d has %s TM operation at 0x%lx\n", + pid, + (regs->msr & MSR_TMS) ?
"a suspended" : "an active", + regs->nip); + if (get_tm_regs(pid, fpregs)) + return -1; + fpregs->flags = USER_FPREGS_FL_TM; + } + + if (get_fpu_regs(pid, fpregs)) + return -1; + + if (get_altivec_regs(pid, fpregs)) + return -1; + + if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { + /* + * Save the VSX registers if Altivec registers are supported + */ + if (get_vsx_regs(pid, fpregs)) + return -1; + } + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t save, void *arg) +{ + user_fpregs_struct_t fpregs; + int ret; + + ret = __get_task_regs(pid, &regs, &fpregs); + if (ret) + return ret; + + return save(arg, &regs, &fpregs); +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.gpr[0] = (unsigned long)nr; + regs.gpr[3] = arg1; + regs.gpr[4] = arg2; + regs.gpr[5] = arg3; + regs.gpr[6] = arg4; + regs.gpr[7] = arg5; + regs.gpr[8] = arg6; + + err = compel_execute_syscall(ctl, &regs, (char*)code_syscall); + + *ret = regs.gpr[3]; + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + unsigned long map = 0; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, + (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + /* + * OpenPOWER ABI requires that r12 is set to the calling function address + * to compute the TOC pointer. + */ + regs->gpr[12] = new_ip; + regs->nip = new_ip; + if (stack) + regs->gpr[1] = (unsigned long) stack; + regs->trap = 0; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: We should detect 32bit task when BE support is done.
+ */ + return true; +} diff --git a/compel/arch/x86/src/lib/include/ptrace.h b/compel/arch/x86/src/lib/include/ptrace.h deleted file mode 100644 index 844ea0efd..000000000 --- a/compel/arch/x86/src/lib/include/ptrace.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __COMPEL_PTRACE_H__ -#define __COMPEL_PTRACE_H__ -#define ARCH_SI_TRAP SI_KERNEL -extern int ptrace_set_breakpoint(pid_t pid, void *addr); -extern int ptrace_flush_breakpoints(pid_t pid); -#endif diff --git a/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..980f25d06 --- /dev/null +++ b/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP SI_KERNEL +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..bbc6bcf22 --- /dev/null +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,159 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include "log.h" +#include "common/bug.h" +#include "common/page.h" +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +typedef struct { + uint64_t r15; + uint64_t r14; + uint64_t r13; + uint64_t r12; + uint64_t bp; + uint64_t bx; + uint64_t r11; + uint64_t r10; + uint64_t r9; + uint64_t r8; + uint64_t ax; + uint64_t cx; + uint64_t dx; + uint64_t si; + uint64_t di; + uint64_t orig_ax; + uint64_t ip; + uint64_t cs; + uint64_t flags; + uint64_t sp; + uint64_t ss; + uint64_t fs_base; + uint64_t gs_base; + uint64_t ds; + uint64_t es; + uint64_t fs; + uint64_t gs; +} user_regs_struct64; + +typedef struct { + uint32_t bx; + uint32_t cx; + uint32_t dx; + uint32_t si; + uint32_t di; + 
uint32_t bp; + uint32_t ax; + uint32_t ds; + uint32_t es; + uint32_t fs; + uint32_t gs; + uint32_t orig_ax; + uint32_t ip; + uint32_t cs; + uint32_t flags; + uint32_t sp; + uint32_t ss; +} user_regs_struct32; + +#ifdef CONFIG_X86_64 +/* + * To be sure that we rely on inited reg->__is_native, this member + * is (short int) instead of initial (bool). The right way to + * check if regs are native or compat is to use user_regs_native() macro. + * This should cost nothing, as *usually* sizeof(bool) == sizeof(short) + */ +typedef struct { + union { + user_regs_struct64 native; + user_regs_struct32 compat; + }; + short __is_native; /* use user_regs_native macro to check it */ +} user_regs_struct_t; + +#define NATIVE_MAGIC 0x0A +#define COMPAT_MAGIC 0x0C +static inline bool user_regs_native(user_regs_struct_t *pregs) +{ + return pregs->__is_native == NATIVE_MAGIC; +} + +#define get_user_reg(pregs, name) \ + ((user_regs_native(pregs)) ? \ + ((pregs)->native.name) : \ + ((pregs)->compat.name)) + +#define set_user_reg(pregs, name, val) \ + ((user_regs_native(pregs)) ? 
\ + ((pregs)->native.name = (val)) : \ + ((pregs)->compat.name = (val))) +#else +typedef struct { + union { + user_regs_struct32 native; + }; +} user_regs_struct_t; +#define user_regs_native(pregs) true +#define get_user_reg(pregs, name) ((pregs)->native.name) +#define set_user_reg(pregs, name, val) ((pregs)->native.name = val) +#endif + +#if 0 +typedef struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; /* Note this is not the same as + the 32bit/x87/FSAVE twd */ + unsigned short fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + u32 padding[24]; +} user_fpregs_struct_t; +#endif + +typedef struct xsave_struct user_fpregs_struct_t; + +#ifdef CONFIG_X86_64 +# define TASK_SIZE ((1UL << 47) - PAGE_SIZE) +#else +/* + * Task size may be limited to 3G but we need a + * higher limit, because it's backward compatible. + */ +# define TASK_SIZE (0xffffe000) +#endif + +static inline unsigned long task_size(void) { return TASK_SIZE; } + +typedef uint64_t auxv_t; + +/* + * Linux preserves three TLS segments in GDT. + * Offsets in GDT differ between 32-bit and 64-bit machines. + * For 64-bit x86 those GDT offsets are the same + * for native and compat tasks. 
+ */ +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 +#define GDT_ENTRY_TLS_NUM 3 +typedef struct { + user_desc_t desc[GDT_ENTRY_TLS_NUM]; +} tls_t; + +#define REG_RES(regs) get_user_reg(&regs, ax) +#define REG_IP(regs) get_user_reg(&regs, ip) +#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax) + +#define AT_VECTOR_SIZE 44 + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c new file mode 100644 index 000000000..53cae1dc5 --- /dev/null +++ b/compel/arch/x86/src/lib/infect.c @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include "asm/cpu.h" + +#include +#include +#include "errno.h" +#include +#include +#include "asm/ptrace.h" +#include "common/err.h" +#include "asm/infect-types.h" +#include "uapi/compel/ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x0f, 0x05, /* syscall */ + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ +}; + +const char code_int_80[] = { + 0xcd, 0x80, /* int $0x80 */ + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ +}; + +static const int +code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); +static const int +code_int_80_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline __always_unused void __check_code_syscall(void) +{ + BUILD_BUG_ON(code_int_80_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +#define get_signed_user_reg(pregs, name) \ + ((user_regs_native(pregs)) ? 
(int64_t)((pregs)->native.name) : \ + (int32_t)((pregs)->compat.name)) + +int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t save, void *arg) +{ + user_fpregs_struct_t xsave = { }, *xs = NULL; + + struct iovec iov; + int ret = -1; + + pr_info("Dumping general registers for %d in %s mode\n", pid, + user_regs_native(&regs) ? "native" : "compat"); + + /* Did we come from a system call? */ + if (get_signed_user_reg(&regs, orig_ax) >= 0) { + /* Restart the system call */ + switch (get_signed_user_reg(&regs, ax)) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + set_user_reg(&regs, ax, get_user_reg(&regs, orig_ax)); + set_user_reg(&regs, ip, get_user_reg(&regs, ip) - 2); + break; + case -ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + set_user_reg(&regs, ax, -EINTR); + break; + } + } + +#ifndef PTRACE_GETREGSET +# define PTRACE_GETREGSET 0x4204 +#endif + + if (!cpu_has_feature(X86_FEATURE_FPU)) + goto out; + + /* + * FPU fetched either via fxsave or via xsave, + * thus decode it accordingly. 
+ */ + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (cpu_has_feature(X86_FEATURE_OSXSAVE)) { + iov.iov_base = &xsave; + iov.iov_len = sizeof(xsave); + + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) { + pr_perror("Can't obtain FPU registers for %d", pid); + goto err; + } + } else { + if (ptrace(PTRACE_GETFPREGS, pid, NULL, &xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + goto err; + } + } + + xs = &xsave; +out: + ret = save(arg, &regs, xs); +err: + return ret; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + if (user_regs_native(&regs)) { + user_regs_struct64 *r = &regs.native; + + r->ax = (uint64_t)nr; + r->di = arg1; + r->si = arg2; + r->dx = arg3; + r->r10 = arg4; + r->r8 = arg5; + r->r9 = arg6; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + } else { + user_regs_struct32 *r = &regs.compat; + + r->ax = (uint32_t)nr; + r->bx = arg1; + r->cx = arg2; + r->dx = arg3; + r->si = arg4; + r->di = arg5; + r->bp = arg6; + + err = compel_execute_syscall(ctl, &regs, code_int_80); + } + + *ret = get_user_reg(&regs, ax); + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + unsigned long map; + int err; + bool compat_task = !user_regs_native(&ctl->orig.regs); + + err = compel_syscall(ctl, __NR(mmap, compat_task), &map, + (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0) + return NULL; + + if (IS_ERR_VALUE(map)) { + if (map == -EACCES && (prot & PROT_WRITE) && (prot & PROT_EXEC)) + pr_warn("mmap(PROT_WRITE | PROT_EXEC) failed for %d, " + "check selinux execmem policy\n", ctl->rpid); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from 
original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + set_user_reg(regs, ip, new_ip); + if (stack) + set_user_reg(regs, sp, (unsigned long) stack); + + /* Avoid end of syscall processing */ + set_user_reg(regs, orig_ax, -1); + + /* Make sure flags are in known state */ + set_user_reg(regs, flags, get_user_reg(regs, flags) & + ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF)); +} + +#define USER32_CS 0x23 +#define USER_CS 0x33 + +static bool ldt_task_selectors(pid_t pid) +{ + unsigned long cs; + + errno = 0; + /* + * Offset of register must be from 64-bit set even for + * compatible tasks. Fix this to support native i386 tasks + */ + cs = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct64, cs), 0); + if (errno != 0) { + pr_perror("Can't get CS register for %d", pid); + return -1; + } + + return cs != USER_CS && cs != USER32_CS; +} + +static int arch_task_compatible(pid_t pid) +{ + user_regs_struct_t r; + int ret = ptrace_get_regs(pid, &r); + + if (ret) + return -1; + + return !user_regs_native(&r); +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + pid_t pid = ctl->rpid; + int ret; + + ret = arch_task_compatible(pid); + if (ret < 0) + return false; + + if (ret && !(ctl->ictx.flags & INFECT_HAS_COMPAT_SIGRETURN)) { + pr_err("Can't dump task %d running in 32-bit mode\n", pid); + return false; + } + + if (ldt_task_selectors(pid)) { + pr_err("Can't dump task %d with LDT descriptors\n", pid); + return false; + } + + return true; +} + +/* Copied from the gdb header gdb/nat/x86-dregs.h */ + +/* Debug registers' indices. */ +#define DR_FIRSTADDR 0 +#define DR_LASTADDR 3 +#define DR_NADDR 4 /* The number of debug address registers. */ +#define DR_STATUS 6 /* Index of debug status register (DR6). */ +#define DR_CONTROL 7 /* Index of debug control register (DR7). */ + +#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. 
*/ +#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */ +#define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */ + +/* Locally enable the break/watchpoint in the I'th debug register. */ +#define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i))) + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + int ret; + + /* Set a breakpoint */ + if (ptrace(PTRACE_POKEUSER, pid, + offsetof(struct user, u_debugreg[DR_FIRSTADDR]), + addr)) { + pr_perror("Unable to setup a breakpoint into %d", pid); + return -1; + } + + /* Enable the breakpoint */ + if (ptrace(PTRACE_POKEUSER, pid, + offsetof(struct user, u_debugreg[DR_CONTROL]), + X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) { + pr_perror("Unable to enable the breakpoint for %d", pid); + return -1; + } + + ret = ptrace(PTRACE_CONT, pid, NULL, NULL); + if (ret) { + pr_perror("Unable to restart the stopped tracee process %d", pid); + return -1; + } + + return 1; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + /* Disable the breakpoint */ + if (ptrace(PTRACE_POKEUSER, pid, + offsetof(struct user, u_debugreg[DR_CONTROL]), + 0)) { + pr_perror("Unable to disable the breakpoint for %d", pid); + return -1; + } + + return 0; +} + +int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs) +{ + struct iovec iov; + int ret; + + iov.iov_base = &regs->native; + iov.iov_len = sizeof(user_regs_struct64); + + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); + if (ret == -1) { + pr_perror("PTRACE_GETREGSET failed"); + return -1; + } + + if (iov.iov_len == sizeof(regs->native)) { + regs->__is_native = NATIVE_MAGIC; + return ret; + } + if (iov.iov_len == sizeof(regs->compat)) { + regs->__is_native = COMPAT_MAGIC; + return ret; + } + + pr_err("PTRACE_GETREGSET read %zu bytes for pid %d, but native/compat regs sizes are %zu/%zu bytes", + iov.iov_len, pid, + sizeof(regs->native), sizeof(regs->compat)); + return -1; +} + +int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs) +{ + struct 
iovec iov; + + if (user_regs_native(regs)) { + iov.iov_base = &regs->native; + iov.iov_len = sizeof(user_regs_struct64); + } else { + iov.iov_base = &regs->compat; + iov.iov_len = sizeof(user_regs_struct32); + } + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} diff --git a/compel/include/infect-priv.h b/compel/include/infect-priv.h new file mode 100644 index 000000000..db5259c2c --- /dev/null +++ b/compel/include/infect-priv.h @@ -0,0 +1,59 @@ +#ifndef __COMPEL_INFECT_PRIV_H__ +#define __COMPEL_INFECT_PRIV_H__ + +#include + +#define BUILTIN_SYSCALL_SIZE 8 + +/* parasite control block */ +struct parasite_ctl { + int rpid; /* Real pid of the victim */ + void *remote_map; + void *local_map; + void *sigreturn_addr; /* A place for the breakpoint */ + unsigned long map_length; + + struct infect_ctx ictx; + + /* thread leader data */ + bool daemonized; + + struct thread_ctx orig; + + void *rstack; /* thread leader stack*/ + struct rt_sigframe *sigframe; + struct rt_sigframe *rsigframe; /* address in a parasite */ + + void *r_thread_stack; /* stack for non-leader threads */ + + unsigned long parasite_ip; /* service routine start ip */ + + unsigned int *addr_cmd; /* addr for command */ + void *addr_args; /* address for arguments */ + unsigned long args_size; + int tsock; /* transport socket for transferring fds */ + + struct parasite_blob_desc pblob; +}; + +#define MEMFD_FNAME "CRIUMFD" +#define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME) + +struct ctl_msg; +int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m); + +/* XXX -- remove with cr-exec.c */ +extern int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size); +extern int compel_syscall(struct parasite_ctl *ctl, int nr, unsigned long *ret, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6); + + +extern void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs); +extern void 
*remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset); +extern bool arch_can_dump_task(struct parasite_ctl *ctl); + +#endif diff --git a/compel/include/rpc-pie-priv.h b/compel/include/rpc-pie-priv.h new file mode 100644 index 000000000..3d9091159 --- /dev/null +++ b/compel/include/rpc-pie-priv.h @@ -0,0 +1,48 @@ +#ifndef __COMPEL_RPC_H__ +#define __COMPEL_RPC_H__ +struct ctl_msg { + uint32_t cmd; /* command itself */ + uint32_t ack; /* ack on command */ + int32_t err; /* error code on reply */ +}; + +#define ctl_msg_cmd(_cmd) \ + (struct ctl_msg){.cmd = _cmd, } + +#define ctl_msg_ack(_cmd, _err) \ + (struct ctl_msg){.cmd = _cmd, .ack = _cmd, .err = _err, } + +/* + * NOTE: each command's args should be arch-independed sized. + * If you want to use one of the standard types, declare + * alternative type for compatible tasks in parasite-compat.h + */ +enum { + PARASITE_CMD_IDLE = 0, + PARASITE_CMD_ACK, + + PARASITE_CMD_INIT_DAEMON, + PARASITE_CMD_UNMAP, + + /* + * This must be greater than INITs. 
+ */ + PARASITE_CMD_FINI, + + __PARASITE_END_CMDS, +}; + +struct parasite_init_args { + int32_t h_addr_len; + struct sockaddr_un h_addr; + int32_t log_level; + uint64_t sigreturn_addr; + uint64_t sigframe; /* pointer to sigframe */ + futex_t daemon_connected; +}; + +struct parasite_unmap_args { + uint64_t parasite_start; + uint64_t parasite_len; +}; +#endif diff --git a/compel/include/uapi/compel.h b/compel/include/uapi/compel.h index 278a85455..3554c1599 100644 --- a/compel/include/uapi/compel.h +++ b/compel/include/uapi/compel.h @@ -4,6 +4,8 @@ #include #include +#include + #define COMPEL_TYPE_INT (1u << 0) #define COMPEL_TYPE_LONG (1u << 1) #define COMPEL_TYPE_GOTPCREL (1u << 2) @@ -22,4 +24,8 @@ typedef void (*compel_log_fn)(unsigned int lvl, const char *fmt, va_list parms); extern void compel_log_init(compel_log_fn log_fn, unsigned int level); extern unsigned int compel_log_get_loglevel(void); +#include +#include +#include + #endif /* UAPI_COMPEL_H__ */ diff --git a/compel/include/uapi/infect-rpc.h b/compel/include/uapi/infect-rpc.h new file mode 100644 index 000000000..0176c1142 --- /dev/null +++ b/compel/include/uapi/infect-rpc.h @@ -0,0 +1,17 @@ +#ifndef __COMPEL_INFECT_RPC_H__ +#define __COMPEL_INFECT_RPC_H__ + +#include +#include +#include + +struct parasite_ctl; +extern int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); +extern int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); +extern int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); +extern int compel_rpc_sock(struct parasite_ctl *ctl); + +#define PARASITE_USER_CMDS 64 + + +#endif diff --git a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h new file mode 100644 index 000000000..bd2010c3b --- /dev/null +++ b/compel/include/uapi/infect-util.h @@ -0,0 +1,5 @@ +#ifndef __COMPEL_INFECT_UTIL_H__ +#define __COMPEL_INFECT_UTIL_H__ +struct parasite_ctl; +extern int compel_util_send_fd(struct parasite_ctl *ctl, int fd); +#endif diff --git 
a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h new file mode 100644 index 000000000..38051f437 --- /dev/null +++ b/compel/include/uapi/infect.h @@ -0,0 +1,147 @@ +#ifndef __COMPEL_INFECT_H__ +#define __COMPEL_INFECT_H__ + +#include + +#include +#include +#include +#include + +#include "common/compiler.h" + +#define PARASITE_START_AREA_MIN (4096) + +extern int compel_stop_task(int pid); + +struct seize_task_status { + char state; + int ppid; + unsigned long long sigpnd; + unsigned long long shdpnd; + int seccomp_mode; +}; + +extern int compel_wait_task(int pid, int ppid, + int (*get_status)(int pid, struct seize_task_status *), + struct seize_task_status *st); +extern int compel_unseize_task(pid_t pid, int orig_state, int state); + +/* + * FIXME -- these should be mapped to pid.h's + */ + +#define TASK_ALIVE 0x1 +#define TASK_DEAD 0x2 +#define TASK_STOPPED 0x3 +#define TASK_ZOMBIE 0x6 + +struct parasite_ctl; +struct thread_ctx { + k_rtsigset_t sigmask; + user_regs_struct_t regs; +}; + +extern struct parasite_ctl *compel_prepare(int pid); +extern int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); +extern int compel_prepare_thread(int pid, struct thread_ctx *ctx); + +extern int compel_stop_daemon(struct parasite_ctl *ctl); +extern int compel_cure_remote(struct parasite_ctl *ctl); +extern int compel_cure_local(struct parasite_ctl *ctl); +extern int compel_cure(struct parasite_ctl *ctl); + +#define PARASITE_ARG_SIZE_MIN ( 1 << 12) + +#define compel_parasite_args(ctl, type) \ + ({ \ + void *___ret; \ + BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \ + ___ret = compel_parasite_args_p(ctl); \ + ___ret; \ + }) + +extern void *compel_parasite_args_p(struct parasite_ctl *ctl); +extern void *compel_parasite_args_s(struct parasite_ctl *ctl, int args_size); + +extern int compel_execute_syscall(struct parasite_ctl *ctl, + user_regs_struct_t *regs, const char *code_syscall); +extern int compel_run_in_thread(pid_t 
pid, unsigned int cmd, + struct parasite_ctl *ctl, + struct thread_ctx *octx); + +/* + * The PTRACE_SYSCALL will trap task twice -- on + * enter into and on exit from syscall. If we trace + * a single task, we may skip half of all getregs + * calls -- on exit we don't need them. + */ +enum trace_flags { + TRACE_ALL, + TRACE_ENTER, + TRACE_EXIT, +}; + +extern int compel_stop_on_syscall(int tasks, int sys_nr, + int sys_nr_compat, enum trace_flags trace); + +extern int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); + +extern int compel_unmap(struct parasite_ctl *ctl, unsigned long addr); + +extern int compel_mode_native(struct parasite_ctl *ctl); + +extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); + +struct rt_sigframe; + +typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) + __attribute__ ((__format__ (__printf__, 3, 4))); + +struct infect_ctx { + int *p_sock; + + /* + * Regs manipulation context. + */ + int (*save_regs)(void *, user_regs_struct_t *, user_fpregs_struct_t *); + int (*make_sigframe)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); + void *regs_arg; + + unsigned long task_size; + unsigned long syscall_ip; /* entry point of infection */ + unsigned long flags; /* fine-tune (e.g. 
faults) */ + + void (*child_handler)(int, siginfo_t *, void *); /* hander for SIGCHLD deaths */ + + open_proc_fn open_proc; + + int log_fd; /* fd for parasite code to send messages to */ +}; + +extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); + +#define INFECT_NO_MEMFD 0x1 /* don't use memfd() */ +#define INFECT_FAIL_CONNECT 0x2 /* make parasite connect() fail */ +#define INFECT_NO_BREAKPOINTS 0x4 /* no breakpoints in pie tracking */ +#define INFECT_HAS_COMPAT_SIGRETURN 0x8 + +struct parasite_blob_desc { + const void *mem; + size_t bsize; /* size of the blob */ + size_t size; /* size of the blob with relocs */ + unsigned long parasite_ip_off; + unsigned long addr_cmd_off; + unsigned long addr_arg_off; + compel_reloc_t *relocs; + unsigned int nr_relocs; +}; + +extern struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *); + +typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int compel_get_task_regs(pid_t pid, user_regs_struct_t regs, save_regs_t, void *); + +extern void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs); + +#endif diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h new file mode 100644 index 000000000..2ab9e1c76 --- /dev/null +++ b/compel/include/uapi/ptrace.h @@ -0,0 +1,76 @@ +#ifndef UAPI_COMPEL_PTRACE_H__ +#define UAPI_COMPEL_PTRACE_H__ + +#include +#include + +#include +#include + +/* some constants for ptrace */ +#ifndef PTRACE_SEIZE +# define PTRACE_SEIZE 0x4206 +#endif + +#ifndef PTRACE_O_SUSPEND_SECCOMP +# define PTRACE_O_SUSPEND_SECCOMP (1 << 21) +#endif + +#ifndef PTRACE_INTERRUPT +# define PTRACE_INTERRUPT 0x4207 +#endif + +#ifndef PTRACE_LISTEN +#define PTRACE_LISTEN 0x4208 +#endif + +#ifndef PTRACE_PEEKSIGINFO +#define PTRACE_PEEKSIGINFO 0x4209 + +/* Read signals from a shared (process wide) queue */ +#define PTRACE_PEEKSIGINFO_SHARED (1 << 0) +#endif + +#ifndef PTRACE_GETREGSET +# 
define PTRACE_GETREGSET 0x4204 +# define PTRACE_SETREGSET 0x4205 +#endif + +#ifndef PTRACE_GETSIGMASK +# define PTRACE_GETSIGMASK 0x420a +# define PTRACE_SETSIGMASK 0x420b +#endif + +#ifndef PTRACE_SECCOMP_GET_FILTER +#define PTRACE_SECCOMP_GET_FILTER 0x420c +#endif + +#define PTRACE_SEIZE_DEVEL 0x80000000 + +#define PTRACE_EVENT_FORK 1 +#define PTRACE_EVENT_VFORK 2 +#define PTRACE_EVENT_CLONE 3 +#define PTRACE_EVENT_EXEC 4 +#define PTRACE_EVENT_VFORK_DONE 5 +#define PTRACE_EVENT_EXIT 6 +#define PTRACE_EVENT_STOP 128 + +#define PTRACE_O_TRACESYSGOOD 0x00000001 +#define PTRACE_O_TRACEFORK 0x00000002 +#define PTRACE_O_TRACEVFORK 0x00000004 +#define PTRACE_O_TRACECLONE 0x00000008 +#define PTRACE_O_TRACEEXEC 0x00000010 +#define PTRACE_O_TRACEVFORKDONE 0x00000020 +#define PTRACE_O_TRACEEXIT 0x00000040 + +#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8) + +extern int suspend_seccomp(pid_t pid); +extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); +extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); +extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); + +extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); +extern int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs); + +#endif /* UAPI_COMPEL_PTRACE_H__ */ diff --git a/compel/src/lib/infect-rpc.c b/compel/src/lib/infect-rpc.c new file mode 100644 index 000000000..265a4ad2f --- /dev/null +++ b/compel/src/lib/infect-rpc.c @@ -0,0 +1,101 @@ +#include "log.h" +#include "common/bug.h" +#include "common/xmalloc.h" +#include "common/lock.h" + +#include "infect.h" +#include "infect-priv.h" +#include "infect-rpc.h" +#include "rpc-pie-priv.h" + +static int __parasite_send_cmd(int sockfd, struct ctl_msg *m) +{ + int ret; + + BUILD_BUG_ON(PARASITE_USER_CMDS < __PARASITE_END_CMDS); + + ret = send(sockfd, m, sizeof(*m), 0); + if (ret == -1) { + pr_perror("Failed to send command %d to daemon", m->cmd); + return -1; + } else if (ret != sizeof(*m)) { 
+ pr_err("Message to daemon is trimmed (%d/%d)\n", + (int)sizeof(*m), ret); + return -1; + } + + pr_debug("Sent msg to daemon %d %d %d\n", m->cmd, m->ack, m->err); + return 0; +} + +int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m) +{ + int ret; + + pr_debug("Wait for ack %d on daemon socket\n", cmd); + + while (1) { + memzero(m, sizeof(*m)); + + ret = recv(sockfd, m, sizeof(*m), MSG_WAITALL); + if (ret == -1) { + pr_perror("Failed to read ack"); + return -1; + } else if (ret != sizeof(*m)) { + pr_err("Message reply from daemon is trimmed (%d/%d)\n", + (int)sizeof(*m), ret); + return -1; + } + pr_debug("Fetched ack: %d %d %d\n", + m->cmd, m->ack, m->err); + + if (m->cmd != cmd || m->ack != cmd) { + pr_err("Communication error, this is not " + "the ack we expected\n"); + return -1; + } + return 0; + } + + return -1; +} + +int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl) +{ + struct ctl_msg m; + + if (parasite_wait_ack(ctl->tsock, cmd, &m)) + return -1; + + if (m.err != 0) { + pr_err("Command %d for daemon failed with %d\n", + cmd, m.err); + return -1; + } + + return 0; +} + +int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl) +{ + struct ctl_msg m; + + m = ctl_msg_cmd(cmd); + return __parasite_send_cmd(ctl->tsock, &m); +} + +int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl) +{ + int ret; + + ret = compel_rpc_call(cmd, ctl); + if (!ret) + ret = compel_rpc_sync(cmd, ctl); + + return ret; +} + +int compel_rpc_sock(struct parasite_ctl *ctl) +{ + return ctl->tsock; +} diff --git a/compel/src/lib/infect-util.c b/compel/src/lib/infect-util.c new file mode 100644 index 000000000..99dbee062 --- /dev/null +++ b/compel/src/lib/infect-util.c @@ -0,0 +1,21 @@ +#include "log.h" +#include "common/bug.h" +#include "common/lock.h" + +#include "uapi/compel/plugins/plugin-fds.h" + +#include "infect-rpc.h" +#include "infect-util.h" + +int compel_util_send_fd(struct parasite_ctl *ctl, int fd) +{ + int sk; + + sk = 
compel_rpc_sock(ctl); + if (send_fd(sk, NULL, 0, fd) < 0) { + pr_perror("Can't send file descriptor"); + return -1; + } + return 0; +} + diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c new file mode 100644 index 000000000..6a3724636 --- /dev/null +++ b/compel/src/lib/infect.c @@ -0,0 +1,1262 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "common/bug.h" +#include "common/xmalloc.h" +#include "common/lock.h" +#include "common/page.h" + +#include +#include +#include "asm/ptrace.h" +#include "uapi/compel/plugins/std/syscall.h" +#include "asm/infect-types.h" +#include "asm/sigframe.h" +#include "infect.h" +#include "uapi/compel/ptrace.h" +#include "infect-rpc.h" +#include "infect-priv.h" +#include "infect-util.h" +#include "rpc-pie-priv.h" + +#define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \ + (size_t)((struct sockaddr_un *) 0)->sun_path) + +#define PARASITE_STACK_SIZE (16 << 10) + +#define PTRACE_EVENT_STOP 128 + +#ifndef SECCOMP_MODE_DISABLED +#define SECCOMP_MODE_DISABLED 0 +#endif + +#ifndef PTRACE_O_SUSPEND_SECCOMP +# define PTRACE_O_SUSPEND_SECCOMP (1 << 21) +#endif + +#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8) + +static inline void close_safe(int *pfd) +{ + if (*pfd > -1) { + close(*pfd); + *pfd = -1; + } +} + +int compel_stop_task(int pid) +{ + int ret; + + ret = ptrace(PTRACE_SEIZE, pid, NULL, 0); + if (ret) { + /* + * ptrace API doesn't allow to distinguish + * attaching to zombie from other errors. + * All errors will be handled in compel_wait_task(). + */ + pr_warn("Unable to interrupt task: %d (%s)\n", pid, strerror(errno)); + return ret; + } + + /* + * If we SEIZE-d the task stop it before going + * and reading its stat from proc. Otherwise task + * may die _while_ we're doing it and we'll have + * inconsistent seize/state pair. + * + * If task dies after we seize it but before we + * do this interrupt, we'll notice it via proc. 
+ */ + ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL); + if (ret < 0) { + pr_warn("SEIZE %d: can't interrupt task: %s", pid, strerror(errno)); + if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) + pr_perror("Unable to detach from %d", pid); + } + + return ret; +} + +static int skip_sigstop(int pid, int nr_signals) +{ + int i, status, ret; + + /* + * 1) SIGSTOP is queued, but isn't handled yet: + * SGISTOP can't be blocked, so we need to wait when the kernel + * handles this signal. + * + * Otherwise the process will be stopped immediately after + * starting it. + * + * 2) A seized task was stopped: + * PTRACE_SEIZE doesn't affect signal or group stop state. + * Currently ptrace reported that task is in stopped state. + * We need to start task again, and it will be trapped + * immediately, because we sent PTRACE_INTERRUPT to it. + */ + for (i = 0; i < nr_signals; i++) { + ret = ptrace(PTRACE_CONT, pid, 0, 0); + if (ret) { + pr_perror("Unable to start process"); + return -1; + } + + ret = wait4(pid, &status, __WALL, NULL); + if (ret < 0) { + pr_perror("SEIZE %d: can't wait task", pid); + return -1; + } + + if (!WIFSTOPPED(status)) { + pr_err("SEIZE %d: task not stopped after seize\n", pid); + return -1; + } + } + return 0; +} + +static int do_suspend_seccomp(pid_t pid) +{ + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + pr_perror("suspending seccomp failed"); + return -1; + } + + return 0; +} + +/* + * This routine seizes task putting it into a special + * state where we can manipulate the task via ptrace + * interface, and finally we can detach ptrace out of + * of it so the task would not know if it was saddled + * up with someone else. 
+ */ +int compel_wait_task(int pid, int ppid, + int (*get_status)(int pid, struct seize_task_status *), + struct seize_task_status *ss) +{ + siginfo_t si; + int status, nr_sigstop; + int ret = 0, ret2, wait_errno = 0; + + /* + * It's ugly, but the ptrace API doesn't allow to distinguish + * attaching to zombie from other errors. Thus we have to parse + * the target's /proc/pid/stat. Sad, but parse whatever else + * we might need at that early point. + */ + +try_again: + + ret = wait4(pid, &status, __WALL, NULL); + if (ret < 0) { + /* + * wait4() can expectedly fail only in a first time + * if a task is zombie. If we are here from try_again, + * this means that we are tracing this task. + * + * So here we can be only once in this function. + */ + wait_errno = errno; + } + + ret2 = get_status(pid, ss); + if (ret2) + goto err; + + if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) { + if (ss->state != 'Z') { + if (pid == getpid()) + pr_err("The criu itself is within dumped tree.\n"); + else + pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n", + pid, ss->state, ret, wait_errno); + return -1; + } + + if (ret < 0) + return TASK_ZOMBIE; + else + return TASK_DEAD; + } + + if ((ppid != -1) && (ss->ppid != ppid)) { + pr_err("Task pid reused while suspending (%d: %d -> %d)\n", + pid, ppid, ss->ppid); + goto err; + } + + if (!WIFSTOPPED(status)) { + pr_err("SEIZE %d: task not stopped after seize\n", pid); + goto err; + } + + ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si); + if (ret < 0) { + pr_perror("SEIZE %d: can't read signfo", pid); + goto err; + } + + if (SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) { + /* + * Kernel notifies us about the task being seized received some + * event other than the STOP, i.e. -- a signal. Let the task + * handle one and repeat. 
+ */ + + if (ptrace(PTRACE_CONT, pid, NULL, + (void *)(unsigned long)si.si_signo)) { + pr_perror("Can't continue signal handling, aborting"); + goto err; + } + + ret = 0; + goto try_again; + } + + if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && do_suspend_seccomp(pid) < 0) + goto err; + + nr_sigstop = 0; + if (ss->sigpnd & (1 << (SIGSTOP - 1))) + nr_sigstop++; + if (ss->shdpnd & (1 << (SIGSTOP - 1))) + nr_sigstop++; + if (si.si_signo == SIGSTOP) + nr_sigstop++; + + if (nr_sigstop) { + if (skip_sigstop(pid, nr_sigstop)) + goto err_stop; + + return TASK_STOPPED; + } + + if (si.si_signo == SIGTRAP) + return TASK_ALIVE; + else { + pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo); + goto err; + } + +err_stop: + kill(pid, SIGSTOP); +err: + if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) + pr_perror("Unable to detach from %d", pid); + return -1; +} + +int compel_unseize_task(pid_t pid, int orig_st, int st) +{ + pr_debug("\tUnseizing %d into %d\n", pid, st); + + if (st == TASK_DEAD) { + kill(pid, SIGKILL); + return 0; + } else if (st == TASK_STOPPED) { + /* + * Task might have had STOP in queue. We detected such + * guy as TASK_STOPPED, but cleared signal to run the + * parasite code. hus after detach the task will become + * running. That said -- STOP everyone regardless of + * the initial state. + */ + kill(pid, SIGSTOP); + } else if (st == TASK_ALIVE) { + /* + * Same as in the comment above -- there might be a + * task with STOP in queue that would get lost after + * detach, so stop it again. 
+ */ + if (orig_st == TASK_STOPPED) + kill(pid, SIGSTOP); + } else + pr_err("Unknown final state %d\n", st); + + if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) { + pr_perror("Unable to detach from %d", pid); + return -1; + } + + return 0; +} + +static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) +{ + int sun_len; + + saddr->sun_family = AF_UNIX; + snprintf(saddr->sun_path, UNIX_PATH_MAX, + "X/crtools-pr-%d", key); + + sun_len = SUN_LEN(saddr); + *saddr->sun_path = '\0'; + + return sun_len; +} + +static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid, + struct parasite_init_args *args) +{ + static int ssock = -1; + + pr_info("Putting tsock into pid %d\n", pid); + args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid()); + + if (ssock == -1) { + ssock = *ctl->ictx.p_sock; + if (ssock == -1) { + pr_err("No socket in ictx\n"); + goto err; + } + + *ctl->ictx.p_sock = -1; + + if (bind(ssock, (struct sockaddr *)&args->h_addr, args->h_addr_len) < 0) { + pr_perror("Can't bind socket"); + goto err; + } + + if (listen(ssock, 1)) { + pr_perror("Can't listen on transport socket"); + goto err; + } + } + + /* Check a case when parasite can't initialize a command socket */ + if (ctl->ictx.flags & INFECT_FAIL_CONNECT) + args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid() + 1); + + /* + * Set to -1 to prevent any accidental misuse. The + * only valid user of it is accept_tsock(). + */ + ctl->tsock = -ssock; + return 0; +err: + close_safe(&ssock); + return -1; +} + +static int setup_child_handler(struct parasite_ctl *ctl) +{ + struct sigaction sa = { + .sa_sigaction = ctl->ictx.child_handler, + .sa_flags = SA_SIGINFO | SA_RESTART, + }; + + sigemptyset(&sa.sa_mask); + sigaddset(&sa.sa_mask, SIGCHLD); + if (sigaction(SIGCHLD, &sa, NULL)) { + pr_perror("Unable to setup SIGCHLD handler"); + return -1; + } + + return 0; +} + +static int restore_child_handler() +{ + struct sigaction sa = { + .sa_handler = SIG_DFL, /* XXX -- should be original? 
*/ + .sa_flags = SA_SIGINFO | SA_RESTART, + }; + + sigemptyset(&sa.sa_mask); + sigaddset(&sa.sa_mask, SIGCHLD); + if (sigaction(SIGCHLD, &sa, NULL)) { + pr_perror("Unable to setup SIGCHLD handler"); + return -1; + } + + return 0; +} + +static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, + user_regs_struct_t *regs, struct thread_ctx *octx) +{ + k_rtsigset_t block; + + ksigfillset(&block); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + goto err_sig; + } + + parasite_setup_regs(ip, stack, regs); + if (ptrace_set_regs(pid, regs)) { + pr_perror("Can't set registers for %d", pid); + goto err_regs; + } + + if (ptrace(cmd, pid, NULL, NULL)) { + pr_perror("Can't run parasite at %d", pid); + goto err_cont; + } + + return 0; + +err_cont: + if (ptrace_set_regs(pid, &octx->regs)) + pr_perror("Can't restore regs for %d", pid); +err_regs: + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask)) + pr_perror("Can't restore sigmask for %d", pid); +err_sig: + return -1; +} + +static int restore_thread_ctx(int pid, struct thread_ctx *ctx) +{ + int ret = 0; + + if (ptrace_set_regs(pid, &ctx->regs)) { + pr_perror("Can't restore registers (pid: %d)", pid); + ret = -1; + } + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) { + pr_perror("Can't block signals"); + ret = -1; + } + + return ret; +} + + +/* we run at @regs->ip */ +static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, + user_regs_struct_t *regs, + struct thread_ctx *octx) +{ + siginfo_t siginfo; + int status; + int ret = -1; + + /* + * Most ideas are taken from Tejun Heo's parasite thread + * https://code.google.com/p/ptrace-parasite/ + */ + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("Waited pid mismatch (pid: %d)", pid); + goto err; + } + + if (!WIFSTOPPED(status)) { + pr_err("Task is still running (pid: %d)\n", pid); + goto err; + } + + if 
(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) { + pr_perror("Can't get siginfo (pid: %d)", pid); + goto err; + } + + if (ptrace_get_regs(pid, regs)) { + pr_perror("Can't obtain registers (pid: %d)", pid); + goto err; + } + + if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) { + pr_debug("** delivering signal %d si_code=%d\n", + siginfo.si_signo, siginfo.si_code); + + pr_err("Unexpected %d task interruption, aborting\n", pid); + goto err; + } + + /* + * We've reached this point if int3 is triggered inside our + * parasite code. So we're done. + */ + ret = 0; +err: + if (restore_thread_ctx(pid, octx)) + ret = -1; + + return ret; +} + + +int compel_execute_syscall(struct parasite_ctl *ctl, + user_regs_struct_t *regs, const char *code_syscall) +{ + pid_t pid = ctl->rpid; + int err; + uint8_t code_orig[BUILTIN_SYSCALL_SIZE]; + + /* + * Inject syscall instruction and remember original code, + * we will need it to restore original program content. + */ + memcpy(code_orig, code_syscall, sizeof(code_orig)); + if (ptrace_swap_area(pid, (void *)ctl->ictx.syscall_ip, + (void *)code_orig, sizeof(code_orig))) { + pr_err("Can't inject syscall blob (pid: %d)\n", pid); + return -1; + } + + err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); + if (!err) + err = parasite_trap(ctl, pid, regs, &ctl->orig); + + if (ptrace_poke_area(pid, (void *)code_orig, + (void *)ctl->ictx.syscall_ip, sizeof(code_orig))) { + pr_err("Can't restore syscall blob (pid: %d)\n", ctl->rpid); + err = -1; + } + + return err; +} + +static int accept_tsock(struct parasite_ctl *ctl) +{ + int sock; + int ask = -ctl->tsock; /* this '-' is explained above */ + + sock = accept(ask, NULL, 0); + if (sock < 0) { + pr_perror("Can't accept connection to the transport socket"); + close(ask); + return -1; + } + + ctl->tsock = sock; + return 0; +} + +static int parasite_init_daemon(struct parasite_ctl *ctl) +{ + struct parasite_init_args *args; + pid_t pid = ctl->rpid; + 
user_regs_struct_t regs; + struct ctl_msg m = { }; + + *ctl->addr_cmd = PARASITE_CMD_INIT_DAEMON; + + args = compel_parasite_args(ctl, struct parasite_init_args); + + args->sigframe = (uintptr_t)ctl->rsigframe; + args->log_level = compel_log_get_loglevel(); + + futex_set(&args->daemon_connected, 0); + + if (prepare_tsock(ctl, pid, args)) + goto err; + + /* after this we can catch parasite errors in chld handler */ + if (setup_child_handler(ctl)) + goto err; + + regs = ctl->orig.regs; + if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, &regs, &ctl->orig)) + goto err; + + futex_wait_while_eq(&args->daemon_connected, 0); + if (futex_get(&args->daemon_connected) != 1) { + errno = -(int)futex_get(&args->daemon_connected); + pr_perror("Unable to connect a transport socket"); + goto err; + } + + if (accept_tsock(ctl) < 0) + goto err; + + if (compel_util_send_fd(ctl, ctl->ictx.log_fd)) + goto err; + + pr_info("Wait for parasite being daemonized...\n"); + + if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m)) { + pr_err("Can't switch parasite %d to daemon mode %d\n", + pid, m.err); + goto err; + } + + ctl->sigreturn_addr = (void*)(uintptr_t)args->sigreturn_addr; + ctl->daemonized = true; + pr_info("Parasite %d has been switched to daemon mode\n", pid); + return 0; +err: + return -1; +} + +static int parasite_start_daemon(struct parasite_ctl *ctl) +{ + pid_t pid = ctl->rpid; + struct infect_ctx *ictx = &ctl->ictx; + + /* + * Get task registers before going daemon, since the + * compel_get_task_regs needs to call ptrace on _stopped_ task, + * while in daemon it is not such. 
+ */ + + if (compel_get_task_regs(pid, ctl->orig.regs, ictx->save_regs, ictx->regs_arg)) { + pr_err("Can't obtain regs for thread %d\n", pid); + return -1; + } + + if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) + return -1; + + if (parasite_init_daemon(ctl)) + return -1; + + return 0; +} + +static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size) +{ + int fd; + + ctl->remote_map = remote_mmap(ctl, NULL, size, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (!ctl->remote_map) { + pr_err("Can't allocate memory for parasite blob (pid: %d)\n", ctl->rpid); + return -1; + } + + ctl->map_length = round_up(size, page_size()); + + fd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "map_files/%p-%p", + ctl->remote_map, ctl->remote_map + ctl->map_length); + if (fd < 0) + return -1; + + ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FILE, fd, 0); + close(fd); + + if (ctl->local_map == MAP_FAILED) { + ctl->local_map = NULL; + pr_perror("Can't map remote parasite map"); + return -1; + } + + return 0; +} + +static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) +{ + void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; + uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; + pid_t pid = ctl->rpid; + unsigned long sret = -ENOSYS; + int ret, fd, lfd; + bool __maybe_unused compat_task = !compel_mode_native(ctl); + + if (ctl->ictx.flags & INFECT_NO_MEMFD) + return 1; + + BUILD_BUG_ON(sizeof(orig_code) < sizeof(long)); + + if (ptrace_swap_area(pid, where, (void *)orig_code, sizeof(orig_code))) { + pr_err("Can't inject memfd args (pid: %d)\n", pid); + return -1; + } + + ret = compel_syscall(ctl, __NR(memfd_create, compat_task), &sret, + (unsigned long)where, 0, 0, 0, 0, 0); + + if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) { + fd = (int)(long)sret; + if (fd >= 0) + compel_syscall(ctl, __NR(close, compat_task), 
&sret, + fd, 0, 0, 0, 0, 0); + pr_err("Can't restore memfd args (pid: %d)\n", pid); + return -1; + } + + if (ret < 0) + return ret; + + fd = (int)(long)sret; + if (fd == -ENOSYS) + return 1; + if (fd < 0) + return fd; + + ctl->map_length = round_up(size, page_size()); + lfd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "fd/%d", fd); + if (lfd < 0) + goto err_cure; + + if (ftruncate(lfd, ctl->map_length) < 0) { + pr_perror("Fail to truncate memfd for parasite"); + goto err_cure; + } + + ctl->remote_map = remote_mmap(ctl, NULL, size, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FILE | MAP_SHARED, fd, 0); + if (!ctl->remote_map) { + pr_err("Can't rmap memfd for parasite blob\n"); + goto err_curef; + } + + ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FILE, lfd, 0); + if (ctl->local_map == MAP_FAILED) { + ctl->local_map = NULL; + pr_perror("Can't lmap memfd for parasite blob"); + goto err_curef; + } + + compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); + close(lfd); + + pr_info("Set up parasite blob using memfd\n"); + return 0; + +err_curef: + close(lfd); +err_cure: + compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); + return -1; +} + +void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs) +{ + size_t i, j; + + for (i = 0, j = 0; i < nr_relocs; i++) { + if (elf_relocs[i].type & COMPEL_TYPE_LONG) { + long *where = mem + elf_relocs[i].offset; + long *p = mem + size; + + if (elf_relocs[i].type & COMPEL_TYPE_GOTPCREL) { + int *value = (int *)where; + int rel; + + p[j] = (long)vbase + elf_relocs[i].value; + rel = (unsigned)((void *)&p[j] - (void *)mem) - elf_relocs[i].offset + elf_relocs[i].addend; + + *value = rel; + j++; + } else + *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase; + } else if (elf_relocs[i].type & COMPEL_TYPE_INT) { + int *where = (mem + elf_relocs[i].offset); + *where = elf_relocs[i].value + 
elf_relocs[i].addend + (unsigned long)vbase; + } else + BUG(); + } +} + +int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) +{ + int ret; + + ret = parasite_memfd_exchange(ctl, size); + if (ret == 1) { + pr_info("MemFD parasite doesn't work, goto legacy mmap\n"); + ret = parasite_mmap_exchange(ctl, size); + } + return ret; +} + +int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +{ + int ret; + unsigned long p, map_exchange_size, parasite_size = 0; + + if (ctl->ictx.log_fd < 0) + goto err; + + if (!arch_can_dump_task(ctl)) + goto err; + + /* + * Inject a parasite engine. Ie allocate memory inside alien + * space and copy engine code there. Then re-map the engine + * locally, so we will get an easy way to access engine memory + * without using ptrace at all. + */ + + parasite_size = ctl->pblob.size; + + ctl->args_size = round_up(args_size, PAGE_SIZE); + parasite_size += ctl->args_size; + + map_exchange_size = parasite_size; + map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE; + if (nr_threads > 1) + map_exchange_size += PARASITE_STACK_SIZE; + + ret = compel_map_exchange(ctl, map_exchange_size); + if (ret) + goto err; + + pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map); + + ctl->parasite_ip = (unsigned long)(ctl->remote_map + ctl->pblob.parasite_ip_off); + ctl->addr_cmd = ctl->local_map + ctl->pblob.addr_cmd_off; + ctl->addr_args = ctl->local_map + ctl->pblob.addr_arg_off; + + memcpy(ctl->local_map, ctl->pblob.mem, ctl->pblob.size); + if (ctl->pblob.nr_relocs) + compel_relocs_apply(ctl->local_map, ctl->remote_map, ctl->pblob.bsize, + ctl->pblob.relocs, ctl->pblob.nr_relocs); + + p = parasite_size; + + ctl->rsigframe = ctl->remote_map + p; + ctl->sigframe = ctl->local_map + p; + + p += RESTORE_STACK_SIGFRAME; + p += PARASITE_STACK_SIZE; + ctl->rstack = ctl->remote_map + p; + + if (nr_threads > 1) { + p += PARASITE_STACK_SIZE; + ctl->r_thread_stack = 
ctl->remote_map + p; + } + + if (parasite_start_daemon(ctl)) + goto err; + + return 0; + +err: + return -1; +} + +int compel_prepare_thread(int pid, struct thread_ctx *ctx) +{ + if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) { + pr_perror("can't get signal blocking mask for %d", pid); + return -1; + } + + if (ptrace_get_regs(pid, &ctx->regs)) { + pr_perror("Can't obtain registers (pid: %d)", pid); + return -1; + } + + return 0; +} + +struct parasite_ctl *compel_prepare(int pid) +{ + struct parasite_ctl *ctl = NULL; + + /* + * Control block early setup. + */ + ctl = xzalloc(sizeof(*ctl)); + if (!ctl) { + pr_err("Parasite control block allocation failed (pid: %d)\n", pid); + goto err; + } + + ctl->tsock = -1; + ctl->ictx.log_fd = -1; + + if (compel_prepare_thread(pid, &ctl->orig)) + goto err; + + ctl->rpid = pid; + + BUILD_BUG_ON(PARASITE_START_AREA_MIN < BUILTIN_SYSCALL_SIZE + MEMFD_FNAME_SZ); + + return ctl; + +err: + xfree(ctl); + return NULL; +} + +static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) +{ + void *addr = (void *) REG_IP(*regs); + return addr >= ctl->remote_map && + addr < ctl->remote_map + ctl->map_length; +} + +static int parasite_fini_seized(struct parasite_ctl *ctl) +{ + pid_t pid = ctl->rpid; + user_regs_struct_t regs; + int status, ret = 0; + enum trace_flags flag; + + /* stop getting chld from parasite -- we're about to step-by-step it */ + if (restore_child_handler()) + return -1; + + /* Start to trace syscalls for each thread */ + if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) { + pr_perror("Unable to interrupt the process"); + return -1; + } + + pr_debug("Waiting for %d to trap\n", pid); + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("Waited pid mismatch (pid: %d)", pid); + return -1; + } + + pr_debug("Daemon %d exited trapping\n", pid); + if (!WIFSTOPPED(status)) { + pr_err("Task is still running (pid: %d)\n", pid); + return -1; + } + + ret = ptrace_get_regs(pid, &regs); + 
if (ret) { + pr_perror("Unable to get registers"); + return -1; + } + + if (!task_in_parasite(ctl, &regs)) { + pr_err("The task is not in parasite code\n"); + return -1; + } + + ret = compel_rpc_call(PARASITE_CMD_FINI, ctl); + close_safe(&ctl->tsock); + if (ret) + return -1; + + /* Go to sigreturn as close as we can */ + ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, + ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + if (ret < 0) + return ret; + + if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), + __NR(rt_sigreturn, 1), flag)) + return -1; + + if (ptrace_flush_breakpoints(pid)) + return -1; + + /* + * All signals are unblocked now. The kernel notifies about leaving + * syscall before starting to deliver signals. All parasite code is + * executed with blocked signals, so we can safely unmap a parasite blob. + */ + + return 0; +} + +int compel_stop_daemon(struct parasite_ctl *ctl) +{ + if (ctl->daemonized) { + /* + * Looks like a previous attempt failed, we should do + * nothing in this case. parasite will try to cure itself. 
+ */ + if (ctl->tsock < 0) + return -1; + + if (parasite_fini_seized(ctl)) { + close_safe(&ctl->tsock); + return -1; + } + } + + ctl->daemonized = false; + + return 0; +} + +int compel_cure_remote(struct parasite_ctl *ctl) +{ + if (compel_stop_daemon(ctl)) + return -1; + + if (!ctl->remote_map) + return 0; + + /* Unseizing task with parasite -- it does it himself */ + if (ctl->addr_cmd) { + struct parasite_unmap_args *args; + + *ctl->addr_cmd = PARASITE_CMD_UNMAP; + + args = compel_parasite_args(ctl, struct parasite_unmap_args); + args->parasite_start = ctl->remote_map; + args->parasite_len = ctl->map_length; + if (compel_unmap(ctl, ctl->parasite_ip)) + return -1; + } else { + unsigned long ret; + + compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, + (unsigned long)ctl->remote_map, ctl->map_length, + 0, 0, 0, 0); + if (ret) { + pr_err("munmap for remote map %p, %lu returned %lu\n", + ctl->remote_map, ctl->map_length, ret); + return -1; + } + } + + return 0; +} + +int compel_cure_local(struct parasite_ctl *ctl) +{ + int ret = 0; + + if (ctl->local_map) { + if (munmap(ctl->local_map, ctl->map_length)) { + pr_err("munmap failed (pid: %d)\n", ctl->rpid); + ret = -1; + } + } + + free(ctl); + return ret; +} + +int compel_cure(struct parasite_ctl *ctl) +{ + int ret; + + ret = compel_cure_remote(ctl); + if (!ret) + ret = compel_cure_local(ctl); + + return ret; +} + +void *compel_parasite_args_p(struct parasite_ctl *ctl) +{ + return ctl->addr_args; +} + +void *compel_parasite_args_s(struct parasite_ctl *ctl, int args_size) +{ + BUG_ON(args_size > ctl->args_size); + return compel_parasite_args_p(ctl); +} + +int compel_run_in_thread(pid_t pid, unsigned int cmd, + struct parasite_ctl *ctl, + struct thread_ctx *octx) +{ + void *stack = ctl->r_thread_stack; + user_regs_struct_t regs = octx->regs; + int ret; + + *ctl->addr_cmd = cmd; + + ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, &regs, octx); + if (ret == 0) + ret = parasite_trap(ctl, pid, &regs, 
octx); + if (ret == 0) + ret = (int)REG_RES(regs); + + if (ret) + pr_err("Parasite exited with %d\n", ret); + + return ret; +} + +/* + * compel_unmap() is used for unmapping parasite and restorer blobs. + * A blob can contain code for unmapping itself, so the process is + * trapped on the exit from the munmap syscall. + */ +int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) +{ + user_regs_struct_t regs = ctl->orig.regs; + pid_t pid = ctl->rpid; + int ret = -1; + + ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, &regs, &ctl->orig); + if (ret) + goto err; + + ret = compel_stop_on_syscall(1, __NR(munmap, 0), + __NR(munmap, 1), TRACE_ENTER); + + if (restore_thread_ctx(pid, &ctl->orig)) + ret = -1; +err: + return ret; +} + +int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) +{ + int ret; + + if (no_bp) { + pr_debug("Force no-breakpoints restore\n"); + ret = 0; + } else + ret = ptrace_set_breakpoint(pid, addr); + if (ret < 0) + return ret; + + if (ret > 0) { + /* + * PIE will stop on a breakpoint, next + * stop after that will be syscall enter. + */ + *tf = TRACE_EXIT; + return 0; + } + + /* + * No breakpoints available -- start tracing it + * in a per-syscall manner. 
+ */ + ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); + if (ret) { + pr_perror("Unable to restart the %d process", pid); + return -1; + } + + *tf = TRACE_ENTER; + return 0; +} + +static bool task_is_trapped(int status, pid_t pid) +{ + if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + return true; + + pr_err("Task %d is in unexpected state: %x\n", pid, status); + if (WIFEXITED(status)) + pr_err("Task exited with %d\n", WEXITSTATUS(status)); + if (WIFSIGNALED(status)) + pr_err("Task signaled with %d: %s\n", + WTERMSIG(status), strsignal(WTERMSIG(status))); + if (WIFSTOPPED(status)) + pr_err("Task stopped with %d: %s\n", + WSTOPSIG(status), strsignal(WSTOPSIG(status))); + if (WIFCONTINUED(status)) + pr_err("Task continued\n"); + + return false; +} + +static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, + const int sys_nr, const int sys_nr_compat) +{ + const char *mode = user_regs_native(regs) ? "native" : "compat"; + int req_sysnr = user_regs_native(regs) ? sys_nr : sys_nr_compat; + + pr_debug("%d (%s) is going to execute the syscall %lu, required is %d\n", + pid, mode, REG_SYSCALL_NR(*regs), req_sysnr); + + return (REG_SYSCALL_NR(*regs) == req_sysnr); +} + +/* + * Trap tasks on the exit from the specified syscall + * + * tasks - number of processes, which should be trapped + * sys_nr - the required syscall number + * sys_nr_compat - the required compatible syscall number + */ +int compel_stop_on_syscall(int tasks, + const int sys_nr, const int sys_nr_compat, + enum trace_flags trace) +{ + user_regs_struct_t regs; + int status, ret; + pid_t pid; + + if (tasks > 1) + trace = TRACE_ALL; + + /* Stop all threads on the enter point in sys_rt_sigreturn */ + while (tasks) { + pid = wait4(-1, &status, __WALL, NULL); + if (pid == -1) { + pr_perror("wait4 failed"); + return -1; + } + + if (!task_is_trapped(status, pid)) + return -1; + + pr_debug("%d was trapped\n", pid); + + if (trace == TRACE_EXIT) { + trace = TRACE_ENTER; + pr_debug("`- Expecting 
exit\n"); + goto goon; + } + if (trace == TRACE_ENTER) + trace = TRACE_EXIT; + + ret = ptrace_get_regs(pid, &regs); + if (ret) { + pr_perror("ptrace"); + return -1; + } + + if (is_required_syscall(&regs, pid, sys_nr, sys_nr_compat)) { + /* + * The process is going to execute the required syscall, + * the next stop will be on the exit from this syscall + */ + ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); + if (ret) { + pr_perror("ptrace"); + return -1; + } + + pid = wait4(pid, &status, __WALL, NULL); + if (pid == -1) { + pr_perror("wait4 failed"); + return -1; + } + + if (!task_is_trapped(status, pid)) + return -1; + + pr_debug("%d was stopped\n", pid); + tasks--; + continue; + } +goon: + ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); + if (ret) { + pr_perror("ptrace"); + return -1; + } + } + + return 0; +} + +int compel_mode_native(struct parasite_ctl *ctl) +{ + return user_regs_native(&ctl->orig.regs); +} + +k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl) +{ + return &ctl->orig.sigmask; +} + +struct infect_ctx *compel_infect_ctx(struct parasite_ctl *ctl) +{ + return &ctl->ictx; +} + +struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *ctl) +{ + return &ctl->pblob; +} diff --git a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c new file mode 100644 index 000000000..c2991b5d8 --- /dev/null +++ b/compel/src/lib/ptrace.c @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/compiler.h" + +#include "uapi/compel/asm/infect-types.h" +#include "uapi/compel/ptrace.h" + +#include "log.h" + +int suspend_seccomp(pid_t pid) +{ + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + pr_perror("suspending seccomp failed"); + return -1; + } + + return 0; +} + +int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes) +{ + unsigned long w; + if (bytes & (sizeof(long) - 1)) + 
return -1; + for (w = 0; w < bytes / sizeof(long); w++) { + unsigned long *d = dst, *a = addr; + d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL); + if (d[w] == -1U && errno) + goto err; + } + return 0; +err: + return -2; +} + +int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes) +{ + unsigned long w; + if (bytes & (sizeof(long) - 1)) + return -1; + for (w = 0; w < bytes / sizeof(long); w++) { + unsigned long *s = src, *a = addr; + if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w])) + goto err; + } + return 0; +err: + return -2; +} + +/* don't swap big space, it might overflow the stack */ +int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes) +{ + void *t = alloca(bytes); + + if (ptrace_peek_area(pid, t, dst, bytes)) + return -1; + + if (ptrace_poke_area(pid, src, dst, bytes)) { + if (ptrace_poke_area(pid, t, dst, bytes)) + return -2; + return -1; + } + + memcpy(src, t, bytes); + + return 0; +} + +int __attribute__((weak)) ptrace_get_regs(int pid, user_regs_struct_t *regs) { + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); +} + +int __attribute__((weak)) ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} -- cgit v1.2.3