diff options
author | Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com> | 2022-04-08 23:03:37 +0300 |
---|---|---|
committer | Andrei Vagin <avagin@gmail.com> | 2022-04-29 03:53:52 +0300 |
commit | f81e3062ca6a4a0376b63b31749773f8f0c09949 (patch) | |
tree | 4ec68be654f515c544e1b476269c2ccd56ece042 | |
parent | bd9ee325540c5dc2db96582ef6d5cb5c0b78d065 (diff) |
rseq: initial support
Support the basic rseq checkpoint/restore (C/R) scenario. Assume that:
- no process has its instruction pointer (IP) inside an rseq critical section (CS)
- the kernel supports ptrace(PTRACE_GET_RSEQ_CONFIGURATION)
On dump:
1. use ptrace(PTRACE_GET_RSEQ_CONFIGURATION) to get the
struct rseq pointer, the rseq size and the signature from the kernel.
2. save them to the image
On restore:
1. get the rseq pointer, size and signature from the image
2. register it back using rseq() from the restorer (criu/pie/restorer.c)
Fixes: #1696
Reported-by: Radostin Stoyanov <radostin@redhat.com>
Suggested-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
-rw-r--r-- | criu/cr-dump.c | 70 | ||||
-rw-r--r-- | criu/cr-restore.c | 22 | ||||
-rw-r--r-- | criu/include/linux/rseq.h | 137 | ||||
-rw-r--r-- | criu/include/restorer.h | 7 | ||||
-rw-r--r-- | criu/pie/parasite.c | 2 | ||||
-rw-r--r-- | criu/pie/restorer.c | 25 | ||||
-rw-r--r-- | images/Makefile | 1 | ||||
-rw-r--r-- | images/core.proto | 2 | ||||
-rw-r--r-- | images/rseq.proto | 9 |
9 files changed, 275 insertions, 0 deletions
diff --git a/criu/cr-dump.c b/criu/cr-dump.c index c6678b450..02a9ea4bb 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -45,6 +45,7 @@ #include "proc_parse.h" #include "parasite.h" #include "parasite-syscall.h" +#include "compel/ptrace.h" #include "files.h" #include "files-reg.h" #include "shmem.h" @@ -1003,6 +1004,69 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) return 0; } +static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) +{ + struct __ptrace_rseq_configuration rseq; + RseqEntry *rseqe = NULL; + int ret; + + /* + * If we are here it means that rseq() syscall is supported, + * but ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported, + * we can just fail dump here. But this is bad idea, IMHO. + * + * So, we will try to detect if victim process was used rseq(). + * See check_rseq() and check_thread_rseq() functions. + */ + if (!kdat.has_ptrace_get_rseq_conf) + return 0; + + ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq); + if (ret != sizeof(rseq)) { + pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret); + return -1; + } + + if (rseq.flags != 0) { + pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid, + rseq.flags); + return -1; + } + + pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, + rseq.signature); + + rseqe = xmalloc(sizeof(*rseqe)); + if (!rseqe) + return -1; + + rseq_entry__init(rseqe); + + rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer; + rseqe->rseq_abi_size = rseq.rseq_abi_size; + rseqe->signature = rseq.signature; + + *rseqep = rseqe; + + return 0; +} + +static int dump_task_rseq(pid_t pid, struct pstree_item *item) +{ + int i; + + /* if rseq() syscall isn't supported then nothing to dump */ + if (!kdat.has_rseq) + return 0; + + for (i = 0; i < item->nr_threads; i++) { + if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry)) + return -1; + } + + return 0; +} 
+ static struct proc_pid_stat pps_buf; static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) @@ -1298,6 +1362,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; } + ret = dump_task_rseq(pid, item); + if (ret) { + pr_err("Dump %d rseq failed %d\n", pid, ret); + goto err; + } + parasite_ctl = parasite_infect_seized(pid, item, &vmas); if (!parasite_ctl) { pr_err("Can't infect (pid: %d) with parasite\n", pid); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index a398927ad..ed576fc55 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2994,6 +2994,24 @@ static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc) return 0; } +static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc) +{ + /* compatibility with older CRIU versions */ + if (!tc->rseq_entry) + return 0; + + rseq->rseq_abi_pointer = tc->rseq_entry->rseq_abi_pointer; + rseq->rseq_abi_size = tc->rseq_entry->rseq_abi_size; + rseq->signature = tc->rseq_entry->signature; + + if (rseq->rseq_abi_pointer && !kdat.has_rseq) { + pr_err("rseq: can't restore as kernel doesn't support it\n"); + return -1; + } + + return 0; +} + static rlim_t decode_rlim(rlim_t ival) { return ival == -1 ? 
RLIM_INFINITY : ival; @@ -3704,6 +3722,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); + ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); + if (ret) + goto err; + rst_reloc_creds(&thread_args[i], &creds_pos_next); thread_args[i].futex_rla = tcore->thread_core->futex_rla; diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h new file mode 100644 index 000000000..b227aefdf --- /dev/null +++ b/criu/include/linux/rseq.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_RSEQ_H +#define _UAPI_LINUX_RSEQ_H + +/* + * linux/rseq.h + * + * Restartable sequences system call API + * + * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + */ + +#include <linux/types.h> +#include <asm/byteorder.h> + +enum rseq_cpu_id_state { + RSEQ_CPU_ID_UNINITIALIZED = -1, + RSEQ_CPU_ID_REGISTRATION_FAILED = -2, +}; + +enum rseq_flags { + RSEQ_FLAG_UNREGISTER = (1 << 0), +}; + +enum rseq_cs_flags_bit { + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, +}; + +enum rseq_cs_flags { + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), +}; + +/* + * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always + * contained within a single cache-line. It is usually declared as + * link-time constant data. + */ +struct rseq_cs { + /* Version of this structure. */ + __u32 version; + /* enum rseq_cs_flags */ + __u32 flags; + __u64 start_ip; + /* Offset from start_ip. 
*/ + __u64 post_commit_offset; + __u64 abort_ip; +} __attribute__((aligned(4 * sizeof(__u64)))); + +/* + * We have to have our own copy of struct rseq definition because + * of breaking UAPI change: + * https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=bfdf4e6208051ed7165b2e92035b4bf11f43eb63 + */ +/* + * struct rseq is aligned on 4 * 8 bytes to ensure it is always + * contained within a single cache-line. + * + * A single struct rseq per thread is allowed. + */ +struct criu_rseq { + /* + * Restartable sequences cpu_id_start field. Updated by the + * kernel. Read by user-space with single-copy atomicity + * semantics. This field should only be read by the thread which + * registered this data structure. Aligned on 32-bit. Always + * contains a value in the range of possible CPUs, although the + * value may not be the actual current CPU (e.g. if rseq is not + * initialized). This CPU number value should always be compared + * against the value of the cpu_id field before performing a rseq + * commit or returning a value read from a data structure indexed + * using the cpu_id_start value. + */ + __u32 cpu_id_start; + /* + * Restartable sequences cpu_id field. Updated by the kernel. + * Read by user-space with single-copy atomicity semantics. This + * field should only be read by the thread which registered this + * data structure. Aligned on 32-bit. Values + * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED + * have a special semantic: the former means "rseq uninitialized", + * and latter means "rseq initialization failed". This value is + * meant to be read within rseq critical sections and compared + * with the cpu_id_start value previously read, before performing + * the commit instruction, or read and compared with the + * cpu_id_start value before returning a value loaded from a data + * structure indexed using the cpu_id_start value. + */ + __u32 cpu_id; + /* + * Restartable sequences rseq_cs field. 
+ * + * Contains NULL when no critical section is active for the current + * thread, or holds a pointer to the currently active struct rseq_cs. + * + * Updated by user-space, which sets the address of the currently + * active rseq_cs at the beginning of assembly instruction sequence + * block, and set to NULL by the kernel when it restarts an assembly + * instruction sequence block, as well as when the kernel detects that + * it is preempting or delivering a signal outside of the range + * targeted by the rseq_cs. Also needs to be set to NULL by user-space + * before reclaiming memory that contains the targeted struct rseq_cs. + * + * Read and set by the kernel. Set by user-space with single-copy + * atomicity semantics. This field should only be updated by the + * thread which registered this data structure. Aligned on 64-bit. + * + * 32-bit architectures should update the low order bits of the + * rseq_cs field, leaving the high order bits initialized to 0. + */ + __u64 rseq_cs; + + /* + * Restartable sequences flags field. + * + * This field should only be updated by the thread which + * registered this data structure. Read by the kernel. + * Mainly used for single-stepping through rseq critical sections + * with debuggers. + * + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT + * Inhibit instruction sequence block restart on preemption + * for this thread. + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL + * Inhibit instruction sequence block restart on signal + * delivery for this thread. + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE + * Inhibit instruction sequence block restart on migration for + * this thread. 
+ */ + __u32 flags; +} __attribute__((aligned(4 * sizeof(__u64)))); + +#endif /* _UAPI_LINUX_RSEQ_H */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 308a0b79b..2e21da522 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -44,6 +44,12 @@ struct rst_sched_param { int prio; }; +struct rst_rseq_param { + u64 rseq_abi_pointer; + u32 rseq_abi_size; + u32 signature; +}; + struct restore_posix_timer { struct str_posix_timer spt; struct itimerspec val; @@ -98,6 +104,7 @@ struct thread_restore_args { struct task_restore_args *ta; tls_t tls; + struct rst_rseq_param rseq; siginfo_t *siginfo; unsigned int siginfo_n; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index f84652b8c..e17321894 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -8,6 +8,8 @@ #include <sys/ioctl.h> #include <sys/uio.h> +#include "linux/rseq.h" + #include "common/config.h" #include "int.h" #include "types.h" diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 422a12ff8..376a5025d 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -425,6 +425,28 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) return 0; } +static int restore_rseq(struct rst_rseq_param *rseq) +{ + int ret; + + if (!rseq->rseq_abi_pointer) { + pr_debug("rseq: nothing to restore\n"); + return 0; + } + + pr_debug("rseq: rseq_abi_pointer = %lx signature = %x\n", (unsigned long)rseq->rseq_abi_pointer, + rseq->signature); + + ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 0, rseq->signature); + if (ret) { + pr_err("failed sys_rseq(%lx, %lx, %x, %x) = %d\n", (unsigned long)rseq->rseq_abi_pointer, + (unsigned long)rseq->rseq_abi_size, 0, rseq->signature, ret); + return -1; + } + + return 0; +} + static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args) { unsigned int flags = args->seccomp_force_tsync ? 
SECCOMP_FILTER_FLAG_TSYNC : 0; @@ -549,6 +571,9 @@ static int restore_thread_common(struct thread_restore_args *args) restore_tls(&args->tls); + if (restore_rseq(&args->rseq)) + return -1; + return 0; } diff --git a/images/Makefile b/images/Makefile index 2eaeb7cad..004e22ec3 100644 --- a/images/Makefile +++ b/images/Makefile @@ -71,6 +71,7 @@ proto-obj-y += img-streamer.o proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o +proto-obj-y += rseq.o CFLAGS += -iquote $(obj)/ diff --git a/images/core.proto b/images/core.proto index b713119f2..35079f366 100644 --- a/images/core.proto +++ b/images/core.proto @@ -14,6 +14,7 @@ import "timer.proto"; import "creds.proto"; import "sa.proto"; import "siginfo.proto"; +import "rseq.proto"; import "opts.proto"; @@ -101,6 +102,7 @@ message thread_core_entry { optional string comm = 13; optional uint64 blk_sigset_extended = 14; + optional rseq_entry rseq_entry = 15; } message task_rlimits_entry { diff --git a/images/rseq.proto b/images/rseq.proto new file mode 100644 index 000000000..be2800468 --- /dev/null +++ b/images/rseq.proto @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +message rseq_entry { + required uint64 rseq_abi_pointer = 1; + required uint32 rseq_abi_size = 2; + required uint32 signature = 3; +} |