Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>2022-04-08 23:03:37 +0300
committerAndrei Vagin <avagin@gmail.com>2022-04-29 03:53:52 +0300
commitf81e3062ca6a4a0376b63b31749773f8f0c09949 (patch)
tree4ec68be654f515c544e1b476269c2ccd56ece042
parentbd9ee325540c5dc2db96582ef6d5cb5c0b78d065 (diff)
rseq: initial support
Support basic rseq C/R scenario. Assume that:
- there are no processes with IP inside the rseq critical section (CS)
- kernel has ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support

On dump:
1. use ptrace(PTRACE_GET_RSEQ_CONFIGURATION) to get struct rseq pointer, rseq size and signature from the kernel.
2. save to the image

On restore:
1. get rseq ptr, size, signature from the image
2. register it back using rseq() from the restorer parasite

Fixes: #1696
Reported-by: Radostin Stoyanov <radostin@redhat.com>
Suggested-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
-rw-r--r--criu/cr-dump.c70
-rw-r--r--criu/cr-restore.c22
-rw-r--r--criu/include/linux/rseq.h137
-rw-r--r--criu/include/restorer.h7
-rw-r--r--criu/pie/parasite.c2
-rw-r--r--criu/pie/restorer.c25
-rw-r--r--images/Makefile1
-rw-r--r--images/core.proto2
-rw-r--r--images/rseq.proto9
9 files changed, 275 insertions, 0 deletions
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index c6678b450..02a9ea4bb 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -45,6 +45,7 @@
#include "proc_parse.h"
#include "parasite.h"
#include "parasite-syscall.h"
+#include "compel/ptrace.h"
#include "files.h"
#include "files-reg.h"
#include "shmem.h"
@@ -1003,6 +1004,69 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item)
return 0;
}
+/*
+ * Dump the rseq registration of a single thread.
+ *
+ * Queries the kernel with ptrace(PTRACE_GET_RSEQ_CONFIGURATION) for the
+ * thread's rseq ABI pointer, size and signature and stores them in a newly
+ * allocated RseqEntry handed back through @rseqep.
+ *
+ * @tid:    thread to query
+ * @rseqep: receives the allocated entry; left untouched when the kernel
+ *          lacks PTRACE_GET_RSEQ_CONFIGURATION or when an error occurs
+ *
+ * Returns 0 on success (including the "nothing dumped" case), -1 on error.
+ */
+static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep)
+{
+	struct __ptrace_rseq_configuration rseq;
+	RseqEntry *rseqe;
+	int ret;
+
+	/*
+	 * Getting here means that the rseq() syscall is supported, but
+	 * ptrace(PTRACE_GET_RSEQ_CONFIGURATION) may still be missing.
+	 * Failing the whole dump in that case would be too strict, so
+	 * the rseq state is simply not dumped.
+	 *
+	 * Whether the victim process actually used rseq() is detected
+	 * separately. See check_rseq() and check_thread_rseq() functions.
+	 */
+	if (!kdat.has_ptrace_get_rseq_conf)
+		return 0;
+
+	ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq);
+	if (ret < 0) {
+		pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret);
+		return -1;
+	}
+
+	if (ret != sizeof(rseq)) {
+		/* The call itself succeeded, so errno carries no information here. */
+		pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d, expected %d\n", tid, ret,
+		       (int)sizeof(rseq));
+		return -1;
+	}
+
+	if (rseq.flags != 0) {
+		pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid,
+		       rseq.flags);
+		return -1;
+	}
+
+	pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer,
+		rseq.signature);
+
+	rseqe = xmalloc(sizeof(*rseqe));
+	if (!rseqe)
+		return -1;
+
+	rseq_entry__init(rseqe);
+
+	rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer;
+	rseqe->rseq_abi_size = rseq.rseq_abi_size;
+	rseqe->signature = rseq.signature;
+
+	*rseqep = rseqe;
+
+	return 0;
+}
+
+/*
+ * Dump the rseq state of every thread of a task.
+ *
+ * @pid:  pid of the task (not used by this function)
+ * @item: process tree item whose threads are dumped; each thread's entry
+ *        is stored in the matching core's thread_core->rseq_entry
+ *
+ * Returns 0 on success or when the kernel has no rseq() syscall,
+ * -1 as soon as dumping one thread fails.
+ */
+static int dump_task_rseq(pid_t pid, struct pstree_item *item)
+{
+	int idx;
+
+	/* Without the rseq() syscall there is nothing to dump. */
+	if (!kdat.has_rseq)
+		return 0;
+
+	for (idx = 0; idx < item->nr_threads; idx++) {
+		RseqEntry **slot = &item->core[idx]->thread_core->rseq_entry;
+		int err = dump_thread_rseq(item->threads[idx].real, slot);
+
+		if (err)
+			return -1;
+	}
+
+	return 0;
+}
+
static struct proc_pid_stat pps_buf;
static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item)
@@ -1298,6 +1362,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
goto err;
}
+ ret = dump_task_rseq(pid, item);
+ if (ret) {
+ pr_err("Dump %d rseq failed %d\n", pid, ret);
+ goto err;
+ }
+
parasite_ctl = parasite_infect_seized(pid, item, &vmas);
if (!parasite_ctl) {
pr_err("Can't infect (pid: %d) with parasite\n", pid);
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index a398927ad..ed576fc55 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -2994,6 +2994,24 @@ static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc)
return 0;
}
+/*
+ * Fill the rseq restore parameters of one thread from its image entry.
+ *
+ * @rseq: parameters to fill; left untouched when the image has no rseq entry
+ * @tc:   thread core entry read from the image
+ *
+ * Returns 0 on success, -1 when an rseq area has to be restored but the
+ * kernel does not support rseq.
+ */
+static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc)
+{
+	RseqEntry *entry = tc->rseq_entry;
+
+	/* Images written by older CRIU versions carry no rseq entry. */
+	if (entry == NULL)
+		return 0;
+
+	rseq->rseq_abi_pointer = entry->rseq_abi_pointer;
+	rseq->rseq_abi_size = entry->rseq_abi_size;
+	rseq->signature = entry->signature;
+
+	/* Either nothing was registered, or the kernel can register it back. */
+	if (!rseq->rseq_abi_pointer || kdat.has_rseq)
+		return 0;
+
+	pr_err("rseq: can't restore as kernel doesn't support it\n");
+	return -1;
+}
+
static rlim_t decode_rlim(rlim_t ival)
{
return ival == -1 ? RLIM_INFINITY : ival;
@@ -3704,6 +3722,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
core_get_tls(tcore, &thread_args[i].tls);
+ ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core);
+ if (ret)
+ goto err;
+
rst_reloc_creds(&thread_args[i], &creds_pos_next);
thread_args[i].futex_rla = tcore->thread_core->futex_rla;
diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h
new file mode 100644
index 000000000..b227aefdf
--- /dev/null
+++ b/criu/include/linux/rseq.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RSEQ_H
+#define _UAPI_LINUX_RSEQ_H
+
+/*
+ * linux/rseq.h
+ *
+ * Restartable sequences system call API
+ *
+ * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+/*
+ * Special values of the cpu_id field of the rseq area: the former means
+ * "rseq uninitialized", the latter "rseq initialization failed".
+ */
+enum rseq_cpu_id_state {
+ RSEQ_CPU_ID_UNINITIALIZED = -1,
+ RSEQ_CPU_ID_REGISTRATION_FAILED = -2,
+};
+
+/*
+ * NOTE(review): presumably values for the flags argument of the rseq()
+ * system call, with RSEQ_FLAG_UNREGISTER requesting unregistration of a
+ * previously registered area - confirm against the kernel UAPI header.
+ */
+enum rseq_flags {
+ RSEQ_FLAG_UNREGISTER = (1 << 0),
+};
+
+/* Bit positions from which the enum rseq_cs_flags masks are built. */
+enum rseq_cs_flags_bit {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+};
+
+/*
+ * Bit masks for the flags fields of struct rseq_cs and of the rseq area.
+ * Each one inhibits the restart of an instruction sequence block on the
+ * named event: preemption, signal delivery or migration.
+ */
+enum rseq_cs_flags {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT),
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+};
+
+/*
+ * Descriptor of one rseq critical section. The rseq_cs field of the
+ * per-thread rseq area holds a pointer to the currently active descriptor.
+ *
+ * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line. It is usually declared as
+ * link-time constant data.
+ */
+struct rseq_cs {
+ /* Version of this structure. */
+ __u32 version;
+ /* enum rseq_cs_flags */
+ __u32 flags;
+ /* NOTE(review): presumably the address where the critical section starts - confirm. */
+ __u64 start_ip;
+ /* Offset from start_ip. */
+ __u64 post_commit_offset;
+ /* NOTE(review): presumably the address execution is moved to on abort - confirm. */
+ __u64 abort_ip;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+/*
+ * CRIU has to carry its own copy of the struct rseq definition (named
+ * struct criu_rseq here) because of a breaking UAPI change:
+ * https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=bfdf4e6208051ed7165b2e92035b4bf11f43eb63
+ */
+/*
+ * struct criu_rseq is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line.
+ *
+ * A single struct rseq per thread is allowed.
+ */
+struct criu_rseq {
+ /*
+ * Restartable sequences cpu_id_start field. Updated by the
+ * kernel. Read by user-space with single-copy atomicity
+ * semantics. This field should only be read by the thread which
+ * registered this data structure. Aligned on 32-bit. Always
+ * contains a value in the range of possible CPUs, although the
+ * value may not be the actual current CPU (e.g. if rseq is not
+ * initialized). This CPU number value should always be compared
+ * against the value of the cpu_id field before performing a rseq
+ * commit or returning a value read from a data structure indexed
+ * using the cpu_id_start value.
+ */
+ __u32 cpu_id_start;
+ /*
+ * Restartable sequences cpu_id field. Updated by the kernel.
+ * Read by user-space with single-copy atomicity semantics. This
+ * field should only be read by the thread which registered this
+ * data structure. Aligned on 32-bit. Values
+ * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED
+ * have a special semantic: the former means "rseq uninitialized",
+ * and latter means "rseq initialization failed". This value is
+ * meant to be read within rseq critical sections and compared
+ * with the cpu_id_start value previously read, before performing
+ * the commit instruction, or read and compared with the
+ * cpu_id_start value before returning a value loaded from a data
+ * structure indexed using the cpu_id_start value.
+ */
+ __u32 cpu_id;
+ /*
+ * Restartable sequences rseq_cs field.
+ *
+ * Contains NULL when no critical section is active for the current
+ * thread, or holds a pointer to the currently active struct rseq_cs.
+ *
+ * Updated by user-space, which sets the address of the currently
+ * active rseq_cs at the beginning of assembly instruction sequence
+ * block, and set to NULL by the kernel when it restarts an assembly
+ * instruction sequence block, as well as when the kernel detects that
+ * it is preempting or delivering a signal outside of the range
+ * targeted by the rseq_cs. Also needs to be set to NULL by user-space
+ * before reclaiming memory that contains the targeted struct rseq_cs.
+ *
+ * Read and set by the kernel. Set by user-space with single-copy
+ * atomicity semantics. This field should only be updated by the
+ * thread which registered this data structure. Aligned on 64-bit.
+ *
+ * 32-bit architectures should update the low order bits of the
+ * rseq_cs field, leaving the high order bits initialized to 0.
+ */
+ __u64 rseq_cs;
+
+ /*
+ * Restartable sequences flags field.
+ *
+ * This field should only be updated by the thread which
+ * registered this data structure. Read by the kernel.
+ * Mainly used for single-stepping through rseq critical sections
+ * with debuggers.
+ *
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT
+ * Inhibit instruction sequence block restart on preemption
+ * for this thread.
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL
+ * Inhibit instruction sequence block restart on signal
+ * delivery for this thread.
+ * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE
+ * Inhibit instruction sequence block restart on migration for
+ * this thread.
+ */
+ __u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+#endif /* _UAPI_LINUX_RSEQ_H */
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 308a0b79b..2e21da522 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -44,6 +44,12 @@ struct rst_sched_param {
int prio;
};
+/*
+ * rseq registration parameters of one thread: filled from the image's
+ * rseq_entry by prep_rseq() and handed to sys_rseq() by restore_rseq().
+ */
+struct rst_rseq_param {
+ u64 rseq_abi_pointer; /* address of the rseq area; 0 means nothing to restore */
+ u32 rseq_abi_size; /* size passed to sys_rseq() together with the pointer */
+ u32 signature; /* signature passed to sys_rseq() on registration */
+};
+
struct restore_posix_timer {
struct str_posix_timer spt;
struct itimerspec val;
@@ -98,6 +104,7 @@ struct thread_restore_args {
struct task_restore_args *ta;
tls_t tls;
+ struct rst_rseq_param rseq;
siginfo_t *siginfo;
unsigned int siginfo_n;
diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
index f84652b8c..e17321894 100644
--- a/criu/pie/parasite.c
+++ b/criu/pie/parasite.c
@@ -8,6 +8,8 @@
#include <sys/ioctl.h>
#include <sys/uio.h>
+#include "linux/rseq.h"
+
#include "common/config.h"
#include "int.h"
#include "types.h"
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 422a12ff8..376a5025d 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -425,6 +425,28 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
return 0;
}
+/*
+ * Register a thread's rseq area back with the kernel.
+ *
+ * @rseq: pointer, size and signature of the area to register; a zero
+ *        rseq_abi_pointer means that there is nothing to restore
+ *
+ * Returns 0 on success or when there is nothing to do, -1 when
+ * sys_rseq() fails.
+ */
+static int restore_rseq(struct rst_rseq_param *rseq)
+{
+	int err;
+
+	if (rseq->rseq_abi_pointer == 0) {
+		pr_debug("rseq: nothing to restore\n");
+		return 0;
+	}
+
+	pr_debug("rseq: rseq_abi_pointer = %lx signature = %x\n", (unsigned long)rseq->rseq_abi_pointer,
+		 rseq->signature);
+
+	/* The flags argument is 0: this is a plain registration. */
+	err = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 0, rseq->signature);
+	if (!err)
+		return 0;
+
+	pr_err("failed sys_rseq(%lx, %lx, %x, %x) = %d\n", (unsigned long)rseq->rseq_abi_pointer,
+	       (unsigned long)rseq->rseq_abi_size, 0, rseq->signature, err);
+	return -1;
+}
+
static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
{
unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0;
@@ -549,6 +571,9 @@ static int restore_thread_common(struct thread_restore_args *args)
restore_tls(&args->tls);
+ if (restore_rseq(&args->rseq))
+ return -1;
+
return 0;
}
diff --git a/images/Makefile b/images/Makefile
index 2eaeb7cad..004e22ec3 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -71,6 +71,7 @@ proto-obj-y += img-streamer.o
proto-obj-y += bpfmap-file.o
proto-obj-y += bpfmap-data.o
proto-obj-y += apparmor.o
+proto-obj-y += rseq.o
CFLAGS += -iquote $(obj)/
diff --git a/images/core.proto b/images/core.proto
index b713119f2..35079f366 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -14,6 +14,7 @@ import "timer.proto";
import "creds.proto";
import "sa.proto";
import "siginfo.proto";
+import "rseq.proto";
import "opts.proto";
@@ -101,6 +102,7 @@ message thread_core_entry {
optional string comm = 13;
optional uint64 blk_sigset_extended = 14;
+ optional rseq_entry rseq_entry = 15;
}
message task_rlimits_entry {
diff --git a/images/rseq.proto b/images/rseq.proto
new file mode 100644
index 000000000..be2800468
--- /dev/null
+++ b/images/rseq.proto
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: MIT
+
+syntax = "proto2";
+
+// Per-thread rseq registration as reported by
+// ptrace(PTRACE_GET_RSEQ_CONFIGURATION) at dump time; on restore these
+// values are used to register the rseq area back.
+message rseq_entry {
+ required uint64 rseq_abi_pointer = 1;
+ required uint32 rseq_abi_size = 2;
+ required uint32 signature = 3;
+}