Welcome to mirror list, hosted at ThFree Co, Russian Federation.

restorer.h « include « criu - github.com/checkpoint-restore/criu.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: d642765e3f3a663d4c277ba5e8d066c655c969a4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#ifndef __CR_RESTORER_H__
#define __CR_RESTORER_H__

#include <signal.h>
#include <limits.h>
#include <sys/resource.h>
#include <linux/filter.h>

#include "common/config.h"
#include "types.h"
#include "int.h"
#include "types.h"
#include "common/compiler.h"
#include <compel/asm/fpu.h>
#include "common/lock.h"
#include "util.h"
#include "asm/restorer.h"
#include "posix-timer.h"
#include "timerfd.h"
#include "shmem.h"
#include "parasite-vdso.h"
#include "fault-injection.h"

#include <time.h>

#include "images/mm.pb-c.h"

/*
 * These *must* be power of two values.
 */
#define RESTORE_ARGS_SIZE     (512)
#define RESTORE_STACK_REDZONE (128)
#define RESTORE_STACK_SIZE    (KILO(32))

struct restore_mem_zone {
	u8 redzone[RESTORE_STACK_REDZONE];
	u8 stack[RESTORE_STACK_SIZE];
	u8 rt_sigframe[RESTORE_STACK_SIGFRAME];
} __stack_aligned__;

struct rst_sched_param {
	int policy;
	int nice;
	int prio;
};

struct rst_rseq_param {
	u64 rseq_abi_pointer;
	u32 rseq_abi_size;
	u32 signature;
};

struct restore_posix_timer {
	struct str_posix_timer spt;
	struct itimerspec val;
	int overrun;
};

/*
 * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame,
 * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things
 * simpler, force both _args alignment be 64 bytes.
 */

struct thread_creds_args {
	CredsEntry creds;

	unsigned int cap_last_cap;

	u32 cap_inh[CR_CAP_SIZE];
	u32 cap_prm[CR_CAP_SIZE];
	u32 cap_eff[CR_CAP_SIZE];
	u32 cap_bnd[CR_CAP_SIZE];

	unsigned int secbits;
	char *lsm_profile;
	unsigned int *groups;
	char *lsm_sockcreate;

	unsigned long mem_lsm_profile_pos;
	unsigned long mem_lsm_sockcreate_pos;
	unsigned long mem_groups_pos;

	unsigned long mem_pos_next;
};

struct thread_seccomp_filter {
	struct sock_fprog sock_fprog;
	unsigned int flags;
};

struct thread_restore_args {
	struct restore_mem_zone *mz;

	int pid;
	UserRegsEntry gpregs;
	u64 clear_tid_addr;

	u64 futex_rla;
	u32 futex_rla_len;

	struct rst_sched_param sp;

	struct task_restore_args *ta;

	tls_t tls;
	struct rst_rseq_param rseq;

	siginfo_t *siginfo;
	unsigned int siginfo_n;

	int pdeath_sig;

	struct thread_creds_args *creds_args;

	int seccomp_mode;
	unsigned long seccomp_filters_pos;
	struct thread_seccomp_filter *seccomp_filters;
	void *seccomp_filters_data;
	unsigned int seccomp_filters_n;
	bool seccomp_force_tsync;

	char comm[TASK_COMM_LEN];
} __aligned(64);

typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args);

struct restore_vma_io {
	int nr_iovs;
	loff_t off;
	struct iovec iovs[0];
};

#define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec))

struct task_restore_args {
	struct thread_restore_args *t; /* thread group leader */

	int fd_exe_link; /* opened self->exe file */
	int logfd;
	unsigned int loglevel;
	struct timeval logstart;

	int uffd;
	bool has_thp_enabled;

	/* threads restoration */
	int nr_threads;				 /* number of threads */
	thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */
	struct thread_restore_args *thread_args; /* array of thread arguments */
	struct task_entries *task_entries;
	void *rst_mem;
	unsigned long rst_mem_size;

	/* Below arrays get remapped from RM_PRIVATE in sigreturn_restore */
	VmaEntry *vmas;
	unsigned int vmas_n;

	int vma_ios_fd;
	struct restore_vma_io *vma_ios;
	unsigned int vma_ios_n;

	struct restore_posix_timer *posix_timers;
	unsigned int posix_timers_n;

	struct restore_timerfd *timerfd;
	unsigned int timerfd_n;

	siginfo_t *siginfo;
	unsigned int siginfo_n;

	struct rst_tcp_sock *tcp_socks;
	unsigned int tcp_socks_n;

	struct rst_aio_ring *rings;
	unsigned int rings_n;

	struct rlimit64 *rlims;
	unsigned int rlims_n;

	pid_t *helpers /* the TASK_HELPERS to wait on at the end of restore */;
	unsigned int helpers_n;

	pid_t *zombies;
	unsigned int zombies_n;

	int *inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */
	unsigned int inotify_fds_n;

	/* * * * * * * * * * * * * * * * * * * * */

	unsigned long task_size;
	unsigned long premmapped_addr;
	unsigned long premmapped_len;
	rt_sigaction_t sigchld_act;

	void *bootstrap_start;
	unsigned long bootstrap_len;

	struct itimerval itimers[3];

	MmEntry mm;
	auxv_t mm_saved_auxv[AT_VECTOR_SIZE];
	u32 mm_saved_auxv_size;
	char comm[TASK_COMM_LEN];

	/*
	 * proc_fd is a handle to /proc that the restorer blob can use to open
	 * files there, because some of them can't be opened before the
	 * restorer blob is called.
	 */
	int proc_fd;

	int seccomp_mode;

	bool compatible_mode;

	bool can_map_vdso;
	bool auto_dedup;
	unsigned long vdso_rt_size;
	struct vdso_maps vdso_maps_rt;	 /* runtime vdso symbols */
	unsigned long vdso_rt_parked_at; /* safe place to keep vdso */
	void **breakpoint;

	enum faults fault_strategy;
#ifdef ARCH_HAS_LONG_PAGES
	unsigned page_size;
#endif
	int lsm_type;
	int child_subreaper;
	bool has_clone3_set_tid;

	/*
	 * info about rseq from libc used to
	 * unregister it before memory restoration procedure
	 */
	struct rst_rseq_param libc_rseq;

	uid_t uid;
	u32 cap_eff[CR_CAP_SIZE];
} __aligned(64);

/*
 * For arm64 stack needs to aligned to 16 bytes.
 * Hence align to 16 bytes for all
*/
#define RESTORE_ALIGN_STACK(start, size) (ALIGN((start) + (size)-16, 16))

static inline unsigned long restorer_stack(struct restore_mem_zone *mz)
{
	return RESTORE_ALIGN_STACK((long)&mz->stack, RESTORE_STACK_SIZE);
}

enum {
	/*
	 * Restore stages. The stage is started by criu process, then
	 * confirmed by all tasks involved in it. Then criu does some
	 * actions and starts the next stage.
	 *
	 * The first stated stage is CR_STATE_ROOT_TASK which is started
	 * right before calling fork_with_pid() for the root_item.
	 */
	CR_STATE_FAIL = -1,
	/*
	 * Root task is created and does some pre-checks.
	 * After the stage ACT_SETUP_NS scripts are performed.
	 */
	CR_STATE_ROOT_TASK = 0,
	/*
	 * The prepare_namespace() is called.
	 * After the stage criu opens root task's mntns and
	 * calls ACT_POST_SETUP_NS scripts.
	 */
	CR_STATE_PREPARE_NAMESPACES,
	/*
	 * All tasks fork and call open_transport_socket().
	 * Stage is needed to make sure they all have the socket.
	 * Also this stage is a sync point after which the
	 * fini_restore_mntns() can be called.
	 *
	 * This stage is a little bit special. Normally all stages
	 * are controlled by criu process, but when this stage
	 * starts criu process starts waiting for the tasks to
	 * finish it, but by the time it gets woken up the stage
	 * finished is CR_STATE_RESTORE. The forking stage is
	 * barrier-ed by the root task, this task is also the one
	 * that switches the stage (into restoring).
	 *
	 * The above is done to lower the amount of context
	 * switches from root task to criu and back, since the
	 * separate forking stage is not needed by criu, it's
	 * purely to make sure all tasks be in sync.
	 */
	CR_STATE_FORKING,
	/*
	 * Main restore stage. By the end of it all tasks are
	 * almost ready and what's left is:
	 *   pick up zombies and helpers
	 *   restore sigchild handlers used to detect restore errors
	 *   restore credentials, seccomp, dumpable and pdeath_sig
	 */
	CR_STATE_RESTORE,
	/*
	 * Tasks restore sigchild handlers.
	 * Stage is needed to synchronize the change in error
	 * propagation via sigchild.
	 */
	CR_STATE_RESTORE_SIGCHLD,
	/*
	 * Final stage.
	 * For security reason processes can be resumed only when all
	 * credentials are restored. Otherwise someone can attach to a
	 * process, which are not restored credentials yet and execute
	 * some code.
	 * Seccomp needs to be restored after creds.
	 * Dumpable and pdeath signal are restored after seccomp.
	 */
	CR_STATE_RESTORE_CREDS,
	CR_STATE_COMPLETE
};

#define restore_finish_stage(__v, __stage)                  \
	({                                                  \
		futex_dec_and_wake(&(__v)->nr_in_progress); \
		futex_wait_while(&(__v)->start, __stage);   \
		(s32) futex_get(&(__v)->start);             \
	})

#define __r_sym(name)		  restorer_sym##name
#define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name))

#endif /* __CR_RESTORER_H__ */