#!/bin/bash # # A convenience shell script to call criu for checkpointing and restoring # a Docker container. # # This script saves the user from having to remember all the command # line options, some of which are very long. Note that once Docker # has native support for checkpoint and restore, there will no longer # be a need for this particular shell script. # set -o errexit set -o nounset set -o pipefail # # These can be set in the environment to override their defaults. # Note that while the default value of CRIU_IMG_DIR in this script # is a directory in DOCKER_HOME, it doesn't have to be tied to # DOCKER_HOME. For example, it can be /var/spool/criu_img. # : ${DOCKER_HOME=/var/lib/docker} : ${DOCKER_BINARY=docker} : ${CRIU_IMG_DIR=${DOCKER_HOME}/criu_img} : ${CRIU_BINARY=criu} : ${DOCKERINIT_BINARY=} # # Patterns for different filesystem types in dump.log. # readonly AUFS_PATTERN='/sys/fs/aufs/si_' readonly OVERLAYFS_PATTERN='type.*source.*options.*lowerdir=.*upperdir=.*workdir=' readonly UNIONFS_PATTERN='type.*source.*options.*dirs=' # # These globals will be set by init_container_vars() # declare CID declare CONTAINER_IMG_DIR declare CONTAINER_DUMP_LOG declare -A BIND_MOUNT BIND_MOUNT[/etc/resolv.conf]=.ResolvConfPath BIND_MOUNT[/etc/hosts]=.HostsPath BIND_MOUNT[/etc/hostname]=.HostnamePath MOUNT_MAP_ARGS=() # # The default mode is non-verbose, printing only a short message # saying if the command succeeded or failed. For the verbose mode, # we could have used set -o xtrace but this option would have # generated excessive output suitable for debugging, not normal # usage. So we set ${ECHO} to echo in the verbose mode to print # selected messages. # VERBOSE="" ECHO=":" CMD="" PGNAME=$(basename "$0") usage() { local rv=0 if [[ -n "${1-}" ]]; then rv=1 echo -e "${PGNAME}: $1\n" >&2 fi cat <] -c, --checkpoint checkpoint container -h, --help print help message -r, --restore restore container -v, --verbose enable verbose mode Environment: DOCKER_HOME (default ${DOCKER_HOME}) CRIU_IMG_DIR (default ${CRIU_IMG_DIR}) DOCKER_BINARY (default ${DOCKER_BINARY}) DOCKERINIT_BINARY (default \${DOCKER_HOME}/init/dockerinit--dev) CRIU_BINARY (default ${CRIU_BINARY}) EOF exit ${rv} } # # If the user has not specified a bind mount file for the container's # /.dockerinit, try to determine it from the Docker version. # find_dockerinit() { local v if [[ -z "${DOCKERINIT_BINARY}" ]]; then v=$("${DOCKER_BINARY}" --version | sed -e 's/.*version \(.*\),.*/\1/') DOCKERINIT_BINARY="${DOCKER_HOME}/init/dockerinit-${v}" elif [[ "${DOCKERINIT_BINARY}" != /* ]]; then DOCKERINIT_BINARY="${DOCKER_HOME}/init/${DOCKERINIT_BINARY}" fi if [[ ! -x "${DOCKERINIT_BINARY}" ]]; then echo "${DOCKERINIT_BINARY} does not exist" exit 1 fi BIND_MOUNT[/.dockerinit]="${DOCKERINIT_BINARY}" } parse_args() { local args local flags args=$(getopt --options 'chrv' \ --longoptions 'checkpoint help restore verbose' -- "$@") [[ $? == 0 ]] || usage eval set -- "${args}" while :; do arg="${1}" shift case "${arg}" in -c|--checkpoint) CMD="dump" ;; -h|--help) usage ;; -r|--restore) CMD="restore" ;; -v|--verbose) VERBOSE="-v"; ECHO="echo" ;; --) break ;; *) usage "internal error parsing arguments!" ;; esac done [[ "${CMD}" == "" ]] && usage "need either -c or -r" [[ $# -gt 1 ]] && usage "$# too many arguments" # if no container id in args, prompt the user if [[ $# -eq 1 ]]; then CID="$1" else if [[ "${CMD}" == "dump" ]]; then flags="" else # we need -a only for restore flags="-a" fi "${DOCKER_BINARY}" ps ${flags} read -rp $'\nContainer ID: ' CID fi } execute() { # since commands are pretty long and can wrap around # several lines, print a blank line to make it visually # easier to see ${ECHO} -e "\n$*" "$@" } init_container_vars() { local d CID=$(get_container_conf .Id) d=$("${DOCKER_BINARY}" info 2> /dev/null | awk '/Storage Driver:/ { print $3 }') if [[ "${d}" == "vfs" ]]; then CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/dir/${CID}" elif [[ "${d}" == "aufs" || "${d}" == "unionfs" ]]; then CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/mnt/${CID}" elif [[ "${d}" == "overlay" ]]; then CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/${CID}/merged" else echo "${d}: unknown filesystem type" return 1 fi CONTAINER_IMG_DIR="${CRIU_IMG_DIR}/${CID}" CONTAINER_DUMP_LOG="${CONTAINER_IMG_DIR}/dump.log" } get_container_conf() { local val val=$("${DOCKER_BINARY}" inspect --format "{{$1}}" "${CID}") [[ "${val}" == "" ]] && exit 1 echo "${val//}" } setup_mount_map() { local key if [[ "$1" == "dump" ]]; then for key in "${!BIND_MOUNT[@]}"; do MOUNT_MAP_ARGS+=(--ext-mount-map "${key}:${key}") done else for key in "${!BIND_MOUNT[@]}"; do if [[ "${key}" == "/.dockerinit" ]]; then MOUNT_MAP_ARGS+=("--ext-mount-map" "${key}:${BIND_MOUNT[$key]}") else MOUNT_MAP_ARGS+=("--ext-mount-map" "${key}:$(get_container_conf "${BIND_MOUNT[$key]}")") fi done fi } fs_mounted() { if grep -wq "$1" /proc/self/mountinfo; then ${ECHO} "container root directory already mounted" return 0 fi ${ECHO} "container root directory not mounted" return 1 } # # Pretty print the mount command in verbose mode by putting each branch # pathname on a single line for easier visual inspection. # pp_mount() { ${ECHO} -e "\nmount -t $1 -o" ${ECHO} "${2}" | tr ':,' '\n' ${ECHO} "${3}" ${ECHO} "${4}" } # # Reconstruct the AUFS filesystem from information in CRIU's dump log. # The dump log has a series of branch entries for each process in the # entire process tree in the following form: # # (00.014075) /sys/fs/aufs/si_f598876b0855b883/br0 : /var/lib/docker/aufs/diff/ # # Note that this script assumes that all processes in the process # tree have the same AUFS filesystem. This assumption is fairly # safe for typical Docker containers. # setup_aufs() { local -r tmpf="${CONTAINER_IMG_DIR}/aufs.br" local br local branches # nothing to do if filesystem already mounted fs_mounted "${CONTAINER_ROOT_DIR}" && return # create a temporary file with branches listed in # ascending order (line 1 is branch 0) awk '/aufs.si_/ { print $2, $4 }' "${CONTAINER_DUMP_LOG}" | \ sort | uniq | awk '{ print $2 }' > "${tmpf}" # construct the mount option string from branches branches="" while read br; do branches+="${branches:+:}${br}" done < "${tmpf}" # mount the container's filesystem pp_mount "aufs" "${branches}" "none" "${CONTAINER_ROOT_DIR}" mount -t aufs -o br="${branches}" none "${CONTAINER_ROOT_DIR}" rm -f "${tmpf}" } setup_overlayfs() { local lowerdir local upperdir local workdir local ovlydirs local -r f="${CONTAINER_DUMP_LOG}" # nothing to do if filesystem already mounted fs_mounted "${CONTAINER_ROOT_DIR}" && return lowerdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*lowerdir=\([^,]*\).*/\1/p') upperdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*upperdir=\([^,]*\).*/\1/p') workdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*workdir=\([^,]*\).*/\1/p') ovlydirs="lowerdir=${lowerdir},upperdir=${upperdir},workdir=${workdir}" # mount the container's filesystem pp_mount "overlay" "${ovlydirs}" "overlay" "${CONTAINER_ROOT_DIR}" mount -t overlay -o "${ovlydirs}" overlay "${CONTAINER_ROOT_DIR}" } # # Reconstruct the UnionFS filesystem from information in CRIU's dump log. # The dump log has the mountinfo root entry for the filesystem. The # options field contains the list of directories that make up the UnionFS. # # Note that this script assumes that all processes in the process # tree have the same UnionFS filesystem. This assumption is fairly # safe for typical Docker containers. # # XXX If /dev/null was manually created by Docker (i.e., it's not in # a branch), create it. Although this has worked so far, it needs # a deeper look as I am not sure if /dev/null should be created as # a regular file to be the target of a bind mount or created as a # device file by mknod. # setup_unionfs() { local dirs # nothing to do if filesystem already mounted fs_mounted "${CONTAINER_ROOT_DIR}" && return dirs=$(sed -n -e 's/.*type.*dirs=/dirs=/p' "${CONTAINER_DUMP_LOG}") [[ "${dirs}" = "" ]] && echo "do not have branch information" && exit 1 # mount the container's filesystem pp_mount "unionfs" "${dirs}" "none" "${CONTAINER_ROOT_DIR}" mount -t unionfs -o "${dirs}" none "${CONTAINER_ROOT_DIR}" # see comment at the beginning of the function if [[ ! -e "${CONTAINER_ROOT_DIR}/dev/null" ]]; then execute touch "${CONTAINER_ROOT_DIR}/dev/null" fi } prep_dump() { local pid pid=$(get_container_conf .State.Pid) # docker returns 0 for containers it thinks have exited # (i.e., dumping a restored container again) if [[ ${pid} -eq 0 ]]; then echo -e "\nCheckpointing a restored container?" read -p "Process ID: " pid fi # remove files previously created by criu but not others files (if any) mkdir -p "${CONTAINER_IMG_DIR}" rm -f "${CONTAINER_IMG_DIR}"/*.{img,log,pid} "${CONTAINER_IMG_DIR}"/stats-restore CMD_ARGS=("-t" "${pid}") # we need --root only for aufs to compensate for the # erroneous information in /proc//map_files if [[ "${CONTAINER_ROOT_DIR}" == *aufs* ]]; then CMD_ARGS+=("--root" "${CONTAINER_ROOT_DIR}") fi } # # Set up container's root filesystem if not already set up. # prep_restore() { local -r f="${CONTAINER_DUMP_LOG}" if [[ ! -f "${f}" ]]; then echo "${f} does not exist" return 1 fi if grep -q "${AUFS_PATTERN}" "${f}"; then setup_aufs elif grep -q "${OVERLAYFS_PATTERN}" "${f}"; then setup_overlayfs elif grep -q "${UNIONFS_PATTERN}" "${f}"; then setup_unionfs fi # criu requires this (due to container using pivot_root) if ! grep -qw "${CONTAINER_ROOT_DIR}" /proc/self/mountinfo; then execute mount --rbind "${CONTAINER_ROOT_DIR}" "${CONTAINER_ROOT_DIR}" MOUNTED=1 else MOUNTED=0 fi CMD_ARGS=("-d" "--root" "${CONTAINER_ROOT_DIR}" "--pidfile" "${CONTAINER_IMG_DIR}/restore.pid") } # # Since this function produces output string (either in the # verbose mode or from ${CRIU_BINARY}), we set the return value # in parameter 1. # run_criu() { local -a common_args=("-v4" "-D" "${CONTAINER_IMG_DIR}" \ "-o" "${CMD}.log" \ "--manage-cgroups" \ "--evasive-devices") setup_mount_map "${CMD}" common_args+=("${MOUNT_MAP_ARGS[@]}") # we do not want to exit if there's an error execute "${CRIU_BINARY}" "${CMD}" "${common_args[@]}" "${CMD_ARGS[@]}" } wrap_up() { local -r logf="${CONTAINER_IMG_DIR}/${CMD}.log" local -r pidf="${CONTAINER_IMG_DIR}/restore.pid" if [[ $1 -eq 0 ]]; then ${ECHO} -e "\n" echo "${CMD} successful" else ${ECHO} -e "\n" echo "${CMD} failed" fi if [[ "${VERBOSE}" == "-v" && -e "${logf}" ]]; then if ! grep "finished successfully" "${logf}"; then grep Error "${logf}" fi fi if [[ "${CMD}" == "restore" ]]; then if [[ ${MOUNTED} -eq 1 ]]; then execute umount "${CONTAINER_ROOT_DIR}" fi if [[ -e "${pidf}" ]]; then ${ECHO} -e "\n$(ps -f -p "$(cat "${pidf}")" --no-headers)" fi fi } resolve_path() { local p p="${2}" if which realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" } resolve_cmd() { local cpath cpath=$(which "${2}") resolve_path "${1}" "${cpath}" } main() { local rv=0 if [[ $(id -u) -ne 0 ]]; then echo "not running as root" exit 1 fi parse_args "$@" find_dockerinit init_container_vars if [[ "${VERBOSE}" == "-v" ]]; then echo resolve_cmd "docker binary" "${DOCKER_BINARY}" resolve_cmd "dockerinit binary" "${DOCKERINIT_BINARY}" resolve_cmd "criu binary" "${CRIU_BINARY}" resolve_path "image directory" "${CONTAINER_IMG_DIR}" resolve_path "container root directory" "${CONTAINER_ROOT_DIR}" fi if [[ "${CMD}" == "dump" ]]; then prep_dump else prep_restore fi run_criu || rv=$? wrap_up ${rv} exit ${rv} } main "$@"