/*
 * NOTE(review): the next eleven #include directives carry no header name in
 * this copy of the file (the <...> targets appear to have been stripped).
 * They must be restored from the original source before this file can build.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "cr_options.h"
#include "util.h"
#include "util-pie.h"
#include "log.h"
#include "plugin.h"
#include "filesystems.h"
#include "mount.h"
#include "mount-v2.h"
#include "pstree.h"
#include "image.h"
#include "namespaces.h"
#include "protobuf.h"
#include "fs-magic.h"
#include "path.h"
#include "files-reg.h"
#include "external.h"
#include "clone-noasan.h"
#include "fdstore.h"
#include "rst-malloc.h"
#include "images/mnt.pb-c.h"

#undef LOG_PREFIX
#define LOG_PREFIX "mnt: "

#define CONTEXT_OPT "context="

/* A helper mount_info entry for the roots yard */
struct mount_info *root_yard_mp = NULL;

static LIST_HEAD(delayed_unbindable);

/*
 * Pick the path used to operate on a mount: ->plain_mountpoint when
 * restoring without mntns compat mode (it must be set, otherwise BUG),
 * ->mountpoint in every other case.
 */
char *service_mountpoint(const struct mount_info *mi)
{
	if (!opts.mntns_compat_mode && opts.mode == CR_RESTORE) {
		BUG_ON(!mi->plain_mountpoint);
		return mi->plain_mountpoint;
	}
	return mi->mountpoint;
}

/*
 * Register an external mount as the string "mnt[<key>]:<val>" via
 * add_external(). Returns -1 if the temporary string can't be allocated,
 * otherwise whatever add_external() returns.
 */
int ext_mount_add(char *key, char *val)
{
	cleanup_free char *e_str = NULL;

	/* +8 leaves room for the fixed "mnt[" and "]:" characters plus the NUL */
	e_str = xmalloc(strlen(key) + strlen(val) + 8);
	if (!e_str)
		return -1;

	/*
	 * On dump the key is the mountpoint as seen from the mount
	 * namespace, the val is some name that will be put into image
	 * instead of the mount point's root path.
	 *
	 * On restore the key is the name from the image (the one
	 * mentioned above) and the val is the path in criu's mount
	 * namespace that will become the mount point's root, i.e. --
	 * be bind mounted to the respective mountpoint.
*/
	sprintf(e_str, "mnt[%s]:%s", key, val);
	return add_external(e_str);
}

/*
 * Enable external mount autodetection. An optional ":m" / ":s" suffix in
 * @key additionally enables external masters / external sharing; only the
 * first character after ':' is inspected. Returns -1 when that character is
 * neither 'm', 's' nor the end of the string, 0 otherwise.
 */
int ext_mount_parse_auto(char *key)
{
	opts.autodetect_ext_mounts = true;

	if (*key == ':') {
		key++;
		if (*key == 'm')
			opts.enable_external_masters = true;
		else if (*key == 's')
			opts.enable_external_sharing = true;
		else if (*key != '\0')
			return -1;
	}

	return 0;
}

/*
 * Lookup ext_mount by key field.
 * Returns the value found for "mnt[<key>]", or NULL when the lookup
 * yields an error pointer.
 */
static char *ext_mount_lookup(char *key)
{
	char *v;
	int len = strlen(key);
	/* "mnt[" + key + "]" + NUL */
	char mkey[len + 6];

	sprintf(mkey, "mnt[%s]", key);
	v = external_lookup_by_key(mkey);
	if (IS_ERR(v))
		v = NULL;

	return v;
}

/*
 * Singly linked list of mount points obtained from proc/images
 */
struct mount_info *mntinfo;

/* Append @new to the tail of the global mntinfo list. */
static void mntinfo_add_list(struct mount_info *new)
{
	if (!mntinfo)
		mntinfo = new;
	else {
		struct mount_info *pm;

		/* Add to the tail. (FIXME -- make O(1) ) */
		for (pm = mntinfo; pm->next != NULL; pm = pm->next)
			;
		pm->next = new;
	}
}

/* Push @new in front of the list whose head pointer is *@head. */
void mntinfo_add_list_before(struct mount_info **head, struct mount_info *new)
{
	new->next = *head;
	*head = new;
}

static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath, unsigned int st_dev,
					     unsigned int st_ino, unsigned int mnt_id)
{
	/*
	 * Goes through all entries in the mountinfo table
	 * looking for a mount point that contains the file specified
	 * in rpath. Uses the device number st_dev and the inode number st_ino
	 * to make sure the file is correct.
*/
	struct mount_info *mi_ret = NULL;
	struct mount_info *m;
	int mntns_root = -1;

	/*
	 * Note: the loop does not stop at the first hit, so when several
	 * overlayfs mounts match the last one in the list is returned.
	 * The mnt_id argument is not used in this function.
	 */
	for (m = list; m != NULL; m = m->next) {
		struct stat f_stat;
		int ret_stat;

		if (m->fstype->code != FSTYPE__OVERLAYFS)
			continue;

		/*
		 * We need the mntns root fd of the process to be dumped,
		 * to make sure we stat the correct file
		 */
		if (mntns_root == -1) {
			mntns_root = __mntns_get_root_fd(root_item->pid->real);
			if (mntns_root < 0) {
				pr_err("Unable to get the root file descriptor of pid %d\n", root_item->pid->real);
				return ERR_PTR(-ENOENT);
			}
		}

		/*
		 * Concatenates m->ns_mountpoint with rpath and attempts
		 * to stat the resulting path at mntns_root
		 */
		if (is_root_mount(m)) {
			ret_stat = fstatat(mntns_root, rpath, &f_stat, 0);
		} else {
			char _full_path[PATH_MAX];
			int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->ns_mountpoint, rpath);

			/* Truncated path: report it instead of stat-ing a wrong file */
			if (n >= PATH_MAX) {
				pr_err("Not enough space to concatenate %s and %s\n", m->ns_mountpoint, rpath);
				return ERR_PTR(-ENOSPC);
			}
			ret_stat = fstatat(mntns_root, _full_path, &f_stat, 0);
		}

		if (ret_stat == 0 && st_dev == f_stat.st_dev && st_ino == f_stat.st_ino)
			mi_ret = m;
	}

	return mi_ret;
}

/*
 * Looks up the mnt_id and path of a file in an overlayFS directory.
 *
 * This is useful in order to fix the OverlayFS bug present in the
 * Linux Kernel before version 4.2. See fixup_overlayfs for details.
 *
 * We first check to see if the mnt_id and st_dev numbers currently match
 * some entry in the mountinfo table. If so, we already have the correct mnt_id
 * and no fixup is needed.
 *
 * Then we proceed to see if there are any overlayFS mounted directories
 * in the mountinfo table. If so, we concatenate the mountpoint with the
 * name of the file, and stat the resulting path to check if we found the
 * correct device id and node number. If that is the case, we update the
 * mount id and link variables with the correct values.
*/
struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev, unsigned int st_ino, unsigned int mnt_id)
{
	struct mount_info *m;

	/* If the mnt_id and device number match for some entry, no fixup is needed */
	for (m = mntinfo; m != NULL; m = m->next)
		if (st_dev == kdev_to_odev(m->s_dev) && mnt_id == m->mnt_id)
			return NULL;

	return __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
}

/* Linear search of @list for the entry whose mnt_id equals @id; NULL if none. */
static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
{
	struct mount_info *m;

	for (m = list; m != NULL; m = m->next)
		if (m->mnt_id == id)
			return m;

	return NULL;
}

/* Same as __lookup_mnt_id() but over the global mntinfo list. */
struct mount_info *lookup_mnt_id(unsigned int id)
{
	return __lookup_mnt_id(mntinfo, id);
}

/*
 * Find the first directory mount in mntinfo with the given s_dev.
 * Logs an error and returns NULL when there is none.
 */
struct mount_info *lookup_mnt_sdev(unsigned int s_dev)
{
	struct mount_info *m;

	for (m = mntinfo; m != NULL; m = m->next)
		/*
		 * We should not provide notdir bindmounts to open_mount as
		 * opening them can fail/hang for binds of unix sockets/fifos
		 */
		if (m->s_dev == s_dev && mnt_is_dir(m))
			return m;

	pr_err("Unable to find suitable mount point for s_dev %x\n", s_dev);
	return NULL;
}

/*
 * Walk down the mount tree from @mntinfo_tree, at each level descending
 * into the first child whose mountpoint (with its leading character
 * skipped) is a path-component prefix of @path. Returns the deepest mount
 * reached; that is the tree root itself when no child matches.
 */
static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path)
{
	size_t pathlen = strlen(path);
	struct mount_info *m = mntinfo_tree, *c;

	while (1) {
		list_for_each_entry(c, &m->children, siblings) {
			size_t n;

			n = strlen(c->ns_mountpoint + 1);
			if (n > pathlen)
				continue;

			if (strncmp(c->ns_mountpoint + 1, path, min(n, pathlen)))
				continue;
			/* Prefix must end on a component boundary */
			if (n < pathlen && path[n] != '/')
				continue;

			m = c;
			break;
		}
		/*
		 * The iterator ran off the end of the children list of the
		 * current mount, i.e. nothing deeper matched: stop here.
		 */
		if (&c->siblings == &m->children)
			break;
	}

	pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->ns_mountpoint);
	return m;
}

/*
 * Translate a stat() device number for @path into the device number to
 * compare against mountinfo: the kernel-encoded st_dev, except on btrfs
 * where the s_dev of the resolved mount is returned instead.
 */
dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path)
{
	struct mount_info *m;

	m = mount_resolve_path(ns->mnt.mntinfo_tree, path);
	/*
	 * BTRFS returns subvolume dev-id instead of
	 * superblock dev-id, in such case return device
	 * obtained from mountinfo (ie subvolume0).
	 */
	return strcmp(m->fstype->name, "btrfs") ? MKKDEV(major(st_dev), minor(st_dev)) : m->s_dev;
}

/*
 * True when @st_dev refers to @phys_dev, either directly or after the
 * resolution done by phys_stat_resolve_dev().
 */
bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, struct ns_id *ns, const char *path)
{
	if (st_dev == kdev_to_odev(phys_dev))
		return true;

	return phys_dev == phys_stat_resolve_dev(ns, st_dev, path);
}

/*
 * Compare super-blocks mounted at two places
 */
static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b)
{
	if (a->s_dev != b->s_dev)
		return false;

	/*
	 * If one of compared mounts is external its mount info can have fstype
	 * and source fields changed by resolve_external_mounts() or
	 * try_resolve_ext_mount(), but we still want to detect bindmounts of
	 * this external mount, so let's skip source and fstype checks for it.
	 */
	if (!a->external && !b->external) {
		if (strcmp(a->source, b->source) != 0)
			return false;

		if (a->fstype != b->fstype)
			return false;

		/* A filesystem-specific comparator, when present, has the last word */
		if (a->fstype->sb_equal)
			return a->fstype->sb_equal(a, b);
	} else {
		if (a->fstype->sb_equal)
			return a->fstype->sb_equal(a, b);
		else if (b->fstype->sb_equal)
			return b->fstype->sb_equal(a, b);
	}

	if (strcmp(a->options, b->options))
		return false;

	return true;
}

/*
 * Compare superblocks AND the way they are mounted
 */
static bool mounts_equal(struct mount_info *a, struct mount_info *b)
{
	if (!mounts_sb_equal(a, b))
		return false;
	if (strcmp(a->root, b->root))
		return false;
	return true;
}

/*
 * mnt_roots is a temporary directory for restoring sub-trees of
 * non-root namespaces.
 */
char *mnt_roots;

/*
 * Link every mount of @list to its parent (->parent, ->children/->siblings)
 * using the mnt_id/parent_mnt_id pairs. Returns the root of the tree, or
 * NULL (with an error logged) when a parent is missing or no root exists.
 */
static struct mount_info *mnt_build_ids_tree(struct mount_info *list)
{
	struct mount_info *m, *root = NULL;

	/*
	 * Just resolve the mnt_id:parent_mnt_id relations
	 */
	pr_debug("\tBuilding plain mount tree\n");
	for (m = list; m != NULL; m = m->next) {
		struct mount_info *parent;

		pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id);

		if (m->mnt_id != m->parent_mnt_id)
			parent = __lookup_mnt_id(list, m->parent_mnt_id);
		else /* a circular mount reference. It's rootfs or smth like it.
*/
			parent = NULL;

		if (!parent) {
			/* Only a root mount can be without parent */
			if (!root && m->is_ns_root) {
				root = m;
				continue;
			}

			pr_err("No parent found for mountpoint %d (@%s)\n", m->mnt_id, m->ns_mountpoint);
			return NULL;
		}

		m->parent = parent;
		list_add_tail(&m->siblings, &parent->children);
	}

	if (!root) {
		pr_err("No root found for tree\n");
		return NULL;
	}

	return root;
}

/* Depth of a mount, measured as the number of '/' characters in its mountpoint. */
static unsigned int mnt_depth(struct mount_info *m)
{
	unsigned int depth = 0;
	char *c;

	for (c = m->ns_mountpoint; *c != '\0'; c++)
		if (*c == '/')
			depth++;

	return depth;
}

/* Re-order the children of @parent by descending mountpoint depth. */
static void __mnt_resort_children(struct mount_info *parent)
{
	LIST_HEAD(list);

	/*
	 * Put children mounts in an order they can be (u)mounted
	 * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
	 * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
	 * Otherwise we will not be able to (u)mount them in a sequence.
	 *
	 * Funny, but all we need for this is to sort them in the descending
	 * order of the amount of /-s in a path =)
	 *
	 * Use stupid insertion sort here, we're not expecting mount trees
	 * to contain hundreds (or more) elements.
*/
	pr_info("\tResorting children of %d in mount order\n", parent->mnt_id);
	while (!list_empty(&parent->children)) {
		struct mount_info *m, *p;
		unsigned int depth;

		m = list_first_entry(&parent->children, struct mount_info, siblings);
		list_del(&m->siblings);

		depth = mnt_depth(m);
		/* Stop at the first already-sorted entry that is shallower than m */
		list_for_each_entry(p, &list, siblings)
			if (mnt_depth(p) < depth)
				break;

		/*
		 * Insert m right before p; when the loop ran to completion
		 * &p->siblings is the list head, so m goes to the tail.
		 */
		list_add_tail(&m->siblings, &p->siblings);
	}

	list_splice(&list, &parent->children);
}

static struct mount_info *mnt_subtree_next(struct mount_info *mi, struct mount_info *root);

/*
 * Apply @resort_children to every mount of the tree rooted at @root, in the
 * order produced by mnt_subtree_next().
 */
static void resort_siblings(struct mount_info *root, void (*resort_children)(struct mount_info *))
{
	struct mount_info *mi = root;

	while (1) {
		/*
		 * Explanation: sorting the children of the tree like these is
		 * safe and does not break the tree search in mnt_subtree_next
		 * (DFS-next search), as we sort children before calling next
		 * on parent and thus before DFS-next ever touches them, so
		 * from the perspective of DFS-next all children look like they
		 * are already sorted.
		 */
		resort_children(mi);
		mi = mnt_subtree_next(mi, root);
		if (!mi)
			break;
	}
}

/* Recursively log the mount tree, indenting each level by @off spaces. */
static void mnt_tree_show(struct mount_info *tree, int off)
{
	struct mount_info *m;

	pr_info("%*s[%s](%d->%d)\n", off, "", tree->ns_mountpoint, tree->mnt_id, tree->parent_mnt_id);

	list_for_each_entry(m, &tree->children, siblings)
		mnt_tree_show(m, off + 1);

	pr_info("%*s<--\n", off, "");
}

/* Returns -1 on error, 1 if external mount resolved, 0 otherwise */
static int try_resolve_ext_mount(struct mount_info *info)
{
	char devstr[64];

	/*
	 * Only allow mountpoint-external mounts in root mntns. Their lookup is
	 * based on mountpoint path, but in nested mntns we can have completely
	 * different mount tree and at same mountpoint we can have completely
	 * different mount.
	 */
	if (info->nsid->type == NS_ROOT) {
		char *ext;

		ext = ext_mount_lookup(info->ns_mountpoint + 1 /* trim the . */);
		if (ext) {
			pr_info("Found %s mapping for %s mountpoint\n", ext, info->ns_mountpoint);
			info->external = ext;
			return 1;
		}
	}

	/* Second chance: a "dev[major/minor]" mapping for an unsupported fs root */
	snprintf(devstr, sizeof(devstr), "dev[%d/%d]", kdev_major(info->s_dev), kdev_minor(info->s_dev));

	if (info->fstype->code == FSTYPE__UNSUPPORTED && fsroot_mounted(info)) {
		char *val;

		val = external_lookup_by_key(devstr);
		if (!IS_ERR_OR_NULL(val)) {
			char *source;
			int len;

			pr_info("Found %s dev-mapping for %s(%d) mountpoint\n", val, info->ns_mountpoint, info->mnt_id);
			info->external = EXTERNAL_DEV_MOUNT;

			/* sizeof("dev[]") already accounts for the terminating NUL */
			len = strlen(val) + sizeof("dev[]");
			source = xrealloc(info->source, len);
			if (source == NULL)
				return -1;

			snprintf(source, len, "dev[%s]", val);
			info->fstype = fstype_auto();
			BUG_ON(info->fstype->code != FSTYPE__AUTO);
			info->source = source;
			return 1;
		}
	}

	return 0;
}

/*
 * Find the mount_info from which the respective bind-mount
 * can be created. It can be either an FS-root mount, or the
 * root of the tree (the latter only if its root path is the
 * sub-path of the bind mount's root).
 */
static struct mount_info *find_fsroot_mount_for(struct mount_info *bm)
{
	struct mount_info *sm;

	list_for_each_entry(sm, &bm->mnt_bind, mnt_bind)
		if (fsroot_mounted(sm) || (sm->parent == root_yard_mp && strstartswith(bm->root, sm->root)))
			return sm;

	return NULL;
}

/*
 * True when @m has a sibling for which issubpath(sibling mountpoint,
 * m's mountpoint) holds, or when @m shares its mountpoint with a parent
 * that itself needs remap. Mounts without a parent, or parented directly
 * to the roots yard, never need it.
 */
static bool mnt_needs_remap(struct mount_info *m)
{
	struct mount_info *t;

	if (!m->parent || m->parent == root_yard_mp)
		return false;

	list_for_each_entry(t, &m->parent->children, siblings) {
		if (m == t)
			continue;
		if (issubpath(t->ns_mountpoint, m->ns_mountpoint))
			return true;
	}

	/*
	 * If we are children-overmount and parent is remapped, we should be
	 * remapped too, else fixup_remap_mounts() won't be able to move parent
	 * to its real place, it will move child instead.
*/
	if (!strcmp(m->parent->ns_mountpoint, m->ns_mountpoint))
		return mnt_needs_remap(m->parent);

	return false;
}

/* mnt_bind_pick() predicate: @bind is external and is_sub_path(mi->root, bind->root) holds. */
static bool __mnt_is_external_bind(struct mount_info *mi, struct mount_info *bind)
{
	if (bind->external && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

/*
 * Say mount is external if it was explicitly specified as an external or it
 * can be bind-mounted from such an explicit external mount.
 */
struct mount_info *mnt_get_external_bind(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __mnt_is_external_bind);
}

/* Boolean form of mnt_get_external_bind(). */
bool mnt_is_external_bind(struct mount_info *mi)
{
	return mnt_get_external_bind(mi);
}

/*
 * mnt_bind_pick() predicate: @bind is a nodev external mount with the same
 * master_id as @mi and is_sub_path(mi->root, bind->root) holds.
 */
static bool __can_receive_master_from_external(struct mount_info *mi, struct mount_info *bind)
{
	if (mnt_is_nodev_external(bind) && bind->master_id == mi->master_id && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

static struct mount_info *can_receive_master_from_external(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __can_receive_master_from_external);
}

/* mnt_bind_pick() predicate: like __mnt_is_external_bind() but @bind must also be mounted. */
static bool __has_mounted_external_bind(struct mount_info *mi, struct mount_info *bind)
{
	if (bind->external && bind->mounted && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

bool has_mounted_external_bind(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __has_mounted_external_bind);
}

/* True for the root mount of the mount namespace that root_item lives in. */
bool rst_mnt_is_root(struct mount_info *mi)
{
	return (mi->is_ns_root && mi->nsid->id == root_item->ids->mnt_ns_id);
}

/* mnt_bind_pick() predicate: @bind is that root mount and is_sub_path(mi->root, bind->root) holds. */
static bool __mnt_is_root_bind(struct mount_info *mi, struct mount_info *bind)
{
	if (rst_mnt_is_root(bind) && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

struct mount_info *mnt_get_root_bind(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __mnt_is_root_bind);
}

/* Boolean form of mnt_get_root_bind(). */
bool mnt_is_root_bind(struct mount_info *mi)
{
	return mnt_get_root_bind(mi);
}

/* mnt_bind_pick() predicate: as __mnt_is_root_bind(), plus matching master_id. */
static bool __can_receive_master_from_root(struct mount_info *mi, struct mount_info *bind)
{
	if (rst_mnt_is_root(bind) && bind->master_id == mi->master_id && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

static struct mount_info *can_receive_master_from_root(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __can_receive_master_from_root);
}

/* mnt_bind_pick() predicate: @bind is external but not a dev-external mount. */
static bool __mnt_is_external_bind_nodev(struct mount_info *mi, struct mount_info *bind)
{
	if (bind->external && !mnt_is_dev_external(bind) && is_sub_path(mi->root, bind->root))
		return true;

	return false;
}

struct mount_info *mnt_get_external_bind_nodev(struct mount_info *mi)
{
	return mnt_bind_pick(mi, __mnt_is_external_bind_nodev);
}

/*
 * Having two children with same mountpoint is unsupported. That can happen in
 * case of mount propagation inside of shared mounts, in that case it is hard
 * to find out mount propagation siblings and which of these mounts is above
 * (visible) and which is beneath (hidden). It would've broken mount restore
 * order in can_mount_now and also visibility assumptions in open_mountpoint.
 *
 * Anyway after kernel v4.11 such mounts will be impossible.
 */
static int validate_children_collision(struct mount_info *mnt)
{
	struct mount_info *chi, *chj;

	list_for_each_entry(chi, &mnt->children, siblings) {
		/* Compare chi only with the children that precede it in the list */
		list_for_each_entry(chj, &mnt->children, siblings) {
			if (chj == chi)
				break;
			if (!strcmp(chj->ns_mountpoint, chi->ns_mountpoint)) {
				pr_err("Mount %d has two children with same "
				       "mountpoint: %d %d\n",
				       mnt->mnt_id, chj->mnt_id, chi->mnt_id);
				return -1;
			}
		}
	}
	return 0;
}

/*
 * Check that every mount in @info can be recreated: no colliding children,
 * and each mount is an external/root bind, a supported fs root, a bind of
 * some fs root, or handled by a plugin. Returns 0 if so, -1 otherwise.
 */
int validate_mounts(struct mount_info *info, bool for_dump)
{
	struct mount_info *m, *t;

	for (m = info; m; m = m->next) {
		if (validate_children_collision(m))
			return -1;

		if (mnt_is_external_bind(m))
			continue;

		if (mnt_is_root_bind(m))
			continue;

		/*
		 * Mountpoint can point to / of an FS. In that case this FS
		 * should be of some known type so that we can just mount one.
		 *
		 * Otherwise it's a bindmount mountpoint and we try to find
		 * what fsroot mountpoint it's bound to. If this point is the
		 * root mount, the path to bindmount root should be accessible
		 * from the rootmount path (the strstartswith check in the
		 * else branch below).
*/
		if (fsroot_mounted(m)) {
			if (m->fstype->code == FSTYPE__UNSUPPORTED) {
				pr_err("FS mnt %s dev %#x root %s unsupported id %d\n", m->ns_mountpoint, m->s_dev, m->root,
				       m->mnt_id);
				return -1;
			}
		} else {
			t = find_fsroot_mount_for(m);
			if (!t) {
				int ret;

				/*
				 * No root-mount found for this bind and it's neither
				 * marked nor auto-resolved as external one. So last
				 * chance not to fail is to talk to plugins.
				 */
				if (for_dump) {
					ret = run_plugins(DUMP_EXT_MOUNT, m->ns_mountpoint, m->mnt_id);
					if (ret == 0)
						m->need_plugin = true;
				} else
					/*
					 * Plugin should take care of this one
					 * in restore_ext_mount, or do_bind_mount
					 * will mount it as external
					 */
					ret = m->need_plugin ? 0 : -ENOTSUP;
				if (ret < 0) {
					/* Only -ENOTSUP gets a message here; other errors fail silently */
					if (ret == -ENOTSUP)
						pr_err("%d:%s doesn't have a proper root mount\n", m->mnt_id,
						       m->ns_mountpoint);
					return -1;
				}
			}
		}
	}

	return 0;
}

/*
 * Search @list for the mount to bind @info from: same superblock and
 * issubpath(info->root, it->root). An entry whose sharing matches @info is
 * returned immediately; otherwise the last suitable entry seen is returned
 * (NULL if there was none).
 */
static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info)
{
	struct mount_info *it, *candidate = NULL;

	for (it = list; it; it = it->next) {
		if (!mounts_sb_equal(info, it))
			continue;

		/*
		 * This means we have a situation like:
		 *
		 * root@criu:~# mount --bind bind1/subdir/ bind2
		 * root@criu:~# mount --bind bind1/ bind3
		 *
		 * outside the container, and bind1 is directly bind mounted
		 * inside the container. mounts_equal() considers these mounts
		 * equal for bind purposes, but their roots are different, and
		 * we want to match the one with the right root.
		 */
		if (!issubpath(info->root, it->root))
			continue;

		candidate = it;

		/*
		 * Consider the case of:
		 *
		 * mount /xxx
		 * mount --bind /xxx /yyy
		 * mount --make-shared /yyy
		 * mount --bind /xxx /zzz
		 * mount --make-shared /zzz
		 * bind mount a shared mount into the namespace
		 *
		 * Here, we want to return the /right/ mount, not just a mount
		 * that's equal.
However, in the case: * * bind mount a shared mount into the namespace * inside the namespace, remount MS_PRIVATE * inside the namespace, remount MS_SHARED * * there will be no external mount with matching sharing * because the sharing is only internal; we still want to bind * mount from this mountinfo so we should return it, but we * should make the sharing namespace private after that bind * mount. * * Below are the cases where we found an exact match. */ if (info->flags & MS_SHARED && info->shared_id == it->shared_id) return candidate; if (info->flags & MS_SLAVE && info->master_id == it->shared_id) return candidate; } return candidate; } static struct ns_id *find_ext_ns_id(void) { struct ns_id *ns; for (ns = ns_ids; ns->next; ns = ns->next) if (ns->type == NS_CRIU && ns->nd == &mnt_ns_desc) { if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, true)) break; return ns; } pr_err("Failed to find criu pid's mount ns\n"); return NULL; } static int resolve_external_mounts(struct mount_info *info) { struct ns_id *ext_ns = NULL; struct mount_info *m; if (opts.autodetect_ext_mounts) { ext_ns = find_ext_ns_id(); if (!ext_ns) return -1; } for (m = info; m; m = m->next) { int ret; char *p, *cut_root; struct mount_info *match; if (m->parent == NULL || m->is_ns_root) continue; ret = try_resolve_ext_mount(m); if (ret < 0) return ret; if (ret == 1 || !ext_ns) continue; match = find_best_external_match(ext_ns->mnt.mntinfo_list, m); if (!match) continue; if (m->flags & MS_SHARED) { if (!opts.enable_external_sharing) continue; if (m->shared_id != match->shared_id) m->internal_sharing = true; } if (m->flags & MS_SLAVE) { if (!opts.enable_external_masters) continue; /* * In order to support something like internal slavery, * we need to teach can_mount_now and do_mount_one * about slavery relationships in external mounts. This * seems like an uncommon case, so we punt for not. 
*/
			if (m->master_id != match->shared_id && m->master_id != match->master_id)
				continue;
		}

		cut_root = cut_root_for_bind(m->root, match->root);

		p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root);
		if (!p)
			return -1;

		m->external = AUTODETECTED_MOUNT;

		/*
		 * Put the guessed name in source. It will be picked up
		 * as auto-root in get_mp_root() on restore.
		 */
		xfree(m->source);
		m->source = p;

		pr_info("autodetected external mount %s for %s(%d)\n", p, m->ns_mountpoint, m->mnt_id);
	}

	return 0;
}

/*
 * Write into @buf (of @size bytes) the parent's root path followed by the
 * part of m's mountpoint that extends beyond the parent's mountpoint, with
 * exactly one '/' at the junction.
 *
 * Returns 0 on success, -1 when @m has no usable parent or the result does
 * not fit into @buf.
 */
static int root_path_from_parent(struct mount_info *m, char *buf, int size)
{
	bool head_slash = false, tail_slash = false;
	int p_len, m_len, len;

	if (!m->parent || m->parent == root_yard_mp)
		return -1;

	p_len = strlen(m->parent->ns_mountpoint);
	m_len = strlen(m->ns_mountpoint);

	len = snprintf(buf, size, "%s", m->parent->root);
	if (len >= size)
		return -1;

	BUG_ON(len <= 0);
	if (buf[len - 1] == '/')
		tail_slash = true;

	/* Continue writing right after the parent's root */
	size -= len;
	buf += len;

	len = m_len - p_len;
	BUG_ON(len < 0);
	if (len) {
		if (m->ns_mountpoint[p_len] == '/')
			head_slash = true;

		/*
		 * Add a '/' when neither side has one, and skip one when
		 * both sides have it.
		 */
		len = snprintf(buf, size, "%s%s", (!tail_slash && !head_slash) ? "/" : "",
			       m->ns_mountpoint + p_len + (tail_slash && head_slash));
		if (len >= size)
			return -1;
	}

	return 0;
}

/*
 * Returns 1 when @a and @b belong to the same propagation group, 0 when
 * they don't, -1 when a root path can't be built.
 */
static int same_propagation_group(struct mount_info *a, struct mount_info *b)
{
	char root_path_a[PATH_MAX], root_path_b[PATH_MAX];

	/*
	 * If mounts are in same propagation group:
	 * 1) Their parents should be different
	 * 2) Their parents should be together in same shared group
	 */
	if (!a->parent || !b->parent || a->parent == b->parent || a->parent->shared_id != b->parent->shared_id)
		return 0;

	if (root_path_from_parent(a, root_path_a, PATH_MAX)) {
		pr_err("Failed to get root path for mount %d\n", a->mnt_id);
		return -1;
	}

	if (root_path_from_parent(b, root_path_b, PATH_MAX)) {
		pr_err("Failed to get root path for mount %d\n", b->mnt_id);
		return -1;
	}

	/*
	 * 3) Their mountpoints relative to the root of the superblock of their
	 * parent's share should be equal
	 */
	if (!strcmp(root_path_a, root_path_b))
		return 1;

	return 0;
}

/*
 * Note: Only valid if called consecutively on all mounts in mntinfo list.
 *
 * Note: We may want to iterate over all bindmounts of some mount, and we would
 * use ->mnt_bind list for this, but iterating over ->mnt_bind list is
 * obviously meaningless before search_bindmounts had actually put bindmounts
 * in it. That's why we have ->mnt_bind_is_populated to protect from misuse of
 * ->mnt_bind. (As ->mnt_bind list can validly be empty when mount has no
 * bindmounts we need separate field to indicate population.)
*/
static void __search_bindmounts(struct mount_info *mi)
{
	struct mount_info *t;

	/* Already linked into an earlier mount's ->mnt_bind list */
	if (mi->mnt_bind_is_populated)
		return;

	/* Only later list entries are examined */
	for (t = mi->next; t; t = t->next) {
		if (mounts_sb_equal(mi, t)) {
			list_add(&t->mnt_bind, &mi->mnt_bind);
			t->mnt_bind_is_populated = true;
			pr_debug("\t"
				 "The mount %3d is bind for %3d (@%s -> @%s)\n",
				 t->mnt_id, mi->mnt_id, t->ns_mountpoint, mi->ns_mountpoint);
		}
	}

	mi->mnt_bind_is_populated = true;
}

/* Populate ->mnt_bind for every mount in the global mntinfo list. */
static void search_bindmounts(void)
{
	struct mount_info *mi;

	for (mi = mntinfo; mi; mi = mi->next)
		__search_bindmounts(mi);
}

/*
 * Return the first mount accepted by @pick, trying @mi itself first and then
 * every entry on mi's ->mnt_bind list; NULL when none is accepted.
 * @pick is always called with @mi as its first argument.
 */
struct mount_info *mnt_bind_pick(struct mount_info *mi, bool (*pick)(struct mount_info *mi, struct mount_info *bind))
{
	struct mount_info *bind;

	BUG_ON(!mi);

	if (pick(mi, mi))
		return mi;

	/*
	 * Shouldn't use mnt_bind list before it was populated in search_bindmounts
	 */
	BUG_ON(!mi->mnt_bind_is_populated);

	list_for_each_entry(bind, &mi->mnt_bind, mnt_bind)
		if (pick(mi, bind))
			return bind;

	return NULL;
}

/*
 * Link mounts of @info into shared groups, master/slave lists and
 * propagation groups. Returns 0 on success, -1 on error.
 */
static int resolve_shared_mounts(struct mount_info *info)
{
	struct mount_info *m, *t;

	/*
	 * If we have a shared mounts, both master
	 * slave targets are to be present in mount
	 * list, otherwise we can't be sure if we can
	 * recreate the scheme later on restore.
*/
	for (m = info; m; m = m->next) {
		bool need_share, need_master;

		/* Skip the share search when m is already linked into a group */
		need_share = m->shared_id && list_empty(&m->mnt_share);
		need_master = m->master_id;

		pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", m->mnt_id, m->shared_id,
			 m->master_id, m->ns_mountpoint);

		for (t = info; t && (need_share || need_master); t = t->next) {
			if (t == m)
				continue;
			/* The first mount whose shared_id is our master_id becomes the master */
			if (need_master && t->shared_id == m->master_id) {
				pr_debug("\t"
					 "The mount %3d is slave for %3d (@%s -> @%s)\n",
					 m->mnt_id, t->mnt_id, m->ns_mountpoint, t->ns_mountpoint);
				list_add(&m->mnt_slave, &t->mnt_slave_list);
				m->mnt_master = t;
				need_master = false;
			}

			/* Collect all mounts from this group */
			if (need_share && t->shared_id == m->shared_id) {
				pr_debug("\t"
					 "Mount %3d is shared with %3d group %3d (@%s -> @%s)\n",
					 m->mnt_id, t->mnt_id, m->shared_id, t->ns_mountpoint, m->ns_mountpoint);
				list_add(&t->mnt_share, &m->mnt_share);
			}
		}

		/*
		 * External master detected
		 */
		if (need_master) {
			if ((t = can_receive_master_from_external(m)) || (t = can_receive_master_from_root(m))) {
				pr_debug("Detected external slavery for %d via %d\n", m->mnt_id, t->mnt_id);
				if (m != t)
					list_add(&m->mnt_ext_slave, &t->mnt_ext_slave);
				continue;
			}

			pr_err("Mount %d %s (master_id: %d shared_id: %d) "
			       "has unreachable sharing. Try --enable-external-masters.\n",
			       m->mnt_id, m->ns_mountpoint, m->master_id, m->shared_id);
			return -1;
		}
	}

	/* Search propagation groups */
	for (m = info; m; m = m->next) {
		struct mount_info *sparent;

		/* Already placed into some propagation group */
		if (!list_empty(&m->mnt_propagate))
			continue;

		if (!m->parent || !m->parent->shared_id)
			continue;

		list_for_each_entry(sparent, &m->parent->mnt_share, mnt_share) {
			struct mount_info *schild;

			list_for_each_entry(schild, &sparent->children, siblings) {
				int ret;

				ret = same_propagation_group(m, schild);
				if (ret < 0)
					return -1;
				else if (ret) {
					BUG_ON(!mounts_equal(m, schild));
					pr_debug("\tMount %3d is in same propagation group with %3d (@%s ~ @%s)\n",
						 m->mnt_id, schild->mnt_id, m->ns_mountpoint, schild->ns_mountpoint);
					list_add(&schild->mnt_propagate, &m->mnt_propagate);
				}
			}
		}
	}

	return 0;
}

/*
 * Build the mount tree for @list, sort the children of every mount into
 * (u)mount order and log the result. Returns the tree root or NULL.
 */
static struct mount_info *mnt_build_tree(struct mount_info *list)
{
	struct mount_info *tree;

	/*
	 * Organize them in a sequence in which they can be mounted/umounted.
	 */

	pr_info("Building mountpoints tree\n");
	tree = mnt_build_ids_tree(list);
	if (!tree)
		return NULL;

	resort_siblings(tree, __mnt_resort_children);
	pr_info("Done:\n");
	mnt_tree_show(tree, 0);
	return tree;
}

/*
 * Returns 1 when the mountpoint of @pm is a directory, 0 when it is not or
 * when it can't be stat-ed (a warning is logged in that case).
 */
int mnt_is_dir(struct mount_info *pm)
{
	int mntns_root;
	struct stat st;

	mntns_root = mntns_get_root_fd(pm->nsid);
	if (mntns_root < 0) {
		pr_warn("Can't get root fd of mntns for %d: %s\n", pm->mnt_id, strerror(errno));
		return 0;
	}

	if (fstatat(mntns_root, pm->ns_mountpoint, &st, 0)) {
		pr_warn("Can't fstatat on %s: %s\n", pm->ns_mountpoint, strerror(errno));
		return 0;
	}

	if (S_ISDIR(st.st_mode))
		return 1;
	return 0;
}

/*
 * Verify that @mnt_fd really refers to the filesystem described by @pm by
 * comparing its device number with pm->s_dev_rt. @parse_mountinfo is passed
 * through to get_sdev_from_fd() for the btrfs fallback.
 *
 * Returns 0 on match, -1 otherwise. The descriptor is not closed here.
 */
int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo)
{
	struct stat st;
	unsigned int dev;
	int ret;

	ret = fstat(mnt_fd, &st);
	if (ret < 0) {
		pr_perror("fstat(%s) failed", pm->ns_mountpoint);
		return -1;
	}

	if (pm->s_dev_rt == MOUNT_INVALID_DEV) {
		pr_err("Resolving over invalid device for %#x %s %s\n", pm->s_dev, pm->fstype->name, pm->ns_mountpoint);
		return -1;
	}

	dev = MKKDEV(major(st.st_dev), minor(st.st_dev));
	/*
	 * Always check for @s_dev_rt here, because the @s_dev
	 * from the image (in case of restore) has all rights
	 * to not match the device (say it's migrated and kernel
	 * allocates new device ID).
	 */
	if (dev != pm->s_dev_rt) {
		/*
		 * For btrfs device numbers in stat and mountinfo can be
		 * different, fallback to get_sdev_from_fd to get right dev.
		 */
		if (!strcmp(pm->fstype->name, "btrfs") && !get_sdev_from_fd(mnt_fd, &dev, parse_mountinfo) &&
		    dev == pm->s_dev_rt)
			return 0;
		pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev,
		       pm->fstype->name, pm->ns_mountpoint);
		return -1;
	}

	return 0;
}

/* __check_mountpoint_fd() with parse_mountinfo set to false. */
int check_mountpoint_fd(struct mount_info *pm, int mnt_fd)
{
	return __check_mountpoint_fd(pm, mnt_fd, false);
}

/*
 * Open the mountpoint of @pm relative to the root of its mount namespace
 * and check that the opened descriptor belongs to the expected filesystem.
 *
 * Returns the descriptor on success. On any failure -1 is returned and the
 * descriptor, if it was opened, is closed.
 */
int __open_mountpoint(struct mount_info *pm)
{
	int mntns_root, mnt_fd;

	mntns_root = mntns_get_root_fd(pm->nsid);
	if (mntns_root < 0)
		return -1;

	mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY);
	if (mnt_fd < 0) {
		pr_perror("Can't open %s", pm->ns_mountpoint);
		return -1;
	}

	if (check_mountpoint_fd(pm, mnt_fd)) {
		close(mnt_fd);
		return -1;
	}

	return mnt_fd;
}

/*
 * Open a directory mount with the given @s_dev.
 * Returns a descriptor, -ENOENT when no such mount is known, or -1 when
 * opening it fails.
 */
int open_mount(unsigned int s_dev)
{
	struct mount_info *m;

	m = lookup_mnt_sdev(s_dev);
	if (!m)
		return -ENOENT;

	return __open_mountpoint(m);
}

/*
 * Bind-mount a mount point in a temporary place without children.
 *
 * @mnt_path_tmp and @mnt_path_root are mkdtemp() templates; the second is
 * tried only when the first fails with ENOENT. Returns the path of the
 * created directory, or NULL on failure.
 */
static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_path_root)
{
	char *mnt_path;

	mnt_path = mkdtemp(mnt_path_tmp);
	if (mnt_path == NULL && errno == ENOENT)
		mnt_path = mkdtemp(mnt_path_root);
	if (mnt_path == NULL) {
		pr_warn("Can't create a temporary directory: %s\n", strerror(errno));
		return NULL;
	}

	/* No MS_REC: children mounts are intentionally not carried over */
	if (mount(mi->ns_mountpoint, mnt_path, NULL, MS_BIND, NULL)) {
		pr_perror("Can't bind-mount %d:%s to %s", mi->mnt_id, mi->ns_mountpoint, mnt_path);
		rmdir(mnt_path);
		return NULL;
	}
return mnt_path; } static int get_clean_fd(struct mount_info *mi) { char *mnt_path = NULL; char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX"; char mnt_path_root[] = "/cr-tmpfs.XXXXXX"; int fd; mnt_path = get_clean_mnt(mi, mnt_path_tmp, mnt_path_root); if (!mnt_path) return -1; fd = open(mnt_path, O_RDONLY | O_DIRECTORY, 0); if (fd < 0) { pr_perror("Can't open directory %s", mnt_path); } else { if (__check_mountpoint_fd(mi, fd, true)) goto err_close; } if (umount2(mnt_path, MNT_DETACH)) { pr_perror("Can't detach mount %s", mnt_path); goto err_close; } if (rmdir(mnt_path)) { pr_perror("Can't remove tmp dir %s", mnt_path); goto err_close; } return fd; err_close: close_safe(&fd); return -1; } /* * Our children mount can have same mountpoint as it's parent, * call these - children-overmount. * Sibling mount's mountpoint can be a subpath of our mountpoint * call these - sibling-overmount. * In both above cases our mountpoint is not visible from the * root of our mount namespace as it is covered by other mount. * mnt_is_overmounted() checks if mount is not visible. */ bool mnt_is_overmounted(struct mount_info *mi) { struct mount_info *t, *c, *m = mi; if (mi->is_overmounted != -1) goto exit; mi->is_overmounted = 0; while (m->parent) { if (mi->parent->is_overmounted == 1) { mi->is_overmounted = 1; goto exit; } /* Check there is no sibling-overmount */ list_for_each_entry(t, &m->parent->children, siblings) { if (m == t) continue; if (issubpath(m->ns_mountpoint, t->ns_mountpoint)) { mi->is_overmounted = 1; goto exit; } } /* * If parent has sibling-overmount we are not visible too, * note that children-overmounts for parent are already * checked as our sibling overmounts. 
*/
		m = m->parent;
	}

	/* Check there is no children-overmount */
	list_for_each_entry(c, &mi->children, siblings)
		if (!strcmp(c->ns_mountpoint, mi->ns_mountpoint)) {
			mi->is_overmounted = 1;
			goto exit;
		}

exit:
	return mi->is_overmounted;
}

/* mnt_tree_for_each() callback: compute and cache ->is_overmounted for @mi. */
static int __set_is_overmounted(struct mount_info *mi)
{
	/* coverity[check_return] */
	mnt_is_overmounted(mi);
	return 0;
}

/*
 * mnt_is_overmounted is intended to detect overmounts in original dumped mount
 * tree, so we pre-save it just after loading mount tree from images, so that
 * it does not mess up with any helper mounts or tree changes we can do.
 */
static void prepare_is_overmounted(void)
{
	struct ns_id *nsid;

	for (nsid = ns_ids; nsid; nsid = nsid->next) {
		struct mount_info *root;

		/* Only mount namespaces carry a mount tree */
		if (nsid->nd != &mnt_ns_desc)
			continue;

		root = nsid->mnt.mntinfo_tree;
		BUG_ON(root->parent);
		mnt_tree_for_each(root, __set_is_overmounted);
	}
}

/*
 * __umount_children_overmounts() assumes that the mountpoint and
 * its ancestors have no sibling-overmounts, so we can see children
 * of these mount. Unmount our children-overmounts now.
 *
 * Returns 0 on success, -1 when an umount fails.
 */
static int __umount_children_overmounts(struct mount_info *mi)
{
	struct mount_info *c, *m = mi;

	/*
	 * Our children-overmount can itself have children-overmount
	 * which covers it, so find deepest children-overmount which
	 * is visible for us now.
	 */
again:
	list_for_each_entry(c, &m->children, siblings) {
		if (!strcmp(c->ns_mountpoint, m->ns_mountpoint)) {
			m = c;
			goto again;
		}
	}

	/* Unmount children-overmounts in the order of visibility */
	while (m != mi) {
		if (umount2(m->ns_mountpoint, MNT_DETACH)) {
			pr_perror("Unable to umount child-overmount %s", m->ns_mountpoint);
			return -1;
		}
		BUG_ON(!m->parent);
		m = m->parent;
	}

	return 0;
}

/* Makes the mountpoint visible except for children-overmounts.
*/
static int __umount_overmounts(struct mount_info *m)
{
	struct mount_info *t, *ovm;
	int ovm_len, ovm_len_min = 0;

	/* Root mount has no sibling-overmounts */
	if (!m->parent)
		return 0;

	/*
	 * If parent is sibling-overmounted we are not visible
	 * too, so first try to unmount overmounts for parent.
	 */
	if (__umount_overmounts(m->parent))
		return -1;

	/* Unmount sibling-overmounts in visibility order */
next:
	/*
	 * Each pass picks the covering sibling with the shortest mountpoint
	 * that is still longer than the one handled on the previous pass
	 * (ovm_len_min).
	 */
	ovm = NULL;
	ovm_len = strlen(m->ns_mountpoint) + 1;
	list_for_each_entry(t, &m->parent->children, siblings) {
		if (m == t)
			continue;
		if (issubpath(m->ns_mountpoint, t->ns_mountpoint)) {
			int t_len = strlen(t->ns_mountpoint);

			if (t_len < ovm_len && t_len > ovm_len_min) {
				ovm = t;
				ovm_len = t_len;
			}
		}
	}

	if (ovm) {
		ovm_len_min = ovm_len;

		/* Our sibling-overmount can have children-overmount covering it */
		if (__umount_children_overmounts(ovm))
			return -1;

		if (umount2(ovm->ns_mountpoint, MNT_DETACH)) {
			pr_perror("Unable to umount %s", ovm->ns_mountpoint + 1);
			return -1;
		}

		goto next;
	}

	return 0;
}

/* Make our mountpoint fully visible */
static int umount_overmounts(struct mount_info *m)
{
	/* First the sibling-overmounts (ours and our ancestors') ... */
	if (__umount_overmounts(m))
		return -1;

	/* ... then the children-overmounts sitting right on top of us */
	if (__umount_children_overmounts(m))
		return -1;

	return 0;
}

/* Argument block handed to ns_open_mountpoint() */
struct clone_arg {
	struct mount_info *mi; /* mount to open */
	int *fd; /* where the opened descriptor is stored */
};

/*
 * Get access to the mountpoint covered by overmounts
 * and open it's cleaned copy (without children mounts).
 */
int ns_open_mountpoint(void *arg)
{
	struct clone_arg *ca = arg;
	struct mount_info *mi = ca->mi;
	int *fd = ca->fd;

	/*
	 * We should enter user namespace owning mount namespace of our mount
	 * before creating helper mount namespace. Else all mounts in helper
	 * mount namespace will be locked (MNT_LOCKED) and we won't be able to
	 * unmount them (see CL_UNPRIVILEGED in sys_umount(), clone_mnt() and
	 * copy_mnt_ns() in linux kernel code).
*/ if ((root_ns_mask & CLONE_NEWUSER) && switch_ns(root_item->pid->real, &user_ns_desc, NULL) < 0) goto err; /* * Create a helper mount namespace in which we can safely do unmounts * without breaking dumping process' environment. */ if (unshare(CLONE_NEWNS)) { pr_perror("Unable to unshare a mount namespace"); goto err; } /* Remount all mounts as private to disable propagation */ if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { pr_perror("Unable to remount"); goto err; } if (umount_overmounts(mi)) goto err; /* * Save fd which we opened for parent due to CLONE_FILES flag * * Mount can still have children in it, but we don't need to clean it * explicitly as when last process exits mntns all mounts in it are * cleaned from their children, and we are exactly the last process. */ *fd = open(mi->ns_mountpoint, O_DIRECTORY | O_RDONLY); if (*fd < 0) { pr_perror("Unable to open %s(%d)", mi->ns_mountpoint, mi->mnt_id); goto err; } if (__check_mountpoint_fd(mi, *fd, true)) { close(*fd); goto err; } return 0; err: return 1; } int open_mountpoint(struct mount_info *pm) { int fd = -1, cwd_fd, ns_old = -1; /* No overmounts and children - the entire mount is visible */ if (list_empty(&pm->children) && !mnt_is_overmounted(pm)) return __open_mountpoint(pm); pr_info("Mount is not fully visible %s(%d)\n", pm->ns_mountpoint, pm->mnt_id); /* * We do two things below: * a) If mount has children mounts in it which partially cover it's * content, to get access to the content we create a "private" copy of * such a mount, bind-mounting mount w/o MS_REC in a temporary place. * b) If mount is overmounted we create a private copy of it's mount * namespace so that we can safely get rid of overmounts and get an * access to the mount. * In both cases we can't do the thing from criu's mount namespace, so * we need to switch to mount's mount namespace, and later switch back. 
*/
	if (switch_mnt_ns(pm->nsid->ns_pid, &ns_old, &cwd_fd) < 0)
		goto err;

	/* Case a): only children cover us, a clean temporary copy is enough */
	if (!mnt_is_overmounted(pm)) {
		pr_info("\tmount has children %s(%d)\n", pm->ns_mountpoint, pm->mnt_id);
		fd = get_clean_fd(pm);
	}

	/*
	 * Mount is overmounted or probably we can't create a temporary
	 * directory for a cleaned mount
	 */
	if (fd < 0) {
		int pid, status;
		struct clone_arg ca = { .mi = pm, .fd = &fd };

		pr_info("\tmount is overmounted or has children %s(%d)\n", pm->ns_mountpoint, pm->mnt_id);

		/*
		 * We are overmounted - not accessible in a regular way. We
		 * need to clone "private" copy of mount's monut namespace and
		 * unmount all covering overmounts in it. We also need to enter
		 * user namespace owning these mount namespace just before that
		 * (see explanation in ns_open_mountpoint). Thus we also have
		 * to create helper process here as entering user namespace is
		 * irreversible operation.
		 */
		pid = clone_noasan(ns_open_mountpoint, CLONE_VFORK | CLONE_VM | CLONE_FILES | CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, &ca);
		if (pid == -1) {
			pr_perror("Can't clone helper process");
			goto err;
		}

		/* Reset errno so the message below reports only waitpid()'s error */
		errno = 0;
		if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status) || WEXITSTATUS(status)) {
			pr_err("Can't wait or bad status: errno=%d, status=%d\n", errno, status);
			goto err;
		}
	}

	if (restore_mnt_ns(ns_old, &cwd_fd)) {
		/* Do not try to restore the namespace again on the error path */
		ns_old = -1;
		goto err;
	}

	return fd < 0 ?
__open_mountpoint(pm) : fd;
err:
	if (ns_old >= 0)
		/* coverity[check_return] */
		restore_mnt_ns(ns_old, &cwd_fd);
	close_safe(&fd);
	return -1;
}

/*
 * Helper for getting a path to mount's plain mountpoint
 *
 * Builds "<mnt_roots>/mnt-<name>" when @name is given, otherwise
 * "<mnt_roots>/mnt-<mnt_id>" with the id zero-padded to 10 digits.
 * Returns an xstrdup()'ed copy of the path, or NULL when mnt_roots is not
 * set or the path does not fit into PATH_MAX.
 */
char *get_plain_mountpoint(int mnt_id, char *name)
{
	static char tmp[PATH_MAX];
	int ret;

	if (!mnt_roots)
		return NULL;

	if (name)
		ret = snprintf(tmp, sizeof(tmp), "%s/mnt-%s", mnt_roots, name);
	else
		ret = snprintf(tmp, sizeof(tmp), "%s/mnt-%010d", mnt_roots, mnt_id);

	/* Truncated output */
	if (ret >= sizeof(tmp))
		return NULL;

	return xstrdup(tmp);
}

/*
 * Allocate a helper mount_info (mnt_id == HELPER_MNT_ID) of type @fsname
 * located at @path under @root's mountpoint and link it into the tree under
 * the deepest mount whose service mountpoint prefixes it.
 * Returns the new entry, or NULL on failure.
 */
struct mount_info __maybe_unused *add_cr_time_mount(struct mount_info *root, char *fsname, const char *path,
						    unsigned int s_dev, bool rst)
{
	struct mount_info *mi, *t, *parent;
	bool add_slash = false;
	int len;

	mi = mnt_entry_alloc(rst);
	if (!mi)
		return NULL;

	len = strlen(root->mountpoint);
	/* It may be "./" or "./path/to/dir" */
	if (root->mountpoint[len - 1] != '/') {
		add_slash = true;
		len++;
	}

	mi->mountpoint = xmalloc(len + strlen(path) + 1);
	if (!mi->mountpoint)
		goto err;
	/* When !rst, ns_mountpoint aliases the same buffer as mountpoint */
	if (!rst)
		mi->ns_mountpoint = mi->mountpoint;
	if (!add_slash)
		sprintf(mi->mountpoint, "%s%s", root->mountpoint, path);
	else
		sprintf(mi->mountpoint, "%s/%s", root->mountpoint, path);
	if (rst) {
		mi->plain_mountpoint = get_plain_mountpoint(-1, "crtime");
		if (!mi->plain_mountpoint)
			goto err;
	}
	mi->mnt_id = HELPER_MNT_ID;
	mi->is_dir = true;
	mi->flags = mi->sb_flags = 0;
	mi->root = xstrdup("/");
	mi->fsname = xstrdup(fsname);
	mi->source = xstrdup(fsname);
	mi->options = xstrdup("");
	if (!mi->root || !mi->fsname || !mi->source || !mi->options)
		goto err;
	mi->fstype = find_fstype_by_name(fsname);

	mi->s_dev = mi->s_dev_rt = s_dev;

	/*
	 * Descend from root: whenever a child's service mountpoint is a
	 * prefix of ours, continue the search below that child. Stop once a
	 * full pass over the children finds no match (t wrapped to the head).
	 */
	parent = root;
	while (1) {
		list_for_each_entry(t, &parent->children, siblings) {
			if (strstartswith(service_mountpoint(mi), service_mountpoint(t))) {
				parent = t;
				break;
			}
		}
		if (&t->siblings == &parent->children)
			break;
	}

	mi->mnt_bind_is_populated = true;
	mi->is_overmounted = false;
	mi->nsid = parent->nsid;
	mi->parent = parent;
	mi->parent_mnt_id = parent->mnt_id;
list_add(&mi->siblings, &parent->children);

	pr_info("Add cr-time mountpoint %s with parent %s(%u)\n", service_mountpoint(mi), service_mountpoint(parent),
		parent->mnt_id);
	return mi;

err:
	mnt_entry_free(mi);
	return NULL;
}

/* Returns 1 in case of success, -errno in case of mount fail, and 0 on other errors */
static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source,
					      const char *target, const char *type)
{
	int mnt_fd, cwd_fd, ret, exit_code = 0;
	struct stat st;

	/* The mount is performed inside the mount namespace of @ns */
	ret = switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd);
	if (ret < 0) {
		pr_err("Can't switch mnt_ns\n");
		goto out;
	}

	ret = mount(source, target, type, 0, NULL);
	if (ret < 0) {
		pr_perror("Unable to mount %s %s", source, target);
		exit_code = -errno;
		goto restore_ns;
	} else {
		/* Report the device number of the freshly mounted fs via *s_dev */
		if (stat(target, &st) < 0) {
			pr_perror("Can't stat %s", target);
			exit_code = 0;
		} else {
			*s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev));
			exit_code = 1;
		}
	}

restore_ns:
	ret = restore_mnt_ns(mnt_fd, &cwd_fd);
out:
	/* A failed namespace switch or restore overrides any other result */
	return ret < 0 ? 0 : exit_code;
}

/*
 * Dump the filesystem content behind @mi through fstype->dump(), trying
 * every member of its mnt_bind ring until one can be dumped.
 * Returns 0 on success or when nothing has to be dumped, negative on error.
 */
static int dump_one_fs(struct mount_info *mi)
{
	struct mount_info *pm = mi;
	struct mount_info *t;
	bool first = true;

	if (mnt_is_root_bind(mi) || mi->need_plugin || mnt_is_external_bind(mi) || !mi->fstype->dump)
		return 0;

	/* mnt_bind is a cycled list, so list_for_each can't be used here.
*/
	for (; &pm->mnt_bind != &mi->mnt_bind || first; pm = list_entry(pm->mnt_bind.next, typeof(*pm), mnt_bind)) {
		int ret;

		first = false;

		/* Only a mount whose fs root is mounted is a dump candidate */
		if (!fsroot_mounted(pm))
			continue;

		ret = pm->fstype->dump(pm);
		if (ret == MNT_UNREACHABLE)
			continue;
		if (ret < 0)
			return ret;

		/* One dump covers the whole bind ring */
		pm->dumped = true;
		list_for_each_entry(t, &pm->mnt_bind, mnt_bind)
			t->dumped = true;
		return 0;
	}

	pr_err("Unable to dump a file system for %d:%s\n", mi->mnt_id, mi->ns_mountpoint);
	return -1;
}

/*
 * Dump the filesystem of @pm if that has not been done yet and write its
 * MntEntry into @img. Helper mounts (HELPER_MNT_ID) are not written.
 * Returns 0 on success, -1 on error.
 */
static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img)
{
	MntEntry me = MNT_ENTRY__INIT;

	pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev, pm->root, pm->ns_mountpoint);

	me.fstype = pm->fstype->code;

	if (me.fstype == FSTYPE__AUTO)
		me.fsname = pm->fsname;

	if (!pm->dumped && dump_one_fs(pm))
		return -1;

	if (!mnt_is_external_bind(pm) && !fsroot_mounted(pm) && pm->fstype->check_bindmount &&
	    pm->fstype->check_bindmount(pm))
		return -1;

	if (pm->mnt_id == HELPER_MNT_ID) {
		pr_info("Skip dumping helper mountpoint: %s\n", pm->ns_mountpoint);
		return 0;
	}

	me.mnt_id = pm->mnt_id;
	me.root_dev = pm->s_dev;
	me.parent_mnt_id = pm->parent_mnt_id;
	me.flags = pm->flags;
	me.sb_flags = pm->sb_flags;
	me.has_sb_flags = true;
	/* The image stores the mountpoint without its first character */
	me.mountpoint = pm->ns_mountpoint + 1;
	me.source = pm->source;
	me.options = pm->options;
	me.shared_id = pm->shared_id;
	me.has_shared_id = true;
	me.master_id = pm->master_id;
	me.has_master_id = true;
	if (pm->need_plugin) {
		me.has_with_plugin = true;
		me.with_plugin = true;
	}
	if (pm->deleted) {
		me.has_deleted = true;
		me.deleted = true;
	}

	if (pm->internal_sharing) {
		me.has_internal_sharing = true;
		me.internal_sharing = true;
	}

	if (pm->external)
		/*
		 * For external mount points dump the mapping's
		 * value, see collect_mnt_from_image -> get_mp_root
		 * for reverse mapping details.
*/
		me.ext_key = pm->external;
	me.root = pm->root;

	if (pb_write_one(img, &me, PB_MNT))
		return -1;

	return 0;
}

/* Free every entry of the singly linked (->next) list starting at @pms */
static void free_mntinfo(struct mount_info *pms)
{
	while (pms) {
		struct mount_info *pm;

		/* Remember the successor before the current entry is freed */
		pm = pms->next;
		mnt_entry_free(pms);
		pms = pm;
	}
}

/*
 * Parse the mountinfo of @ns and build its mount tree; both the flat list
 * and the tree are stored in ns->mnt.
 * Returns the list head, or NULL on error (the parsed list is freed if the
 * tree can't be built).
 */
struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
{
	struct mount_info *pm;

	pm = parse_mountinfo(ns->ns_pid, ns, for_dump);
	if (!pm) {
		pr_err("Can't parse %d's mountinfo\n", ns->ns_pid);
		return NULL;
	}

	ns->mnt.mntinfo_tree = mnt_build_tree(pm);
	if (ns->mnt.mntinfo_tree == NULL)
		goto err;

	ns->mnt.mntinfo_list = pm;
	return pm;
err:
	free_mntinfo(pm);
	return NULL;
}

/*
 * Write all mounts from @pms that belong to @ns into the CR_FD_MNTS image
 * of this namespace. Returns 0 on success, -1 on error.
 */
static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms)
{
	struct mount_info *pm;
	int ret = -1;
	struct cr_img *img;
	unsigned int ns_id = ns->id;

	pr_info("Dumping mountpoints\n");
	img = open_image(CR_FD_MNTS, O_DUMP, ns_id);
	if (!img)
		goto err;

	/* Stop at the first entry that belongs to another namespace */
	for (pm = pms; pm && pm->nsid == ns; pm = pm->next)
		if (dump_one_mountpoint(pm, img))
			goto err_i;

	ret = 0;
err_i:
	close_image(img);
err:
	return ret;
}

/*
 * _fn_f  - pre-order traversal function
 * _fn_r  - post-order traversal function
 * _plist - a postpone list. _el is added to this list, if _fn_f returns
 *	    a positive value, and all lower elements are not enumerated.
*/
#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs)                                       \
	do {                                                                                      \
		struct mount_info *_mi = _r;                                                      \
                                                                                                  \
		while (1) {                                                                       \
			int ret;                                                                  \
                                                                                                  \
			list_del_init(&_mi->postpone);                                            \
                                                                                                  \
			ret = _fn_f(_mi);                                                         \
			if (ret < 0)                                                              \
				return -1;                                                        \
			else if (ret > 0) {                                                       \
				list_add_tail(&_mi->postpone, _plist);                            \
				goto up;                                                          \
			}                                                                         \
                                                                                                  \
			_prgs++;                                                                  \
                                                                                                  \
			if (!list_empty(&_mi->children)) {                                        \
				_mi = list_entry(_mi->children._el, struct mount_info, siblings); \
				continue;                                                         \
			}                                                                         \
		up:                                                                               \
			if (_fn_r(_mi))                                                           \
				return -1;                                                        \
			if (_mi == _r)                                                            \
				break;                                                            \
			if (_mi->siblings._el == &_mi->parent->children) {                        \
				_mi = _mi->parent;                                                \
				goto up;                                                          \
			}                                                                         \
			_mi = list_entry(_mi->siblings._el, struct mount_info, siblings);         \
		}                                                                                 \
	} while (0)

/*
 * Placeholder for an unused MNT_TREE_WALK callback: "MNT_WALK_NONE(x)"
 * expands to "0 && (x)", i.e. a constant 0 that never evaluates x.
 */
#define MNT_WALK_NONE 0 &&

/*
 * Call @fn on every mount of the tree rooted at @start, parents before
 * children. A positive return from @fn postpones that mount together with
 * its subtree; postponed mounts are retried in further passes until all are
 * handled. Returns 0 on success, -1 if @fn fails or a pass makes no
 * progress.
 */
int mnt_tree_for_each(struct mount_info *start, int (*fn)(struct mount_info *))
{
	struct mount_info *tmp;
	LIST_HEAD(postpone);
	LIST_HEAD(postpone2);
	int progress;

	pr_debug("Start with %d:%s\n", start->mnt_id, start->ns_mountpoint);
	list_add(&start->postpone, &postpone);

again:
	progress = 0;

	list_for_each_entry_safe(start, tmp, &postpone, postpone)
		MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress);

	if (!progress) {
		struct mount_info *m;

		pr_err("A few mount points can't be mounted\n");
		list_for_each_entry(m, &postpone2, postpone) {
			pr_err("%d:%d %s %s %s\n", m->mnt_id, m->parent_mnt_id, m->root, m->ns_mountpoint, m->source);
		}
		return -1;
	}

	/* Whatever got postponed in this pass is the input of the next one */
	list_splice_init(&postpone2, &postpone);

	if (!list_empty(&postpone))
		goto again;

	return 0;
}

/*
 * Call @fn on every mount of the tree rooted at @m in post-order (children
 * before their parent), walking sibling lists backwards.
 * Returns 0 on success, -1 as soon as @fn returns non-zero.
 */
static int mnt_tree_for_each_reverse(struct mount_info *m, int (*fn)(struct mount_info *))
{
	int progress = 0;

	MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *)NULL, progress);

	(void)progress; // Suppress -Wused-but-unset-variable for clang>=15
	return 0;
}

/*
 * Pick the source to mount @mi from. Returns mi->source, a value found
 * through external_lookup_by_key(), or NULL when no device can be found.
 */
char *resolve_source(struct mount_info *mi)
{
	if (kdev_major(mi->s_dev) == 0)
		/*
		 * Anonymous block device. Kernel creates them for
		 * diskless mounts.
*/
		return mi->source;

	/*
	 * FSTYPE__AUTO check is a fallback for old images which do not have
	 * explicit EXTERNAL_DEV_MOUNT mark, but still have "dev[key]" in source.
	 */
	if (mnt_is_dev_external(mi) || mi->fstype->code == FSTYPE__AUTO) {
		struct stat st;
		char *val;

		val = external_lookup_by_key(mi->source);
		if (!IS_ERR_OR_NULL(val))
			return val;

		/* Accept the source only if it is a block device with the dumped major:minor */
		if (!stat(mi->source, &st) && S_ISBLK(st.st_mode) && major(st.st_rdev) == kdev_major(mi->s_dev) &&
		    minor(st.st_rdev) == kdev_minor(mi->s_dev))
			return mi->source;
	}

	pr_err("No device for %s(%d) mount\n", mi->ns_mountpoint, mi->mnt_id);
	return NULL;
}

/*
 * Apply propagation settings to the already mounted @mi: each of @private,
 * @slave and @shared that is set triggers the matching mount(2) call, in
 * that order. MS_UNBINDABLE mounts without sharing are handled separately.
 * Returns 0 on success, -1 on error.
 */
static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave)
{
	pr_debug("%d:%s private %d shared %d slave %d\n", mi->mnt_id, service_mountpoint(mi), private, shared, slave);

	if (mi->flags & MS_UNBINDABLE) {
		if (shared || slave) {
			pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", service_mountpoint(mi));
		} else {
			if (!mnt_is_overmounted(mi)) {
				/* Someone may still want to bind from us, let them do it.
*/
				pr_debug("Temporary leave unbindable mount %s as private\n", service_mountpoint(mi));
				if (mount(NULL, service_mountpoint(mi), NULL, MS_PRIVATE, NULL)) {
					pr_perror("Unable to make %d private", mi->mnt_id);
					return -1;
				}
				/* Queued on delayed_unbindable for later processing */
				list_add(&mi->mnt_unbindable, &delayed_unbindable);
				return 0;
			}
			if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) {
				pr_perror("Unable to make %d unbindable", mi->mnt_id);
				return -1;
			}
			return 0;
		}
	}

	if (private && mount(NULL, service_mountpoint(mi), NULL, MS_PRIVATE, NULL)) {
		pr_perror("Unable to make %d private", mi->mnt_id);
		return -1;
	}
	if (slave && mount(NULL, service_mountpoint(mi), NULL, MS_SLAVE, NULL)) {
		pr_perror("Unable to make %d slave", mi->mnt_id);
		return -1;
	}
	if (shared && mount(NULL, service_mountpoint(mi), NULL, MS_SHARED, NULL)) {
		pr_perror("Unable to make %d shared", mi->mnt_id);
		return -1;
	}

	return 0;
}

/*
 * Umount points, which are propagated in slave parents, because
 * we can't be sure, that they were inherited in a real life.
 *
 * Returns 0 on success, -1 if an umount() fails.
 */
static int umount_from_slaves(struct mount_info *mi)
{
	struct mount_info *t;
	char *mpath, buf[PATH_MAX];

	BUG_ON(mi->parent == root_yard_mp);

	list_for_each_entry(t, &mi->parent->mnt_slave_list, mnt_slave) {
		if (!t->mounted)
			continue;

		/* Slaves for which no sibling path can be built are skipped */
		mpath = mnt_get_sibling_path(mi, t, buf, sizeof(buf));
		if (mpath == NULL)
			continue;

		pr_debug("\t\tUmount slave %s\n", mpath);
		if (umount(mpath) == -1) {
			pr_perror("Can't umount slave %s", mpath);
			return -1;
		}
	}

	return 0;
}

/*
 * If something is mounted in one shared point, it will be spread in
 * all other points from this shared group.
*
 * Look at Documentation/filesystems/sharedsubtree.txt for more details
 *
 * Nothing is mounted here: not yet mounted peers, slaves and external
 * slaves of @mi only get ->bind pointed at @mi and inherit its s_dev_rt.
 * Always returns 0.
 */
static int propagate_siblings(struct mount_info *mi)
{
	struct mount_info *t;

	/*
	 * Find all mounts, which must be bind-mounted from this one
	 * to inherit shared group or master id
	 */
	list_for_each_entry(t, &mi->mnt_share, mnt_share) {
		if (t->mounted)
			continue;
		/* Already bound to a mount from the same shared group */
		if (t->bind && t->bind->shared_id == t->shared_id)
			continue;
		pr_debug("\t\tBind share %s(%d)\n", t->ns_mountpoint, t->mnt_id);
		t->bind = mi;
		t->s_dev_rt = mi->s_dev_rt;
	}

	list_for_each_entry(t, &mi->mnt_slave_list, mnt_slave) {
		if (t->mounted || t->bind)
			continue;
		pr_debug("\t\tBind slave %s(%d)\n", t->ns_mountpoint, t->mnt_id);
		t->bind = mi;
		t->s_dev_rt = mi->s_dev_rt;
	}

	list_for_each_entry(t, &mi->mnt_ext_slave, mnt_ext_slave) {
		if (t->mounted || t->bind)
			continue;
		pr_debug("\t\tBind ext-slave %s(%d)\n", t->ns_mountpoint, t->mnt_id);
		t->bind = mi;
		t->s_dev_rt = mi->s_dev_rt;
	}

	return 0;
}

/*
 * Bookkeeping after @mi has been mounted: choose bind sources for related
 * mounts and mark the members of its propagation group as mounted.
 * Always returns 0.
 */
static int propagate_mount(struct mount_info *mi)
{
	struct mount_info *p;

	propagate_siblings(mi);

	if (!mi->parent || mi->parent == root_yard_mp)
		goto skip_parent;

	/* NOTE(review): the return value is not checked here - confirm this is deliberate */
	umount_from_slaves(mi);

	/* Mark mounts in propagation group mounted */
	list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) {
		/* Should not propagate the same mount twice */
		BUG_ON(p->mounted);
		pr_debug("\t\tPropagate %s(%d)\n", p->ns_mountpoint, p->mnt_id);

		/*
		 * When a mount is propagated, the result mount
		 * is always shared. If we want to get a private
		 * mount, we need to convert it.
*/
		restore_shared_options(p, !p->shared_id, 0, 0);
		p->mounted = true;
		propagate_siblings(p);
		umount_from_slaves(p);
	}

skip_parent:
	/*
	 * FIXME Currently non-root mounts can be restored
	 * only if a proper root mount exists
	 */
	if (fsroot_mounted(mi) || mi->parent == root_yard_mp || mi->external) {
		struct mount_info *t;

		list_for_each_entry(t, &mi->mnt_bind, mnt_bind) {
			if (t->mounted)
				continue;
			if (t->bind)
				continue;
			if (t->master_id)
				continue;
			/* t's root must lie inside what mi exposes */
			if (!issubpath(t->root, mi->root))
				continue;
			pr_debug("\t\tBind private %s(%d)\n", t->ns_mountpoint, t->mnt_id);
			t->bind = mi;
			t->s_dev_rt = mi->s_dev_rt;
		}
	}

	return 0;
}

/* Store the device number of @where into m->s_dev_rt; returns 0, or -1 if stat() fails */
int fetch_rt_stat(struct mount_info *m, const char *where)
{
	struct stat st;

	if (stat(where, &st)) {
		pr_perror("Can't stat on %s", where);
		return -1;
	}

	m->s_dev_rt = MKKDEV(major(st.st_dev), minor(st.st_dev));
	return 0;
}

/* Plain mount(2) of @src onto @mi's service mountpoint using mi->options; returns mount()'s result */
int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags)
{
	int ret = mount(src, service_mountpoint(mi), fstype, mountflags, mi->options);
	if (ret)
		pr_perror("Unable to mount %s %s (id=%d)", src, service_mountpoint(mi), mi->mnt_id);
	return ret;
}

/* Filesystem name for mount(2): the dumped fsname for FSTYPE__AUTO, otherwise the fstype's own name */
char *mnt_fsname(struct mount_info *mi)
{
	if (mi->fstype->code == FSTYPE__AUTO)
		return mi->fsname;
	return mi->fstype->name;
}

/*
 * mount(2) @src onto the object behind @fd (addressed as
 * /proc/self/fd/<fd>) with the flags pointed to by @args. If @pid is not
 * the calling process, the mount namespace of @pid is entered for the call
 * and left afterwards.
 * Returns mount()'s result, or -1 if switching namespaces fails.
 */
static int userns_mount(char *src, void *args, int fd, pid_t pid)
{
	unsigned long flags = *(unsigned long *)args;
	int rst = -1, err = -1;
	char target[PSFDS];

	snprintf(target, sizeof(target), "/proc/self/fd/%d", fd);

	if (pid != getpid() && switch_ns(pid, &mnt_ns_desc, &rst))
		return -1;

	err = mount(src, target, NULL, flags, NULL);
	if (err)
		pr_perror("Unable to mount %s", target);

	/* rst stays -1 when no namespace switch was done */
	if (rst >= 0 && restore_ns(rst, &mnt_ns_desc))
		return -1;

	return err;
}

/* userns_call() handler: apply the flags in @args to the mount behind @fd (no source) */
int apply_sb_flags(void *args, int fd, pid_t pid)
{
	return userns_mount(NULL, args, fd, pid);
}

/* userns_call() handler: mount opts.root onto @fd with the flags in @args */
int mount_root(void *args, int fd, pid_t pid)
{
	return userns_mount(opts.root, args, fd, pid);
}

/*
 * Mount a new filesystem instance for @mi (as opposed to a bind mount).
 * Returns 0 on success, -1 on error.
 */
static int do_new_mount(struct mount_info *mi)
{
	unsigned long sflags = mi->sb_flags;
	unsigned
long mflags = mi->flags & (~MS_PROPAGATE);
	char *src;
	struct fstype *tp = mi->fstype;
	/* A read-only fs with a restore hook is mounted rw first and remounted ro afterwards */
	bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY);
	mount_fn_t do_mount = (tp->mount) ? tp->mount : do_simple_mount;

	src = resolve_source(mi);
	if (!src)
		return -1;

	/* Merge superblock and mount flags if it's possible */
	if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) {
		sflags |= mflags;
		mflags = 0;
	}

	if (remount_ro)
		sflags &= ~MS_RDONLY;

	if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) {
		pr_perror("Can't mount at %s", service_mountpoint(mi));
		return -1;
	}

	if (tp->restore && tp->restore(mi))
		return -1;

	if (remount_ro) {
		int fd;

		fd = open(service_mountpoint(mi), O_PATH);
		if (fd < 0) {
			pr_perror("Unable to open %s", service_mountpoint(mi));
			return -1;
		}
		sflags |= MS_RDONLY | MS_REMOUNT;
		if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) {
			pr_err("Unable to apply mount flags %d for %s\n", mi->sb_flags, service_mountpoint(mi));
			close(fd);
			return -1;
		}
		close(fd);
	}

	/* Per-mount flags that could not be merged above are applied with a bind remount */
	if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) {
		pr_perror("Unable to apply bind-mount options");
		return -1;
	}

	/*
	 * A slave should be mounted from do_bind_mount().
	 * Look at can_mount_now() for details.
	 */
	BUG_ON(mi->master_id);
	if (restore_shared_options(mi, !mi->shared_id, mi->shared_id, 0))
		return -1;

	mi->mounted = true;

	return 0;
}

/* Restore @mi via the RESTORE_EXT_MOUNT plugin hook; returns the hook's result (0 on success) */
int restore_ext_mount(struct mount_info *mi)
{
	int ret;

	pr_debug("Restoring external bind mount %s\n", service_mountpoint(mi));
	ret = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, service_mountpoint(mi), "/", NULL);
	if (ret)
		pr_err("Can't restore ext mount (%d)\n", ret);
	return ret;
}

/* mkdtemp() template; holds the actual directory name once mount_clean_path() has run */
static char mnt_clean_path[] = "/tmp/cr-tmpfs.XXXXXX";

/* Create the private intermediate mount at mnt_clean_path; returns 0 on success, -1 on error */
static int mount_clean_path(void)
{
	/*
	 * To make a bind mount, we need to have access to a source directory,
	 * which can be over-mounted. The idea is to mount a source mount in
	 * an intermediate place without MS_REC and then create a target mounts.
* This intermediate place should be a private mount to not affect
	 * properties of the source mount.
	 */
	if (mkdtemp(mnt_clean_path) == NULL) {
		pr_perror("Unable to create a temporary directory");
		return -1;
	}

	/* Bind the directory onto itself so that it becomes a mount point */
	if (mount(mnt_clean_path, mnt_clean_path, NULL, MS_BIND, NULL)) {
		pr_perror("Unable to mount tmpfs into %s", mnt_clean_path);
		return -1;
	}

	if (mount(NULL, mnt_clean_path, NULL, MS_PRIVATE, NULL)) {
		pr_perror("Unable to mark %s as private", mnt_clean_path);
		return -1;
	}

	return 0;
}

/*
 * Detach the mount at mnt_clean_path and remove its directory.
 * Returns -1 only if the umount fails; a failed rmdir() is just logged.
 */
static int umount_clean_path(void)
{
	if (umount2(mnt_clean_path, MNT_DETACH)) {
		pr_perror("Unable to umount %s", mnt_clean_path);
		return -1;
	}

	if (rmdir(mnt_clean_path)) {
		pr_perror("Unable to remove %s", mnt_clean_path);
	}

	return 0;
}

/*
 * Restore @mi as a bind mount: from mi->bind, from an external mapping, or
 * through a plugin. Returns 0 on success, -1 on error.
 */
static int do_bind_mount(struct mount_info *mi)
{
	char mnt_fd_path[PSFDS];
	char *root, *cut_root, rpath[PATH_MAX];
	unsigned long mflags;
	int exit_code = -1, mp_len;
	bool shared = false;
	bool master = false;
	bool priv = false;
	char *mnt_path = NULL;
	struct stat st;
	bool umount_mnt_path = false;
	struct mount_info *c;

	if (mi->need_plugin) {
		if (restore_ext_mount(mi))
			return -1;
		goto out;
	}

	if (mnt_is_nodev_external(mi)) {
		/*
		 * We have / pointing to criu's ns root still,
		 * so just use the mapping's path. The mountpoint
		 * is tuned in collect_mnt_from_image to refer
		 * to proper location in the namespace we restore.
		 */
		root = mi->external;
		priv = !mi->master_id && (mi->internal_sharing || !mi->shared_id);
		goto do_bind;
	}

	shared = mi->shared_id && mi->shared_id == mi->bind->shared_id;
	master = mi->master_id && mi->master_id == mi->bind->master_id;
	priv = !mi->master_id && !shared;
	cut_root = cut_root_for_bind(mi->root, mi->bind->root);

	/* Mount private can be initialized on mount() callback, which is
	 * called only once.
	 * It have to be copied to all it's sibling structures to provide users
	 * of it with actual data.
*/
	mi->private = mi->bind->private;

	mnt_path = service_mountpoint(mi->bind);

	/* Access a mount by fd if service_mountpoint(mi->bind) is overmounted */
	if (mi->bind->fd >= 0) {
		snprintf(mnt_fd_path, sizeof(mnt_fd_path), "/proc/self/fd/%d", mi->bind->fd);
		mnt_path = mnt_fd_path;
	}

	if (cut_root[0] == 0) /* This case is handled by mi->bind->fd */
		goto skip_overmount_check;

	/*
	 * The target path may be over-mounted by one of child mounts
	 * and we need to create a new bind-mount to get access to the path.
	 */
	mp_len = strlen(service_mountpoint(mi->bind));
	if (mp_len > 1) /* skip a joining / if service_mountpoint(mi->bind) isn't "/" */
		mp_len++;

	list_for_each_entry(c, &mi->bind->children, siblings) {
		if (!c->mounted)
			continue;
		if (issubpath(cut_root, service_mountpoint(c) + mp_len))
			break; /* a source path is overmounted */
	}

	/* The loop above ended early, i.e. a covering child was found */
	if (&c->siblings != &mi->bind->children) {
		/* Get a copy of mi->bind without child mounts */
		if (mount(mnt_path, mnt_clean_path, NULL, MS_BIND, NULL)) {
			pr_perror("Unable to bind-mount %s to %s", mnt_path, mnt_clean_path);
			return -1;
		}
		mnt_path = mnt_clean_path;
		umount_mnt_path = true;
	}

	if (mnt_path == NULL)
		return -1;

skip_overmount_check:
	snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root);
	root = rpath;
do_bind:
	pr_info("\tBind %s to %s\n", root, service_mountpoint(mi));

	/*
	 * The bind source was deleted at dump time: re-create it temporarily
	 * with the type and mode of the mountpoint.
	 */
	if (unlikely(mi->deleted)) {
		if (stat(service_mountpoint(mi), &st)) {
			pr_perror("Can't fetch stat on %s", service_mountpoint(mi));
			goto err;
		}

		if (S_ISDIR(st.st_mode)) {
			if (mkdir(root, (st.st_mode & ~S_IFMT))) {
				pr_perror("Can't re-create deleted directory %s", root);
				goto err;
			}
		} else if (S_ISREG(st.st_mode)) {
			int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, st.st_mode & ~S_IFMT);
			if (fd < 0) {
				pr_perror("Can't re-create deleted file %s", root);
				goto err;
			}
			close(fd);
		} else {
			pr_err("Unsupported st_mode 0%o deleted root %s\n", (int)st.st_mode, root);
			goto err;
		}
	}

	if (mount(root, service_mountpoint(mi), NULL, MS_BIND | (mi->flags & MS_REC), NULL) < 0) {
pr_perror("Can't bind-mount at %s", service_mountpoint(mi));
		goto err;
	}

	/* Remount only when the per-mount flags differ from those of the bind source */
	mflags = mi->flags & (~MS_PROPAGATE);
	if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE)))
		if (mount(NULL, service_mountpoint(mi), NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) {
			pr_perror("Can't re-mount at %s", service_mountpoint(mi));
			goto err;
		}

	/* Remove the source again if it was re-created for a deleted mount */
	if (unlikely(mi->deleted)) {
		if (S_ISDIR(st.st_mode)) {
			if (rmdir(root)) {
				pr_perror("Can't remove deleted directory %s", root);
				goto err;
			}
		} else if (S_ISREG(st.st_mode)) {
			if (unlink(root)) {
				pr_perror("Can't unlink deleted file %s", root);
				goto err;
			}
		}
	}
out:
	/*
	 * shared - the mount is in the same shared group with mi->bind
	 * mi->shared_id && !shared - create a new shared group
	 */
	if (restore_shared_options(mi, priv, mi->shared_id && !shared, mi->master_id && !master))
		goto err;

	mi->mounted = true;
	exit_code = 0;
err:
	/* Reached on success as well: drop the intermediate clean mount if one was used */
	if (umount_mnt_path) {
		/*
		 * If mnt_path was shared, a new mount may be propagated
		 * into it.
		 */
		if (mount(NULL, mnt_path, NULL, MS_PRIVATE, NULL)) {
			pr_perror("Unable to make %s private", mnt_path);
			return -1;
		}
		if (umount2(mnt_path, MNT_DETACH)) {
			pr_perror("Unable to umount %s", mnt_path);
			return -1;
		}
	}
	return exit_code;
}

/*
 * Tell whether @mi can be mounted right now or has to be postponed until
 * the mounts it depends on are mounted.
 */
static bool can_mount_now(struct mount_info *mi)
{
	struct mount_info *ext;

	if (rst_mnt_is_root(mi)) {
		pr_debug("%s: true as %d is mntns root\n", __func__, mi->mnt_id);
		return true;
	}

	/* Parent should be mounted already, that's how mnt_tree_for_each works */
	BUG_ON(mi->parent && !mi->parent->mounted);

	if (mnt_is_nodev_external(mi))
		goto shared;

	if (!mi->bind && !mi->external && (ext = mnt_get_external_bind(mi)) && !has_mounted_external_bind(mi)) {
		pr_debug("%s: false as %d's external %d is not mounted\n", __func__, mi->mnt_id, ext->mnt_id);
		return false;
	}

	/*
	 * We're the slave peer:
	 *   - Make sure the master peer is already mounted
	 *   - Make sure all children of master's share are
	 *   mounted as well to eliminate mounts duplications
	 */
	if (mi->mnt_master) {
		struct mount_info *c, *s;

		if (mi->bind == NULL)
{
			pr_debug("%s: false as %d is slave with unmounted master %d\n", __func__, mi->mnt_id,
				 mi->mnt_master->mnt_id);
			return false;
		}

		/* Every child of the master itself ... */
		list_for_each_entry(c, &mi->mnt_master->children, siblings) {
			if (!c->mounted) {
				pr_debug("%s: false as %d is slave with unmounted master's children %d\n", __func__,
					 mi->mnt_id, c->mnt_id);
				return false;
			}
		}

		/* ... and of every mount sharing with the master must be mounted */
		list_for_each_entry(s, &mi->mnt_master->mnt_share, mnt_share) {
			list_for_each_entry(c, &s->children, siblings) {
				if (!c->mounted) {
					pr_debug("%s: false as %d is slave with unmounted children of master's share\n",
						 __func__, mi->mnt_id);
					return false;
				}
			}
		}
	}

	if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin)) {
		pr_debug("%s: false as %d is non-root without bind or plugin\n", __func__, mi->mnt_id);
		return false;
	}

shared:
	/* Mount only after all parents of our propagation group mounted */
	if (!list_empty(&mi->mnt_propagate)) {
		struct mount_info *p;

		list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) {
			BUG_ON(!p->parent);
			if (!p->parent->mounted) {
				pr_debug("%s: false as %d has unmounted parent %d of its propagation group\n", __func__,
					 mi->mnt_id, p->parent->mnt_id);
				return false;
			}
		}
	}

	/*
	 * Mount only after all children of share, which shouldn't
	 * (but can if wrong order) propagate to us, are mounted
	 */
	if (mi->shared_id) {
		struct mount_info *s, *c, *p, *t;
		/* Scratch list of candidate mounts, linked through ->mnt_notprop */
		LIST_HEAD(mi_notprop);
		bool can = true;

		/* Add all children of the shared group */
		list_for_each_entry(s, &mi->mnt_share, mnt_share) {
			list_for_each_entry(c, &s->children, siblings) {
				char root_path[PATH_MAX];
				int ret;

				ret = root_path_from_parent(c, root_path, PATH_MAX);
				BUG_ON(ret);

				/* Mount is out of our root */
				if (!issubpath(root_path, mi->root))
					continue;

				list_add(&c->mnt_notprop, &mi_notprop);
			}
		}

		/* Delete all members of our children's propagation groups */
		list_for_each_entry(c, &mi->children, siblings) {
			list_for_each_entry(p, &c->mnt_propagate, mnt_propagate) {
				list_del_init(&p->mnt_notprop);
			}
		}

		/* Delete all members of our propagation group */
list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { list_del_init(&p->mnt_notprop); } /* Delete self */ list_del_init(&mi->mnt_notprop); /* Check not propagated mounts mounted and cleanup list */ list_for_each_entry_safe(p, t, &mi_notprop, mnt_notprop) { if (!p->mounted) { pr_debug("%s: false as %d has unmounted 'anti'-propagation mount %d\n", __func__, mi->mnt_id, p->mnt_id); can = false; } list_del_init(&p->mnt_notprop); } if (!can) return false; } return true; } static int do_mount_root(struct mount_info *mi) { if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; return fetch_rt_stat(mi, service_mountpoint(mi)); } static int do_close_one(struct mount_info *mi) { close_safe(&mi->fd); return 0; } static int set_unbindable(struct mount_info *mi) { if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { pr_perror("Failed setting unbindable flag on %d", mi->mnt_id); return -1; } return 0; } static int do_mount_one(struct mount_info *mi) { int ret; if (mi->mounted) return 0; if (!can_mount_now(mi)) { pr_debug("Postpone mount %s(%d)\n", mi->ns_mountpoint, mi->mnt_id); return 1; } if ((mi->parent && mi->parent != root_yard_mp) && !strcmp(mi->parent->ns_mountpoint, mi->ns_mountpoint)) { mi->parent->fd = open(service_mountpoint(mi->parent), O_PATH); if (mi->parent->fd < 0) { pr_perror("Unable to open %s", service_mountpoint(mi)); return -1; } } pr_debug("\tMounting %s %d@%s (%d)\n", mi->fstype->name, mi->mnt_id, service_mountpoint(mi), mi->need_plugin); if (rst_mnt_is_root(mi)) { int fd; unsigned long flags = MS_BIND | MS_REC; if (opts.root == NULL) { pr_err("The --root option is required to restore a mount namespace\n"); return -1; } /* do_mount_root() is called from populate_mnt_ns() */ if (root_ns_mask & CLONE_NEWUSER) { fd = open(service_mountpoint(mi), O_PATH); if (fd < 0) { pr_perror("Unable to open %s", service_mountpoint(mi)); return -1; } if (userns_call(mount_root, 0, &flags, 
sizeof(flags), fd)) {
				pr_err("Unable to mount %s\n", service_mountpoint(mi));
				close(fd);
				return -1;
			}
			close(fd);
		} else {
			if (mount(opts.root, service_mountpoint(mi), NULL, flags, NULL)) {
				pr_perror("Unable to mount %s %s (id=%d)", opts.root, service_mountpoint(mi),
					  mi->mnt_id);
				return -1;
			}
		}

		if (do_mount_root(mi))
			return -1;
		mi->mounted = true;
		ret = 0;
	} else if (!mi->bind && !mi->need_plugin && !mnt_is_nodev_external(mi)) {
		ret = do_new_mount(mi);
	} else {
		ret = do_bind_mount(mi);
	}

	if (ret == 0 && fetch_rt_stat(mi, service_mountpoint(mi)))
		return -1;

	if (ret == 0 && propagate_mount(mi))
		return -1;

	/* A mount dumped as "unsupported" is re-typed to btrfs if statfs() reports that magic */
	if (mi->fstype->code == FSTYPE__UNSUPPORTED) {
		struct statfs st;

		if (statfs(service_mountpoint(mi), &st)) {
			pr_perror("Unable to statfs %s", service_mountpoint(mi));
			return -1;
		}
		if (st.f_type == BTRFS_SUPER_MAGIC)
			mi->fstype = find_fstype_by_name("btrfs");
	}

	return ret;
}

/*
 * Tree-walk callback: unmount @mi after making its parent recursively
 * private. Mounts without a parent are left alone.
 * Returns 0 on success, -1 on error.
 */
static int do_umount_one(struct mount_info *mi)
{
	if (!mi->parent)
		return 0;

	if (mount("none", service_mountpoint(mi->parent), "none", MS_REC | MS_PRIVATE, NULL)) {
		pr_perror("Can't mark %s as private", service_mountpoint(mi->parent));
		return -1;
	}

	if (umount(service_mountpoint(mi))) {
		pr_perror("Can't umount at %s", service_mountpoint(mi));
		return -1;
	}

	pr_info("Umounted at %s\n", service_mountpoint(mi));
	return 0;
}

/*
 * If a mount overmounts other mounts, it is restored separately in the roots
 * yard and then moved to the right place.
 *
 * mnt_remap_entry is created for each such mount and it's added into
 * mnt_remap_list. The origin mount point is replaced on a new one in
 * roots_yard where it will be restored. The remapped mount will be
 * moved to the right places after restoring all mounts.
*/ static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { struct mount_info *mi; /* child is remaped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; static int do_remap_mount(struct mount_info *m) { int len; /* A path in root_yard has a fixed size, so it can be replaced. */ len = print_ns_root(m->nsid, remap_id, m->mountpoint, PATH_MAX); m->mountpoint[len] = '/'; return 0; } static int try_remap_mount(struct mount_info *m) { struct mnt_remap_entry *r; if (!mnt_needs_remap(m)) return 0; BUG_ON(!m->parent); r = xmalloc(sizeof(struct mnt_remap_entry)); if (!r) return -1; r->mi = m; list_add_tail(&r->node, &mnt_remap_list); return 0; } static int find_remap_mounts(struct mount_info *root) { struct mnt_remap_entry *r; struct mount_info *m; /* * It's impossible to change a tree without interrupting * enumeration, so on the first step mounts are added * into mnt_remap_list and then they are connected to root_yard_mp. 
/*
 * Make @root (or the current directory when @root is NULL) the root of the
 * current mount namespace with pivot_root() and detach the old root.
 *
 * The old root is parked on "tmp" inside the new root, or on a temporary
 * directory created with mkdtemp() when "tmp" is missing or is not a
 * directory; the temporary directory is removed afterwards.
 *
 * Returns 0 on success and -1 on error.
 */
int cr_pivot_root(char *root)
{
	char tmp_dir_tmpl[] = "crtools-put-root.XXXXXX";
	bool tmp_dir = false;
	char *put_root = "tmp";
	int exit_code = -1;
	struct stat st;

	pr_info("Move the root to %s\n", root ?: ".");

	if (root) {
		if (chdir(root)) {
			pr_perror("chdir(%s) failed", root);
			return -1;
		}
	}

	if (stat(put_root, &st) || !S_ISDIR(st.st_mode)) {
		put_root = mkdtemp(tmp_dir_tmpl);
		if (put_root == NULL) {
			pr_perror("Can't create a temporary directory");
			return -1;
		}
		tmp_dir = true;
	}

	/* Bind put_root onto itself so that it is a mount point of its own */
	if (mount(put_root, put_root, NULL, MS_BIND, NULL)) {
		pr_perror("Unable to bind-mount %s onto itself", put_root);
		goto err_root;
	}

	if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) {
		pr_perror("Can't remount %s with MS_PRIVATE", put_root);
		goto err_tmpfs;
	}

	if (pivot_root(".", put_root)) {
		pr_perror("pivot_root(., %s) failed", put_root);
		goto err_tmpfs;
	}

	/* The old root now lives under put_root: mark the whole subtree slave */
	if (mount("none", put_root, "none", MS_REC | MS_SLAVE, NULL)) {
		pr_perror("Can't remount %s with MS_SLAVE", put_root);
		return -1;
	}

	exit_code = 0;

	/* First detach the old root ... */
	if (umount2(put_root, MNT_DETACH)) {
		pr_perror("Can't umount %s", put_root);
		return -1;
	}

err_tmpfs:
	/* ... then the bind mount of put_root created above */
	if (umount2(put_root, MNT_DETACH)) {
		pr_perror("Can't umount %s", put_root);
		return -1;
	}

err_root:
	if (tmp_dir && rmdir(put_root)) {
		pr_perror("Can't remove the directory %s", put_root);
		return -1;
	}

	return exit_code;
}
/*
 * Create the temporary directory whose path is kept in mnt_roots.
 *
 * Does nothing when mnt_roots is already set. Returns 0 on success and -1
 * on error; on error mnt_roots is left NULL.
 */
static int create_mnt_roots(void)
{
	int exit_code = -1;

	if (mnt_roots)
		return 0;

	mnt_roots = xstrdup("/tmp/.criu.mntns.XXXXXX");
	if (mnt_roots == NULL)
		goto out;

	if (mkdtemp(mnt_roots) == NULL) {
		pr_perror("Unable to create a temporary directory");
		/* Don't leak the template when the directory was not created */
		xfree(mnt_roots);
		mnt_roots = NULL;
		goto out;
	}

	/* Best effort: a chmod() failure is deliberately not treated as fatal */
	chmod(mnt_roots, 0777);

	exit_code = 0;
out:
	return exit_code;
}
" (E)" : ""); return 0; } static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len) { int len; len = strlen(mountpoint) + root_len + 1; mi->mountpoint = xmalloc(len); if (!mi->mountpoint) return -1; /* * For bind-mounts we would also fix the root here * too, but bind-mounts restore merges mountpoint * and root paths together, so there's no need in * that. */ strcpy(mi->mountpoint, root); strcpy(mi->mountpoint + root_len, mountpoint); mi->ns_mountpoint = mi->mountpoint + root_len; mi->plain_mountpoint = get_plain_mountpoint(mi->mnt_id, NULL); if (!mi->plain_mountpoint) return -1; pr_debug("\t\tWill mount %d @ %s %s\n", mi->mnt_id, service_mountpoint(mi), mi->ns_mountpoint); return 0; } static char *mount_update_lsm_context(char *mount_opts) { cleanup_free char *before_context = NULL; char *other_options; char *context_start; char *context_end; char *old_context; char *new_options; int ret; old_context = strstr(mount_opts, CONTEXT_OPT); if (!old_context || !opts.lsm_mount_context) return xstrdup(mount_opts); /* * If the user specified a different mount_context we need * to replace the existing mount context in the mount * options with the one specified by the user. * * The original mount options will be something like: * * context="system_u:object_r:container_file_t:s0:c82,c137",inode64 * * and it needs to be replaced with opts.lsm_mount_context. * * The content between 'context=' and ',inode64' will be replaced * with opts.lsm_mount_context in quotes. */ /* Skip 'context=' */ context_start = old_context + strlen(CONTEXT_OPT); if (context_start[0] == '"' && context_start + 1 < mount_opts + strlen(mount_opts)) { /* Skip quotes */ context_end = strchr(context_start + 1, '"'); if (!context_end) { pr_err("Failed parsing mount option 'context'\n"); return NULL; } } else { context_end = context_start; } /* Find next after optionally skipping quotes. 
*/ other_options = strchr(context_end, ','); before_context = xstrdup(mount_opts); if (unlikely(!before_context)) return NULL; before_context[context_start - mount_opts] = 0; ret = asprintf(&new_options, "%s\"%s\"%s", before_context, opts.lsm_mount_context, other_options ? other_options : ""); if (unlikely(ret < 0)) return NULL; pr_debug("\t\tChanged mount 'context=' to %s\n", new_options); return new_options; } static int collect_mnt_from_image(struct mount_info **head, struct mount_info **tail, struct ns_id *nsid) { MntEntry *me = NULL; int ret, root_len = 1; struct cr_img *img; char root[PATH_MAX] = "."; img = open_image(CR_FD_MNTS, O_RSTR, nsid->id); if (!img) return -1; root_len = print_ns_root(nsid, 0, root, sizeof(root)); pr_debug("Reading mountpoint images (id %d pid %d)\n", nsid->id, (int)nsid->ns_pid); while (1) { struct mount_info *pm; ret = pb_read_one_eof(img, &me, PB_MNT); if (ret <= 0) break; pm = mnt_entry_alloc(true); if (!pm) goto err; pm->nsid = nsid; mntinfo_add_list_before(head, pm); if (!*tail) *tail = pm; pm->mnt_id = me->mnt_id; pm->parent_mnt_id = me->parent_mnt_id; pm->s_dev = me->root_dev; pm->flags = me->flags; pm->sb_flags = me->sb_flags; if (!me->has_sb_flags) { const unsigned int mflags = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE | MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_NOATIME | MS_NODIRATIME | MS_RELATIME; /* * In old images mnt and sb flags are saved together. * Here we separate them and save the old logic about MS_RDONLY. 
*/ pm->sb_flags = pm->flags & ~mflags; pm->flags = pm->flags & mflags; } pm->shared_id = me->shared_id; pm->master_id = me->master_id; pm->need_plugin = me->with_plugin; pm->deleted = me->deleted; pm->is_ns_root = is_root(me->mountpoint); if (me->has_internal_sharing) pm->internal_sharing = me->internal_sharing; pm->source = xstrdup(me->source); if (!pm->source) goto err; pm->options = mount_update_lsm_context(me->options); if (unlikely(!pm->options)) goto err; if (me->fstype != FSTYPE__AUTO && me->fsname) { pr_err("fsname can be set only for FSTYPE__AUTO mounts\n"); goto err; } /* FIXME: abort unsupported early */ pm->fstype = decode_fstype(me->fstype); if (pm->fstype->collect && (pm->fstype->collect(pm) < 0)) goto err; if (me->fsname) { pm->fsname = xstrdup(me->fsname); if (!pm->fsname) goto err; } if (get_mp_root(me, pm)) goto err; if (get_mp_mountpoint(me->mountpoint, pm, root, root_len)) goto err; pr_debug("\t" "Read %d mp @ %s\n", pm->mnt_id, pm->ns_mountpoint); } if (me) mnt_entry__free_unpacked(me, NULL); close_image(img); return 0; err: close_image(img); return -1; } static int merge_mount_trees(void) { struct ns_id *nsid; root_yard_mp = mnt_entry_alloc(true); if (!root_yard_mp) return -1; root_yard_mp->mountpoint = mnt_roots; root_yard_mp->plain_mountpoint = xstrdup(mnt_roots); if (!root_yard_mp->plain_mountpoint) return -1; root_yard_mp->is_dir = true; root_yard_mp->mounted = true; root_yard_mp->mnt_bind_is_populated = true; root_yard_mp->is_overmounted = false; root_yard_mp->mnt_id = HELPER_MNT_ID; /* Merge mount trees together under root_yard_mp */ for (nsid = ns_ids; nsid; nsid = nsid->next) { struct mount_info *root; if (nsid->nd != &mnt_ns_desc) continue; root = nsid->mnt.mntinfo_tree; pr_debug("Mountpoint %d (@%s) moved to the root yard\n", root->mnt_id, root->ns_mountpoint); root->parent = root_yard_mp; list_add(&root->siblings, &root_yard_mp->children); } return 0; } int read_mnt_ns_img(void) { struct mount_info *pms = NULL; struct ns_id *nsid; 
if (!(root_ns_mask & CLONE_NEWNS)) { mntinfo = NULL; return 0; } for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { struct mount_info *head = NULL, *tail = NULL; if (nsid->nd != &mnt_ns_desc) continue; if (collect_mnt_from_image(&head, &tail, nsid)) return -1; nsid->mnt.mntinfo_tree = mnt_build_tree(head); if (!nsid->mnt.mntinfo_tree) return -1; /* mntns root mounts are always directories */ nsid->mnt.mntinfo_tree->is_dir = true; tail->next = pms; pms = head; } mntinfo = pms; search_bindmounts(); prepare_is_overmounted(); if (!opts.mntns_compat_mode && resolve_shared_mounts_v2()) return -1; if (merge_mount_trees()) return -1; return 0; } int rst_get_mnt_root(int mnt_id, char *path, int plen) { struct mount_info *m; if (!(root_ns_mask & CLONE_NEWNS) || mnt_id == -1) goto rroot; m = lookup_mnt_id(mnt_id); if (m == NULL) return -1; return print_ns_root(m->nsid, 0, path, plen); rroot: path[0] = '/'; path[1] = '\0'; return 1; } int mntns_maybe_create_roots(void) { if (!(root_ns_mask & CLONE_NEWNS)) return 0; return create_mnt_roots(); } static int do_restore_task_mnt_ns(struct ns_id *nsid) { int fd; fd = fdstore_get(nsid->mnt.nsfd_id); if (fd < 0) return -1; if (setns(fd, CLONE_NEWNS)) { pr_perror("Can't restore mntns"); close(fd); return -1; } close(fd); return 0; } int restore_task_mnt_ns(struct pstree_item *current) { if ((root_ns_mask & CLONE_NEWNS) == 0) return 0; if (current->ids && current->ids->has_mnt_ns_id) { struct pstree_item *parent = current->parent; unsigned int id = current->ids->mnt_ns_id; struct ns_id *nsid; /* Zombies and helpers can have ids == 0 so we skip them */ while (parent && !parent->ids) parent = parent->parent; /** * Our parent had restored the mount namespace before forking * us and if we have the same mntns we just stay there. 
*/ if (parent && id == parent->ids->mnt_ns_id) return 0; nsid = lookup_ns_by_id(id, &mnt_ns_desc); if (nsid == NULL) { pr_err("Can't find mount namespace %d\n", id); return -1; } BUG_ON(nsid->type == NS_CRIU); if (do_restore_task_mnt_ns(nsid)) return -1; } return 0; } void fini_restore_mntns(void) { struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; nsid->ns_populated = true; } } /* * All nested mount namespaces are restore as sub-trees of the root namespace. */ static int populate_roots_yard(struct mount_info *cr_time) { struct mnt_remap_entry *r; char path[PATH_MAX]; struct ns_id *nsid; if (make_yard(mnt_roots)) return -1; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; print_ns_root(nsid, 0, path, sizeof(path)); if (mkdir(path, 0600)) { pr_perror("Unable to create %s", path); return -1; } } /* * mnt_remap_list is filled in find_remap_mounts() and * contains mounts which has to be restored separately */ list_for_each_entry(r, &mnt_remap_list, node) { if (mkdirpat(AT_FDCWD, service_mountpoint(r->mi), 0755)) { pr_perror("Unable to create %s", service_mountpoint(r->mi)); return -1; } } if (cr_time && mkdirpat(AT_FDCWD, service_mountpoint(cr_time), 0755)) { pr_perror("Unable to create %s", service_mountpoint(cr_time)); return -1; } return 0; } static int populate_mnt_ns(void) { struct mount_info *cr_time = NULL; int ret; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { /* Add to mount tree. 
Generic code will mount it later */ cr_time = add_cr_time_mount(root_yard_mp, "binfmt_misc", "binfmt_misc", 0, true); if (!cr_time) return -1; } #endif if (resolve_shared_mounts(mntinfo)) return -1; if (validate_mounts(mntinfo, false)) return -1; if (find_remap_mounts(root_yard_mp)) return -1; if (populate_roots_yard(cr_time)) return -1; if (mount_clean_path()) return -1; ret = mnt_tree_for_each(root_yard_mp, do_mount_one); mnt_tree_for_each(root_yard_mp, do_close_one); if (ret == 0) { struct mount_info *mi; /* * Mounts in delayed_unbindable list were temporary mounted as * private instead of unbindable so that do_mount_one can bind * from them, now we are ready to fix it. */ list_for_each_entry(mi, &delayed_unbindable, mnt_unbindable) if (set_unbindable(mi)) return -1; } if (ret == 0 && fixup_remap_mounts()) return -1; if (umount_clean_path()) return -1; return ret; } static int __depopulate_roots_yard(void) { int ret = 0; if (mnt_roots == NULL) return 0; if (mount("none", mnt_roots, "none", MS_REC | MS_PRIVATE, NULL)) { pr_perror("Can't remount root with MS_PRIVATE"); ret = 1; } /* * Don't exit after a first error, because this function * can be used to rollback in a error case. * Don't worry about MNT_DETACH, because files are restored after this * and nobody will not be restored from a wrong mount namespace. 
*/ if (umount2(mnt_roots, MNT_DETACH)) { pr_perror("Can't unmount %s", mnt_roots); ret = -1; } if (rmdir(mnt_roots)) { pr_perror("Can't remove the directory %s", mnt_roots); ret = -1; } return ret; } int depopulate_roots_yard(int mntns_fd, bool only_ghosts) { int ret = 0, old_cwd = -1, old_ns = -1; if (mntns_fd < 0) { ret |= try_clean_remaps(only_ghosts); cleanup_mnt_ns(); return ret; } pr_info("Switching to new ns to clean ghosts\n"); old_cwd = open(".", O_PATH); if (old_cwd < 0) { pr_perror("Unable to open cwd"); return -1; } old_ns = open_proc(PROC_SELF, "ns/mnt"); if (old_ns < 0) { pr_perror("`- Can't keep old ns"); close(old_cwd); return -1; } if (setns(mntns_fd, CLONE_NEWNS) < 0) { pr_perror("`- Can't switch"); close(old_ns); close(old_cwd); return -1; } if (try_clean_remaps(only_ghosts)) ret = -1; if (__depopulate_roots_yard()) ret = -1; if (setns(old_ns, CLONE_NEWNS) < 0) { pr_perror("Fail to switch back!"); ret = -1; } close(old_ns); if (fchdir(old_cwd)) { pr_perror("Unable to restore cwd"); ret = -1; } close(old_cwd); return ret; } void cleanup_mnt_ns(void) { if (mnt_roots == NULL) return; if (rmdir(mnt_roots)) pr_perror("Can't remove the directory %s", mnt_roots); } int prepare_mnt_ns(void) { int ret = -1, rst = -1, fd; struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc }; struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return 0; pr_info("Restoring mount namespace\n"); if (!opts.root) { struct mount_info *old; if (chdir("/")) { pr_perror("chdir(\"/\") failed"); return -1; } old = collect_mntinfo(&ns, false); if (old == NULL) return -1; /* * The new mount namespace is filled with the mountpoint * clones from the original one. We have to umount them * prior to recreating new ones. 
*/ pr_info("Cleaning mount namespace\n"); if (mnt_tree_for_each_reverse(ns.mnt.mntinfo_tree, do_umount_one)) { free_mntinfo(old); return -1; } free_mntinfo(old); } if (!opts.mntns_compat_mode) return prepare_mnt_ns_v2(); ret = populate_mnt_ns(); if (ret) return -1; rst = open_proc(PROC_SELF, "ns/mnt"); if (rst < 0) return -1; /* restore non-root namespaces */ for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { char path[PATH_MAX]; if (nsid->nd != &mnt_ns_desc) continue; /* Create the new mount namespace */ if (unshare(CLONE_NEWNS)) { pr_perror("Unable to create a new mntns"); goto err; } fd = open_proc(PROC_SELF, "ns/mnt"); if (fd < 0) goto err; if (nsid->type == NS_ROOT) { /* * We need to create a mount namespace which will be * used to clean up remap files * (depopulate_roots_yard). The namespace where mounts * was restored has to be restored as a root mount * namespace, because there are file descriptors * linked with it (e.g. to bind-mount slave pty-s). */ if (setns(rst, CLONE_NEWNS)) { pr_perror("Can't restore mntns back"); goto err; } SWAP(rst, fd); } /* Pin one with a file descriptor */ nsid->mnt.nsfd_id = fdstore_add(fd); close(fd); if (nsid->mnt.nsfd_id < 0) { pr_err("Can't add ns fd\n"); goto err; } /* Set its root */ print_ns_root(nsid, 0, path, sizeof(path) - 1); if (cr_pivot_root(path)) goto err; /* root fd is used to restore file mappings */ fd = open_proc(PROC_SELF, "root"); if (fd < 0) goto err; nsid->mnt.root_fd_id = fdstore_add(fd); if (nsid->mnt.root_fd_id < 0) { pr_err("Can't add root fd\n"); close(fd); goto err; } close(fd); /* And return back to regain the access to the roots yard */ if (setns(rst, CLONE_NEWNS)) { pr_perror("Can't restore mntns back"); goto err; } } close(rst); return ret; err: if (rst >= 0) /* coverity[check_return] */ restore_ns(rst, &mnt_ns_desc); return -1; } static int mntns_root_pid = -1; static int mntns_set_root_fd(pid_t pid, int fd) { int ret; ret = install_service_fd(ROOT_FD_OFF, fd); if (ret >= 0) mntns_root_pid 
= pid; return ret; } int __mntns_get_root_fd(pid_t pid) { int fd, pfd; int ret; char path[PATH_MAX + 1]; if (mntns_root_pid == pid) /* The required root is already opened */ return get_service_fd(ROOT_FD_OFF); if (!(root_ns_mask & CLONE_NEWNS)) { /* * If criu and tasks we dump live in the same mount * namespace, we can just open the root directory. * All paths resolution would occur relative to criu's * root. Even if it is not namespace's root, provided * file paths are resolved, we'd get consistent dump. */ fd = open("/", O_RDONLY | O_DIRECTORY); if (fd < 0) { pr_perror("Can't open root"); return -1; } goto set_root; } /* * If /proc/pid/root links on '/', it signs that a root of the task * and a root of mntns is the same. */ pfd = open_pid_proc(pid); ret = readlinkat(pfd, "root", path, sizeof(path) - 1); if (ret < 0) { close_pid_proc(); return ret; } path[ret] = '\0'; if (ret != 1 || path[0] != '/') { pr_err("The root task has another root than mntns: %s\n", path); close_pid_proc(); return -1; } fd = openat(pfd, "root", O_RDONLY | O_DIRECTORY, 0); if (fd < 0) { pr_perror("Can't open the task root"); return -1; } set_root: return mntns_set_root_fd(pid, fd); } int mntns_get_root_fd(struct ns_id *mntns) { if (!(root_ns_mask & CLONE_NEWNS)) return __mntns_get_root_fd(0); if (!mntns) return -1; /* * All namespaces are restored from the root task and during the * CR_STATE_FORKING stage the root task has two file descriptors for * each mntns. One is associated with a namespace and another one is a * root of this mntns. * * When a non-root task is forked, it enters into a proper mount * namespace, restores private mappings and forks children. Some of * these mappings can be associated with files from other namespaces. * * After the CR_STATE_FORKING stage the root task has to close all * mntns file descriptors to restore its descriptors and at this moment * we know that all tasks live in their mount namespaces. 
* * If we find that a mount namespace isn't populated, we can get its * root from the root task. */ if (!mntns->ns_populated) { int fd; fd = fdstore_get(mntns->mnt.root_fd_id); if (fd < 0) return -1; return mntns_set_root_fd(mntns->ns_pid, fd); } return __mntns_get_root_fd(mntns->ns_pid); } struct ns_id *lookup_nsid_by_mnt_id(int mnt_id) { struct mount_info *mi; /* * Kernel before 3.15 doesn't show mnt_id for file descriptors. * mnt_id isn't saved for files, if mntns isn't dumped. * In both these cases we have only one root, so here * is not matter which mount will be restored. */ if (mnt_id == -1) mi = mntinfo; else mi = lookup_mnt_id(mnt_id); return mi ? mi->nsid : NULL; } int mntns_get_root_by_mnt_id(int mnt_id) { struct ns_id *mntns = NULL; if (root_ns_mask & CLONE_NEWNS) { mntns = lookup_nsid_by_mnt_id(mnt_id); BUG_ON(mntns == NULL); } return mntns_get_root_fd(mntns); } struct collect_mntns_arg { bool need_to_validate; bool for_dump; }; static int collect_mntns(struct ns_id *ns, void *__arg) { struct collect_mntns_arg *arg = __arg; struct mount_info *pms; pms = collect_mntinfo(ns, arg->for_dump); if (!pms) return -1; if (arg->for_dump && ns->type != NS_CRIU) arg->need_to_validate = true; mntinfo_add_list(pms); return 0; } int collect_mnt_namespaces(bool for_dump) { struct collect_mntns_arg arg; int ret; arg.for_dump = for_dump; arg.need_to_validate = false; ret = walk_namespaces(&mnt_ns_desc, collect_mntns, &arg); if (ret) goto err; search_bindmounts(); #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (for_dump && !opts.has_binfmt_misc) { unsigned int s_dev = 0; struct ns_id *ns; for (ns = ns_ids; ns != NULL; ns = ns->next) { if (ns->type == NS_ROOT && ns->nd == &mnt_ns_desc) break; } if (ns) { ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, "binfmt_misc"); if (ret == -EPERM) pr_info("Can't mount binfmt_misc: EPERM. 
Running in user_ns?\n"); else if (ret < 0 && ret != -EBUSY && ret != -ENODEV && ret != -ENOENT) { pr_err("Can't mount binfmt_misc: %d %s\n", ret, strerror(-ret)); goto err; } else if (ret == 0) { ret = -1; goto err; } else if (ret > 0 && !add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, s_dev, false)) { ret = -1; goto err; } } } #endif ret = resolve_external_mounts(mntinfo); if (ret) goto err; if (arg.need_to_validate) { ret = -1; if (resolve_shared_mounts(mntinfo)) goto err; if (validate_mounts(mntinfo, true)) goto err; } ret = 0; err: return ret; } int dump_mnt_namespaces(void) { struct ns_id *nsid; if (!(root_ns_mask & CLONE_NEWNS)) return 0; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc || nsid->type == NS_CRIU) continue; if ((nsid->type == NS_OTHER) && check_mnt_id()) { pr_err("Nested mount namespaces are not supported " "without mnt_id in fdinfo\n"); return -1; } if (dump_mnt_ns(nsid, nsid->mnt.mntinfo_list)) return -1; } return 0; } void clean_cr_time_mounts(void) { struct mount_info *mi; int ns_old, ret; for (mi = mntinfo; mi; mi = mi->next) { int cwd_fd; if (mi->mnt_id != HELPER_MNT_ID) continue; ret = switch_mnt_ns(mi->nsid->ns_pid, &ns_old, &cwd_fd); if (ret) { pr_err("Can't switch to pid's %u mnt_ns\n", mi->nsid->ns_pid); continue; } if (umount(mi->ns_mountpoint) < 0) pr_perror("Can't umount forced mount %s", mi->ns_mountpoint); if (restore_mnt_ns(ns_old, &cwd_fd)) { pr_err("cleanup_forced_mounts exiting with wrong mnt_ns\n"); return; } } } struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt"); static int call_helper_process(int (*call)(void *), void *arg) { int pid, status, exit_code = -1; /* * Running new helper process on the restore must be * done under last_pid mutex: other tasks may be restoring * threads and the PID we need there might be occupied by * this clone() call. 
*/ lock_last_pid(); pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES | CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg); if (pid == -1) { pr_perror("Can't clone helper process"); goto out; } errno = 0; if (waitpid(pid, &status, __WALL) != pid) { pr_perror("Unable to wait %d", pid); goto out; } if (status) { pr_err("Bad child exit status: %d\n", status); goto out; } exit_code = 0; out: unlock_last_pid(); return exit_code; } static int ns_remount_writable(void *arg) { struct mount_info *mi = (struct mount_info *)arg; struct ns_id *ns = mi->nsid; if (do_restore_task_mnt_ns(ns)) return 1; pr_debug("Switched to mntns %u:%u\n", ns->id, ns->kid); if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->ns_mountpoint); return 1; } return 0; } int try_remount_writable(struct mount_info *mi, bool ns) { int remounted = REMOUNTED_RW; /* Don't remount if we are in host mntns to be on the safe side */ if (!(root_ns_mask & CLONE_NEWNS)) return 0; if (!ns) remounted = REMOUNTED_RW_SERVICE; /* All mounts in mntinfo list should have it on restore */ BUG_ON(mi->rmi == NULL); if (mi->flags & MS_RDONLY && !(mi->rmi->remounted_rw & remounted)) { if (mnt_is_overmounted(mi)) { pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id); return -1; } /* There should be no ghost files on mounts with ro sb */ if (mi->sb_flags & MS_RDONLY) { pr_err("The mount %d has readonly sb\n", mi->mnt_id); return -1; } pr_info("Remount %d:%s writable\n", mi->mnt_id, service_mountpoint(mi)); if (!ns) { if (mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { pr_perror("Failed to remount %d:%s writable", mi->mnt_id, service_mountpoint(mi)); return -1; } } else { if (call_helper_process(ns_remount_writable, mi)) return -1; } mi->rmi->remounted_rw |= remounted; } return 0; } static int 
__remount_readonly_mounts(struct ns_id *ns) { struct mount_info *mi; bool mntns_set = false; for (mi = mntinfo; mi; mi = mi->next) { if (ns && mi->nsid != ns) continue; if (!(mi->rmi->remounted_rw & REMOUNTED_RW)) continue; /* * Lets enter the mount namespace lazily, only if we've found the * mount which should be remounted readonly. These saves us * from entering mntns if we have no mounts to remount in it. */ if (ns && !mntns_set) { if (do_restore_task_mnt_ns(ns)) return -1; mntns_set = true; pr_debug("Switched to mntns %u:%u\n", ns->id, ns->kid); } pr_info("Remount %d:%s back to readonly\n", mi->mnt_id, mi->ns_mountpoint); if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~MS_PROPAGATE), NULL)) { pr_perror("Failed to restore %d:%s mount flags %x", mi->mnt_id, mi->ns_mountpoint, mi->flags); return -1; } } return 0; } static int ns_remount_readonly_mounts(void *arg) { struct ns_id *nsid; for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { if (nsid->nd != &mnt_ns_desc) continue; if (__remount_readonly_mounts(nsid)) return 1; } return 0; } int remount_readonly_mounts(void) { /* * Need a helper process because the root task can share fs via * CLONE_FS and we would not be able to enter mount namespaces */ return call_helper_process(ns_remount_readonly_mounts, NULL); } static struct mount_info *mnt_subtree_next(struct mount_info *mi, struct mount_info *root) { if (!list_empty(&mi->children)) return list_entry(mi->children.next, struct mount_info, siblings); while (mi->parent && mi != root) { if (mi->siblings.next == &mi->parent->children) mi = mi->parent; else return list_entry(mi->siblings.next, struct mount_info, siblings); } return NULL; }