1 files changed, 829 insertions, 0 deletions
diff --git a/nix/libutil/spawn.cc b/nix/libutil/spawn.cc
new file mode 100644
index 0000000000..93bab9f59e
--- /dev/null
+++ b/nix/libutil/spawn.cc
@@ -0,0 +1,829 @@
+/* GNU Guix --- Functional package management for GNU
+   Copyright (C) 2025 Caleb Ristvedt <reepca@russelstein.xyz>
+
+   This file is part of GNU Guix.
+
+   GNU Guix is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or (at
+   your option) any later version.
+
+   GNU Guix is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Process spawning and setup code.  */
+
+#include <spawn.hh>
+#include <util.hh>
+#include <affinity.hh>
+#include <stddef.h>
+#include <unistd.h>
+#include <grp.h>
+#include <limits.h>
+#include <sys/wait.h>
+#include <cstring>
+#include <cstdlib>
+
+#if HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#if HAVE_SCHED_H
+#include <sched.h>
+#endif
+
+#if HAVE_STATVFS
+#include <sys/statvfs.h>
+#endif
+
+#if HAVE_SYS_SYSCALL_H
+#include <sys/syscall.h>
+#endif
+
+#if HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#ifdef __linux__
+#include <sys/personality.h>
+#endif
+
+#if defined(SYS_pivot_root)
+#define pivot_root(new_root, put_old) (syscall(SYS_pivot_root, new_root,put_old))
+#endif
+
+
+#define CLONE_ENABLED defined(CLONE_NEWNS)
+
+#if CLONE_ENABLED
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#endif
+
+namespace nix {
+
+
+void addPhaseAfter(Phases & phases, string afterLabel, string addLabel, Action addAction)
+{
+    for(auto i = phases.begin(); i != phases.end(); i++)
+        if((*i).label == afterLabel) {
+            i++; /* std::vector::insert inserts before, not after */
+            Phase p;
+            p.label = addLabel;
+            p.action = addAction;
+            phases.insert(i, p);
+            return;
+        }
+    throw Error(format("label `%1%' not found in phases") % afterLabel);
+}
+
+
+void addPhaseBefore(Phases & phases, string beforeLabel, string addLabel, Action addAction)
+{
+    for(auto i = phases.begin(); i != phases.end(); i++)
+        if((*i).label == beforeLabel) {
+            Phase p;
+            p.label = addLabel;
+            p.action = addAction;
+            phases.insert(i, p);
+            return;
+        }
+    throw Error(format("label `%1%' not found in phases") % beforeLabel);
+}
+
+
+void prependPhase(Phases & phases, string addLabel, Action addAction)
+{
+    Phase p;
+    p.label = addLabel;
+    p.action = addAction;
+    phases.insert(phases.begin(), p);
+}
+
+
+void appendPhase(Phases & phases, string addLabel, Action addAction)
+{
+    Phase p;
+    p.label = addLabel;
+    p.action = addAction;
+    phases.push_back(p);
+}
+
+
+void deletePhase(Phases & phases, string delLabel)
+{
+    for(auto i = phases.begin(); i != phases.end(); i++)
+        if((*i).label == delLabel) {
+            phases.erase(i);
+            return;
+        }
+    throw Error(format("label `%1%' not found in phases") % delLabel);
+}
+
+
+void replacePhase(Phases & phases, string replaceLabel, Action newAction)
+{
+    for(auto i = phases.begin(); i != phases.end(); i++)
+        if((*i).label == replaceLabel) {
+            (*i).action = newAction;
+            return;
+        }
+    throw Error(format("label `%1' not found in phases") % replaceLabel);
+}
+
+
+/* A curated selection of predefined actions */
+
+void reset_writeToStderrAction(SpawnContext & ctx)
+{
+    _writeToStderr = 0;
+}
+
+
+void restoreAffinityAction(SpawnContext & ctx)
+{
+    restoreAffinity();
+}
+
+
+void setsidAction(SpawnContext & ctx)
+{
+    /* Puts the current process in a separate session, which implies a
+       separate process group, so it doesn't receive group-directed signals
+       sent at the parent.  The new session initially has no controlling
+       terminal, so it also doesn't receive terminal signals and can't open
+       /dev/tty. */
+    if(ctx.setsid && setsid() == (pid_t)-1)
+        throw SysError("creating a new session");
+}
+
+
+void earlyIOSetupAction(SpawnContext & ctx)
+{
+    for(auto i = ctx.earlyCloseFDs.begin(); i != ctx.earlyCloseFDs.end(); i++)
+        if(close(*i) == -1)
+            throw SysError("closing fd");
+
+    if(ctx.logFD != -1) {
+        if(dup2(ctx.logFD, STDOUT_FILENO) == -1)
+            throw SysError("cannot dup2 log fd into stdout fd");
+        if(dup2(ctx.logFD, STDERR_FILENO) == -1)
+            throw SysError("cannot dup2 log fd into stderr fd");
+    }
+
+    if(ctx.setStdin) {
+        if(ctx.stdinFD != -1) {
+            if(dup2(ctx.stdinFD, STDIN_FILENO) == -1)
+                throw SysError("cannot dup2 fd into stdin fd");
+        }
+        else {
+            /* Doesn't make sense for it to be writable, but compatibility... */
+            AutoCloseFD fd = open(ctx.stdinFile.c_str(), O_RDWR);
+            if(fd == -1)
+                throw SysError(format("cannot open `%1%'") % ctx.stdinFile);
+            if(dup2(fd, STDIN_FILENO) == -1)
+                throw SysError("cannot dup2 fd into stdin fd");
+        }
+    }
+}
+
+
+void dropAmbientCapabilitiesAction(SpawnContext & ctx)
+{
+  /* Drop ambient capabilities such as CAP_CHOWN that might have been granted
+     when starting guix-daemon.  */
+    if(ctx.dropAmbientCapabilities)
+#if HAVE_SYS_PRCTL_H
+        prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0);
+#else
+        throw Error("dropping ambient capabilities is not supported on this system");
+#endif
+}
+
+
+void chrootAction(SpawnContext & ctx)
+{
+    if(ctx.doChroot)
+#if HAVE_CHROOT
+        if(chroot(ctx.chrootRootDir.c_str()) == -1)
+            throw SysError(format("cannot change root directory to '%1%'") % ctx.chrootRootDir);
+#else
+    throw Error("chroot is not supported on this system");
+#endif
+}
+
+
+void chdirAction(SpawnContext & ctx)
+{
+    if(ctx.setcwd)
+        if(chdir(ctx.cwd.c_str()) == -1)
+            throw SysError(format("changing into `%1%'") % ctx.cwd);
+}
+
+
+void closeMostFDsAction(SpawnContext & ctx)
+{
+    if(ctx.closeMostFDs) closeMostFDs(ctx.preserveFDs);
+    for(auto i = ctx.preserveFDs.begin(); i != ctx.preserveFDs.end(); i++)
+        keepOnExec(*i);
+}
+
+
+void setPersonalityAction(SpawnContext & ctx)
+{
+  if(ctx.setPersona)
+#ifdef __linux__
+    if(personality(ctx.persona) == -1)
+      throw SysError("cannot set personality");
+#else
+    throw Error("setting the personality is not supported on this system");
+#endif
+}
+
+
+void oomSacrificeAction(SpawnContext & ctx)
+{
+#ifdef __linux__
+    if(ctx.oomSacrifice)
+        /* Ask the kernel to eagerly kill us & our children if it runs out of
+           memory, regardless of blame, to preserve ‘real’ user data &
+           state. */
+        try {
+            writeFile("/proc/self/oom_score_adj", "1000"); // 100%
+        } catch(...) { ignoreException(); }
+#endif
+}
+
+
+void setIDsAction(SpawnContext & ctx)
+{
+    if(ctx.setSupplementaryGroups)
+        if(setgroups(ctx.supplementaryGroups.size(),
+                     ctx.supplementaryGroups.data()) == -1)
+            throw SysError("cannot set supplementary groups");
+
+    if(ctx.setgid)
+        if(setgid(ctx.group) == -1 ||
+           getgid() != ctx.group ||
+           getegid() != ctx.group)
+            throw SysError("setgid failed");
+
+    if(ctx.setuid)
+        if(setuid(ctx.user) == -1 ||
+           getuid() != ctx.user ||
+           geteuid() != ctx.user)
+            throw SysError("setuid failed");
+}
+
+
+void restoreSIGPIPEAction(SpawnContext & ctx)
+{
+    /* Restore default handling of SIGPIPE, otherwise some programs will
+       randomly say "Broken pipe". */
+    struct sigaction act, oact;
+    act.sa_handler = SIG_DFL;
+    act.sa_flags = 0;
+    sigemptyset(&act.sa_mask);
+    if (sigaction(SIGPIPE, &act, &oact)) throw SysError("resetting SIGPIPE");
+}
+
+
+void setupSuccessAction(SpawnContext & ctx)
+{
+    if(ctx.signalSetupSuccess)
+        writeFull(STDERR_FILENO, "\n");
+}
+
+
+void execAction(SpawnContext & ctx)
+{
+    Strings envStrs;
+    std::vector<char *> envPtrs;
+    char **env;
+    if(ctx.inheritEnv) {
+        for(auto i = ctx.env.begin(); i != ctx.env.end(); i++)
+            if(setenv(i->first.c_str(), i->second.c_str(), 1) == -1)
+                throw SysError("setenv");
+        env = environ;
+    } else {
+        for(auto i = ctx.env.begin(); i != ctx.env.end(); i++)
+            envStrs.push_back(i->first + "=" + i->second);
+        /* Need to keep the envPtrs vector alive as long as its .data()! */
+        envPtrs = stringsToCharPtrs(envStrs);
+        env = envPtrs.data();
+    }
+    if(execvpe(ctx.program.c_str(), stringsToCharPtrs(ctx.args).data(), env) == -1)
+        throw SysError(format("executing `%1%'") % ctx.program);
+}
+
+
+Phases getBasicSpawnPhases()
+{
+    return { { "reset_writeToStderr",     reset_writeToStderrAction },
+             { "restoreAffinity",         restoreAffinityAction },
+             { "setsid",                  setsidAction },
+             { "earlyIOSetup",            earlyIOSetupAction },
+             { "dropAmbientCapabilities", dropAmbientCapabilitiesAction },
+             { "chroot",                  chrootAction },
+             { "chdir",                   chdirAction },
+             { "closeMostFDs",            closeMostFDsAction },
+             { "setPersonality",          setPersonalityAction },
+             { "oomSacrifice",            oomSacrificeAction },
+             { "setIDs",                  setIDsAction },
+             { "restoreSIGPIPE",          restoreSIGPIPEAction },
+             { "setupSuccess",            setupSuccessAction },
+             { "exec",                    execAction } };
+}
+
+
+void usernsInitSyncAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWUSER) != 0) {
+        /* Close the earlyCloseFDs before we try reading anything */
+        for(auto i = ctx.earlyCloseFDs.begin(); i != ctx.earlyCloseFDs.end(); i++)
+            if(close(*i) == -1)
+                throw SysError("closing fd");
+        /* Don't try closing them again later */
+        ctx.earlyCloseFDs.clear();
+        /* Wait for the parent process to initialize the UID/GID mapping of
+           our user namespace.  */
+        waitForMessage(ctx.setupFD, "go\n");
+    }
+#endif
+}
+
+
+void usernsSetIDsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWUSER) != 0) {
+        /* Note: 'man capabilities' says that a transition from zero to
+           nonzero uids causes capabilities to be lost, but doesn't say what
+           happens when a transition from an unmapped (possibly zero) uid to a
+           nonzero uid happens. */
+        if(ctx.usernsSetuid)
+            /* Since we presumably have CAP_SETUID, this sets the real,
+               effective, saved, and filesystem uids */
+            if(setuid(ctx.usernsUser) != 0)
+                throw SysError("setuid");
+        if(ctx.usernsSetgid)
+            /* Ditto but with gids */
+            if(setgid(ctx.usernsGroup) != 0)
+                throw SysError("setgid");
+    }
+#endif
+}
+
+
+void initLoopbackAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if(((ctx.cloneFlags & CLONE_NEWNET) != 0) && ctx.initLoopback) {
+        AutoCloseFD fd(socket(PF_INET, SOCK_DGRAM, IPPROTO_IP));
+        if (fd == -1) throw SysError("cannot open IP socket");
+
+        struct ifreq ifr;
+        strcpy(ifr.ifr_name, "lo");
+        ifr.ifr_flags = IFF_UP | IFF_LOOPBACK | IFF_RUNNING;
+        if (ioctl(fd, SIOCSIFFLAGS, &ifr) == -1)
+            throw SysError("cannot set loopback interface flags");
+
+        fd.close();
+    }
+#endif
+}
+
+
+void setHostAndDomainAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWUTS) != 0) {
+        if (sethostname(ctx.hostname.c_str(),
+                        strlen(ctx.hostname.c_str())) == -1)
+            throw SysError("cannot set host name");
+        if (setdomainname(ctx.domainname.c_str(),
+                          strlen(ctx.domainname.c_str())) == -1)
+            throw SysError("cannot set domain name");
+    }
+#endif
+}
+
+
+void makeFilesystemsPrivateAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_REC) && defined(MS_PRIVATE)
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) != 0) {
+        if(mount(0, "/", 0, MS_REC|MS_PRIVATE, 0) == -1)
+            throw SysError("unable to make `/' private mount");
+    }
+#endif
+}
+
+
+void makeChrootSeparateFilesystemAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if(((ctx.cloneFlags & CLONE_NEWNS) != 0) && ctx.doChroot) {
+        /* Bind-mount chroot directory to itself, to treat it as a different
+           filesystem from /, as needed for pivot_root.  Alternatively, mount
+           a tmpfs on it. */
+        if(ctx.mountTmpfsOnChroot) {
+            if(mount("none", ctx.chrootRootDir.c_str(), "tmpfs", 0, 0) == -1)
+                throw SysError(format("unable to mount tmpfs on `%1%'") % ctx.chrootRootDir);
+        }
+        else {
+            if(mount(ctx.chrootRootDir.c_str(), ctx.chrootRootDir.c_str(), 0, MS_BIND, 0) == -1)
+                throw SysError(format("unable to bind mount ‘%1%’") % ctx.chrootRootDir);
+        }
+    }
+#endif
+}
+
+
+static int statfsToMountFlags(int f_flags)
+{
+#if HAVE_SYS_MOUNT_H && HAVE_STATVFS
+    int ret = 0;
+#if defined(ST_RDONLY) && defined(MS_RDONLY)
+    if((f_flags & ST_RDONLY) != 0)     ret |= MS_RDONLY;
+#endif
+#if defined(ST_NOSUID) && defined(MS_NOSUID)
+    if((f_flags & ST_NOSUID) != 0)     ret |= MS_NOSUID;
+#endif
+#if defined(ST_NODEV) && defined(MS_NODEV)
+    if((f_flags & ST_NODEV) != 0)      ret |= MS_NODEV;
+#endif
+#if defined(ST_NOEXEC) && defined(MS_NOEXEC)
+    if((f_flags & ST_NOEXEC) != 0)     ret |= MS_NOEXEC;
+#endif
+#if defined(ST_NOATIME) && defined(MS_NOATIME)
+    if((f_flags & ST_NOATIME) != 0)    ret |= MS_NOATIME;
+#endif
+#if defined(ST_NODIRATIME) && defined(MS_NODIRATIME)
+    if((f_flags & ST_NODIRATIME) != 0) ret |= MS_NODIRATIME;
+#endif
+#if defined(ST_RELATIME) && defined(MS_RELATIME)
+    if((f_flags & ST_RELATIME) != 0)   ret |= MS_RELATIME;
+#endif
+    return ret;
+#else
+    throw Error("statfsToMountFlags not supported on this platform");
+#endif
+}
+
+
+void bindMount(Path source, Path target, bool readOnly)
+{
+#if HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    struct stat st;
+    if (lstat(source.c_str(), &st) == -1)
+        throw SysError(format("getting attributes of path `%1%'") % source);
+
+    if(S_ISDIR(st.st_mode))
+        createDirs(target);
+    else if(S_ISLNK(st.st_mode)) {
+        /* bind-mounts follow symlinks, thus representing their target and not
+           the symlink itself.  Create a copy of the symlink instead.*/
+        createDirs(dirOf(target));
+        createSymlink(readLink(source), target);
+        return;
+    }
+    else {
+        createDirs(dirOf(target));
+        writeFile(target, "");
+    }
+
+    /* This may fail with EINVAL unless we specify MS_REC, specifically if we
+       are in an unprivileged mount namespace and not specifying MS_REC would
+       reveal subtrees that had been covered up. */
+    if (mount(source.c_str(), target.c_str(), 0, MS_BIND|MS_REC, 0) == -1)
+        throw SysError(format("bind mount from `%1%' to `%2%' failed") % source % target);
+    if(readOnly) {
+#if defined(MS_REMOUNT) && defined(MS_RDONLY)
+        /* Extra flags passed with MS_BIND are ignored, hence the extra
+           MS_REMOUNT.  */
+        unsigned long mount_flags = MS_BIND | MS_REMOUNT | MS_RDONLY;
+        /* MS_BIND | MS_REMOUNT sets all mountpoint flags, so we may get EPERM
+           unless we preserve the other flags (for example because it would
+           result in trying to clear the nosuid flag). */
+#if HAVE_STATVFS
+        struct statvfs stvfs;
+        if(statvfs(target.c_str(), &stvfs) == -1)
+            throw SysError(format("statvfs of `%1%'") % target);
+        mount_flags |= statfsToMountFlags(stvfs.f_flag);
+#endif
+
+        if (mount(source.c_str(), target.c_str(), 0, mount_flags, 0) == -1)
+            throw SysError(format("read-only remount of `%1%' failed") % target);
+#else
+        throw Error("remounting read-only is not supported on this platform");
+#endif
+    }
+#endif
+}
+
+
+void mountIntoChroot(std::map<Path, Path> filesInChroot,
+                     set<Path> readOnlyFiles,
+                     Path chrootRootDir)
+{
+#if HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    for(auto i = filesInChroot.begin(); i != filesInChroot.end(); i++) {
+        Path source = i->second;
+        Path target = chrootRootDir + i->first;
+        bool readOnly = readOnlyFiles.find(i->first) != readOnlyFiles.end();
+        bindMount(source, target, readOnly);
+    }
+#else
+    throw Error("bind mounting not supported on this platform");
+#endif
+}
+
+
+void mountIntoChrootAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) != 0 && ctx.doChroot) {
+        mountIntoChroot(ctx.filesInChroot, ctx.readOnlyFilesInChroot, ctx.chrootRootDir);
+    }
+#endif
+}
+
+
+void mountProcAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) != 0 && ctx.mountProc) {
+        Path target = (ctx.doChroot ? ctx.chrootRootDir : "") + "/proc";
+        createDirs(target);
+        if(mount("none", target.c_str(), "proc", 0, 0) == -1)
+            throw SysError(format("mounting `%1%'") % target);
+    }
+#endif
+}
+
+
+void mountDevshmAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) != 0 && ctx.mountDevshm) {
+        Path target = (ctx.doChroot ? ctx.chrootRootDir : "") + "/dev/shm";
+        createDirs(target);
+        if(mount("none", target.c_str(), "tmpfs", 0, 0) == -1)
+            throw SysError(format("mounting `%1%'") % target);
+    }
+#endif
+}
+
+
+void mountDevptsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) != 0 && ctx.maybeMountDevpts) {
+        Path chroot = (ctx.doChroot ? ctx.chrootRootDir : "");
+        Path target = chroot + "/dev/pts";
+        if(pathExists(chroot + "/dev/ptmx")) return;
+        createDirs(target);
+        if(mount("none", target.c_str(), "devpts", 0, "newinstance,mode=0620") == -1)
+            throw SysError(format("mounting `%1%'") % target);
+        createSymlink("/dev/pts/ptmx", chroot + "/dev/ptmx");
+        /* Make sure /dev/pts/ptmx is world-writable.  With some Linux
+           versions, it is created with permissions 0.  */
+        Path targetPtmx = chroot + "/dev/pts/ptmx";
+        if (chmod(targetPtmx.c_str(), 0666) == -1)
+            throw SysError(format("setting permissions on `%1%'") % targetPtmx);
+    }
+#endif
+}
+
+
+void pivotRootAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) != 0 && ctx.doChroot) {
+        if (chdir(ctx.chrootRootDir.c_str()) == -1)
+            throw SysError(format("cannot change directory to '%1%'") % ctx.chrootRootDir);
+
+        if (mkdir("real-root", 0) == -1)
+            throw SysError("cannot create real-root directory");
+
+        if (pivot_root(".", "real-root") == -1)
+            throw SysError(format("cannot pivot old root directory onto '%1%'") % (ctx.chrootRootDir + "/real-root"));
+
+        if (chroot(".") == -1)
+            throw SysError(format("cannot change root directory to '%1%'") % ctx.chrootRootDir);
+
+        if (umount2("real-root", MNT_DETACH) == -1)
+            throw SysError("cannot unmount real root filesystem");
+
+        if (rmdir("real-root") == -1)
+            throw SysError("cannot remove real-root directory");
+    }
+#endif
+}
+
+
+string idMapToIdentityMap(const string & map)
+{
+    std::vector<string> mapLines =
+        tokenizeString<std::vector<string> >(map, "\n");
+    string out;
+
+    for(auto & i : mapLines) {
+        std::vector<string> elements =
+            tokenizeString<std::vector<string> >(i, " ");
+        out.append(elements.at(0) + " " + elements.at(0) + " " + elements.at(2) + "\n");
+    }
+    return out;
+}
+
+
+/* Initializing a user namespace with more than one id mapped requires
+ * capabilities in the *parent* user namespace, which may not even have any
+ * processes in it after unshare is called.  So fork a child and have it do
+ * the initialization. */
+void unshareAndInitUserns(int flags, const string & uidMap,
+                          const string & gidMap, bool allowSetgroups)
+{
+#if CLONE_ENABLED
+    pid_t pid_ = getpid();
+    string pid = std::to_string(pid_);
+    Pipe toChild;
+    Pipe fromChild;
+    toChild.create();
+    fromChild.create();
+    pid_t child = fork();
+    if(child == -1)
+        throw SysError("creating child process");
+    if(child == 0) {
+        try {
+            toChild.writeSide.close();
+            fromChild.readSide.close();
+            waitForMessage(toChild.readSide, "ready\n");
+            writeFile("/proc/" + pid + "/uid_map", uidMap);
+            writeFile("/proc/" + pid + "/setgroups",
+                      allowSetgroups ? "allow" : "deny");
+            writeFile("/proc/" + pid + "/gid_map", gidMap);
+            writeFull(fromChild.writeSide, (unsigned char*)"go\n", 3);
+        } catch(...) {
+            /* Don't unwind the stack in case of exception, halt
+             * immediately. */
+            _exit(1);
+        }
+        _exit(EXIT_SUCCESS);
+    } else {
+        toChild.readSide.close();
+        fromChild.writeSide.close();
+        if(unshare(flags) == -1)
+            throw SysError("unshare");
+        writeFull(toChild.writeSide, (unsigned char*)"ready\n", 6);
+        waitForMessage(fromChild.readSide, "go\n");
+        int status;
+        while(waitpid(child, &status, 0) == -1) {
+            if(errno != EINTR)
+                throw SysError("reaping userns init process");
+        }
+        if(!(WIFEXITED(status) != 0 && WEXITSTATUS(status) == EXIT_SUCCESS))
+            throw Error(format("userns init child exited with status %1%") % WEXITSTATUS(status));
+    }
+#endif
+}
+
+
+void lockMountsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if(ctx.lockMounts) {
+        string uidMap;
+        string gidMap;
+        if(ctx.lockMountsMapAll) {
+            string oldUidMap = readFile("/proc/self/uid_map", true);
+            string oldGidMap = readFile("/proc/self/gid_map", true);
+            uidMap = idMapToIdentityMap(oldUidMap);
+            gidMap = idMapToIdentityMap(oldGidMap);
+        } else {
+            string uid = std::to_string(getuid());
+            string gid = std::to_string(getgid());
+            uidMap = uid + " " + uid + " 1";
+            gidMap = gid + " " + gid + " 1";
+        }
+        unshareAndInitUserns(CLONE_NEWNS | CLONE_NEWUSER,
+                             uidMap, gidMap, ctx.lockMountsAllowSetgroups);
+        /* Check that mounts inherited in our new mount namespace are "locked"
+           together and cannot be separated from within our mount namespace.
+           Since umount(2) is documented to fail with EINVAL when attempting
+           to unmount one of the mounts that are locked together, check that
+           this is what we get.  */
+        int ret = umount("/proc");
+        assert(ret == -1 && errno == EINVAL);
+    }
+#endif
+}
+
+
+Phases getCloneSpawnPhases()
+{
+#if CLONE_ENABLED
+    return { { "reset_writeToStderr",          reset_writeToStderrAction          },
+             { "usernsInitSync",               usernsInitSyncAction               },
+             { "usernsSetIDs",                 usernsSetIDsAction                 },
+             { "restoreAffinity",              restoreAffinityAction              },
+             { "setsid",                       setsidAction                       },
+             { "earlyIOSetup",                 earlyIOSetupAction                 },
+             { "dropAmbientCapabilities",      dropAmbientCapabilitiesAction      },
+             { "initLoopback",                 initLoopbackAction                 },
+             { "setHostAndDomain",             setHostAndDomainAction             },
+             { "makeFilesystemsPrivate",       makeFilesystemsPrivateAction       },
+             { "makeChrootSeparateFilesystem", makeChrootSeparateFilesystemAction },
+             { "mountIntoChroot",              mountIntoChrootAction              },
+             { "mountProc",                    mountProcAction                    },
+             { "mountDevshm",                  mountDevshmAction                  },
+             { "mountDevpts",                  mountDevptsAction                  },
+             { "chroot",                       pivotRootAction                    },
+             { "chdir",                        chdirAction                        },
+             { "closeMostFDs",                 closeMostFDsAction                 },
+             { "setPersonality",               setPersonalityAction               },
+             { "oomSacrifice",                 oomSacrificeAction                 },
+             /* Being put in a user namespace with only the current ids mapped
+                would tend to prevent switching to other ones, but if this
+                comes after setIDs then the per-process "dumpable" flag may be
+                reset, which will cause /proc/self to become root-owned,
+                making /proc/self/uid_map inaccessible.  If you need
+                lockMounts to preserve the id mappings, and you have the
+                necessary capabilities in the parent user namespace, set
+                CloneSpawnContext.lockMountsMapAll = true. */
+             { "lockMounts",                   lockMountsAction                   },
+             { "setIDs",                       setIDsAction                       },
+             { "restoreSIGPIPE",               restoreSIGPIPEAction               },
+             { "setupSuccess",                 setupSuccessAction                 },
+             { "exec",                         execAction                         }};
+#else
+    throw Error("clone not supported on this platform");
+#endif
+}
+
+
+void runChildSetup(SpawnContext & ctx)
+{
+    ctx.currentPhase = 0;
+    try {
+        /* Should not return regularly from this */
+        while(true) {
+            ctx.phases.at(ctx.currentPhase).action(ctx);
+            ctx.currentPhase++;
+        }
+    } catch (std::exception & e) {
+        try {
+            writeFull(STDERR_FILENO,
+                      "while setting up the child process: " +
+                      (ctx.currentPhase < (ssize_t)ctx.phases.size() ?
+                       "in phase " + ctx.phases[ctx.currentPhase].label + ": " : "") +
+                      string(e.what()) + "\n");
+        } catch (std::exception & e2) {
+            _exit(1);
+        }
+        _exit(1);
+    }
+    abort(); /* Should never be reached */
+}
+
+
+int runChildSetupEntry(void *data)
+{
+    runChildSetup(* (SpawnContext *)data);
+    return 1;
+}
+
+
+int cloneChild(CloneSpawnContext & ctx)
+{
+    char stack[32 * 1024];
+    /* Ensure proper alignment on the stack.  On aarch64, it has to be 16
+       bytes.  */
+    char *alignedStack = (char *)(((uintptr_t)stack + sizeof(stack) - 8) & ~(uintptr_t)0xf);
+    int ret = clone(runChildSetupEntry, alignedStack, ctx.cloneFlags, (void *) &ctx);
+    if(ret == -1)
+        throw SysError("clone");
+    return ret;
+}
+
+}