summaryrefslogtreecommitdiff
path: root/nix/libutil/spawn.cc
diff options
context:
space:
mode:
Diffstat (limited to 'nix/libutil/spawn.cc')
-rw-r--r--nix/libutil/spawn.cc829
1 files changed, 829 insertions, 0 deletions
diff --git a/nix/libutil/spawn.cc b/nix/libutil/spawn.cc
new file mode 100644
index 0000000000..93bab9f59e
--- /dev/null
+++ b/nix/libutil/spawn.cc
@@ -0,0 +1,829 @@
+/* GNU Guix --- Functional package management for GNU
+ Copyright (C) 2025 Caleb Ristvedt <reepca@russelstein.xyz>
+
+ This file is part of GNU Guix.
+
+ GNU Guix is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or (at
+ your option) any later version.
+
+ GNU Guix is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Process spawning and setup code. */
+
+#include <spawn.hh>
+#include <util.hh>
+#include <affinity.hh>
+#include <stddef.h>
+#include <unistd.h>
+#include <grp.h>
+#include <limits.h>
+#include <sys/wait.h>
+#include <cstring>
+#include <cstdlib>
+
+#if HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#if HAVE_SCHED_H
+#include <sched.h>
+#endif
+
+#if HAVE_STATVFS
+#include <sys/statvfs.h>
+#endif
+
+#if HAVE_SYS_SYSCALL_H
+#include <sys/syscall.h>
+#endif
+
+#if HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#ifdef __linux__
+#include <sys/personality.h>
+#endif
+
+#if defined(SYS_pivot_root)
+#define pivot_root(new_root, put_old) (syscall(SYS_pivot_root, new_root,put_old))
+#endif
+
+
+#define CLONE_ENABLED defined(CLONE_NEWNS)
+
+#if CLONE_ENABLED
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#endif
+
+namespace nix {
+
+
+/* Insert a new phase labelled ADDLABEL and running ADDACTION right after the
+   first phase of PHASES whose label equals AFTERLABEL.  Throws Error when no
+   phase has that label. */
+void addPhaseAfter(Phases & phases, string afterLabel, string addLabel, Action addAction)
+{
+    auto pos = phases.begin();
+    while(pos != phases.end() && !((*pos).label == afterLabel))
+        pos++;
+
+    if(pos == phases.end())
+        throw Error(format("label `%1%' not found in phases") % afterLabel);
+
+    Phase newPhase;
+    newPhase.label = addLabel;
+    newPhase.action = addAction;
+    /* insert() places the new element before the given position, so step
+       past the matching phase first. */
+    pos++;
+    phases.insert(pos, newPhase);
+}
+
+
+/* Insert a new phase labelled ADDLABEL and running ADDACTION right before the
+   first phase of PHASES whose label equals BEFORELABEL.  Throws Error when no
+   phase has that label. */
+void addPhaseBefore(Phases & phases, string beforeLabel, string addLabel, Action addAction)
+{
+    auto pos = phases.begin();
+    while(pos != phases.end() && !((*pos).label == beforeLabel))
+        pos++;
+
+    if(pos == phases.end())
+        throw Error(format("label `%1%' not found in phases") % beforeLabel);
+
+    Phase newPhase;
+    newPhase.label = addLabel;
+    newPhase.action = addAction;
+    phases.insert(pos, newPhase);
+}
+
+
+/* Put a phase labelled ADDLABEL and running ADDACTION at the very front of
+   PHASES. */
+void prependPhase(Phases & phases, string addLabel, Action addAction)
+{
+    Phase first;
+    first.action = addAction;
+    first.label = addLabel;
+    phases.insert(phases.begin(), first);
+}
+
+
+/* Put a phase labelled ADDLABEL and running ADDACTION at the very end of
+   PHASES. */
+void appendPhase(Phases & phases, string addLabel, Action addAction)
+{
+    Phase last;
+    last.action = addAction;
+    last.label = addLabel;
+    phases.push_back(last);
+}
+
+
+void deletePhase(Phases & phases, string delLabel)
+{
+ for(auto i = phases.begin(); i != phases.end(); i++)
+ if((*i).label == delLabel) {
+ phases.erase(i);
+ return;
+ }
+ throw Error(format("label `%1%' not found in phases") % delLabel);
+}
+
+
+/* Replace the action of the first phase of PHASES whose label equals
+   REPLACELABEL with NEWACTION, keeping its label and position.  Throws Error
+   when no phase has that label. */
+void replacePhase(Phases & phases, string replaceLabel, Action newAction)
+{
+    for(auto & phase : phases)
+        if(phase.label == replaceLabel) {
+            phase.action = newAction;
+            return;
+        }
+    /* The directive must be spelled `%1%', like in the sibling functions;
+       without the closing `%' the label is not substituted properly. */
+    throw Error(format("label `%1%' not found in phases") % replaceLabel);
+}
+
+
+/* A curated selection of predefined actions */
+
+/* Phase action: clear the global `_writeToStderr' hook.  CTX is not
+   consulted. */
+void reset_writeToStderrAction(SpawnContext & ctx)
+{
+    (void) ctx;
+    _writeToStderr = 0;
+}
+
+
+/* Phase action: call restoreAffinity().  CTX is not consulted. */
+void restoreAffinityAction(SpawnContext & ctx)
+{
+    (void) ctx;
+    restoreAffinity();
+}
+
+
+/* Phase action: when ctx.setsid is set, start a new session.  Throws
+   SysError if setsid(2) fails. */
+void setsidAction(SpawnContext & ctx)
+{
+    if(!ctx.setsid) return;
+
+    /* A new session implies a new process group, so signals directed at the
+       parent's group no longer reach this process.  The session starts out
+       without a controlling terminal, so terminal signals are not delivered
+       either and /dev/tty cannot be opened. */
+    if(setsid() == (pid_t)-1)
+        throw SysError("creating a new session");
+}
+
+
+/* Phase action: close every descriptor in ctx.earlyCloseFDs, then, if
+   ctx.logFD is not -1, make stdout and stderr refer to it, and finally, if
+   ctx.setStdin is set, make stdin refer either to ctx.stdinFD or to a freshly
+   opened ctx.stdinFile.  Throws SysError on the first failing system call. */
+void earlyIOSetupAction(SpawnContext & ctx)
+{
+    for(const auto & fdToClose : ctx.earlyCloseFDs)
+        if(close(fdToClose) == -1)
+            throw SysError("closing fd");
+
+    if(ctx.logFD != -1) {
+        if(dup2(ctx.logFD, STDOUT_FILENO) == -1)
+            throw SysError("cannot dup2 log fd into stdout fd");
+        if(dup2(ctx.logFD, STDERR_FILENO) == -1)
+            throw SysError("cannot dup2 log fd into stderr fd");
+    }
+
+    if(!ctx.setStdin) return;
+
+    if(ctx.stdinFD == -1) {
+        /* Opening stdin writable makes little sense, but is kept for
+           compatibility. */
+        AutoCloseFD opened = open(ctx.stdinFile.c_str(), O_RDWR);
+        if(opened == -1)
+            throw SysError(format("cannot open `%1%'") % ctx.stdinFile);
+        if(dup2(opened, STDIN_FILENO) == -1)
+            throw SysError("cannot dup2 fd into stdin fd");
+    }
+    else if(dup2(ctx.stdinFD, STDIN_FILENO) == -1)
+        throw SysError("cannot dup2 fd into stdin fd");
+}
+
+
+/* Phase action: when ctx.dropAmbientCapabilities is set, clear the ambient
+   capability set (for example a CAP_CHOWN that was granted when guix-daemon
+   was started).  Where <sys/prctl.h> is unavailable, throws Error instead. */
+void dropAmbientCapabilitiesAction(SpawnContext & ctx)
+{
+    if(!ctx.dropAmbientCapabilities) return;
+
+#if HAVE_SYS_PRCTL_H
+    prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0);
+#else
+    throw Error("dropping ambient capabilities is not supported on this system");
+#endif
+}
+
+
+/* Phase action: when ctx.doChroot is set, chroot(2) into ctx.chrootRootDir.
+   Throws SysError if that fails, or Error on systems without chroot. */
+void chrootAction(SpawnContext & ctx)
+{
+    if(!ctx.doChroot) return;
+
+#if HAVE_CHROOT
+    if(chroot(ctx.chrootRootDir.c_str()) == -1)
+        throw SysError(format("cannot change root directory to '%1%'") % ctx.chrootRootDir);
+#else
+    throw Error("chroot is not supported on this system");
+#endif
+}
+
+
+/* Phase action: when ctx.setcwd is set, change the working directory to
+   ctx.cwd.  Throws SysError if chdir(2) fails. */
+void chdirAction(SpawnContext & ctx)
+{
+    if(ctx.setcwd && chdir(ctx.cwd.c_str()) == -1)
+        throw SysError(format("changing into `%1%'") % ctx.cwd);
+}
+
+
+/* Phase action: when ctx.closeMostFDs is set, call closeMostFDs() with
+   ctx.preserveFDs as the exceptions.  Regardless of that flag, call
+   keepOnExec() on every descriptor in ctx.preserveFDs. */
+void closeMostFDsAction(SpawnContext & ctx)
+{
+    if(ctx.closeMostFDs)
+        closeMostFDs(ctx.preserveFDs);
+
+    for(const auto & kept : ctx.preserveFDs)
+        keepOnExec(kept);
+}
+
+
+/* Phase action: when ctx.setPersona is set, switch the process execution
+   domain to ctx.persona with personality(2).  Throws SysError on failure, or
+   Error on systems other than Linux. */
+void setPersonalityAction(SpawnContext & ctx)
+{
+    if(!ctx.setPersona) return;
+
+#ifdef __linux__
+    if(personality(ctx.persona) == -1)
+        throw SysError("cannot set personality");
+#else
+    throw Error("setting the personality is not supported on this system");
+#endif
+}
+
+
+/* Phase action (Linux only): when ctx.oomSacrifice is set, write the maximum
+   score to /proc/self/oom_score_adj.  Failures are passed to
+   ignoreException() rather than propagated. */
+void oomSacrificeAction(SpawnContext & ctx)
+{
+#ifdef __linux__
+    if(!ctx.oomSacrifice) return;
+
+    /* Volunteer this process and its children as the first victims when the
+       kernel runs out of memory, whoever is to blame, so that `real' user
+       data and state are preserved. */
+    try {
+        writeFile("/proc/self/oom_score_adj", "1000"); // 100%
+    } catch(...) {
+        ignoreException();
+    }
+#endif
+}
+
+
+/* Phase action: depending on the ctx.setSupplementaryGroups, ctx.setgid and
+   ctx.setuid flags, set the supplementary groups, then the group ID, then
+   the user ID.  After each ID change the real and effective IDs are read
+   back and compared.  Throws SysError on any failure or mismatch. */
+void setIDsAction(SpawnContext & ctx)
+{
+    if(ctx.setSupplementaryGroups &&
+       setgroups(ctx.supplementaryGroups.size(),
+                 ctx.supplementaryGroups.data()) == -1)
+        throw SysError("cannot set supplementary groups");
+
+    if(ctx.setgid) {
+        const bool groupSwitched =
+            setgid(ctx.group) != -1 &&
+            getgid() == ctx.group &&
+            getegid() == ctx.group;
+        if(!groupSwitched)
+            throw SysError("setgid failed");
+    }
+
+    if(ctx.setuid) {
+        const bool userSwitched =
+            setuid(ctx.user) != -1 &&
+            getuid() == ctx.user &&
+            geteuid() == ctx.user;
+        if(!userSwitched)
+            throw SysError("setuid failed");
+    }
+}
+
+
+/* Phase action: put SIGPIPE back to its default disposition; otherwise some
+   programs randomly report "Broken pipe".  Throws SysError if sigaction(2)
+   fails.  CTX is not consulted. */
+void restoreSIGPIPEAction(SpawnContext & ctx)
+{
+    struct sigaction defaultAction;
+    sigemptyset(&defaultAction.sa_mask);
+    defaultAction.sa_flags = 0;
+    defaultAction.sa_handler = SIG_DFL;
+
+    /* The previous disposition is of no interest here. */
+    if(sigaction(SIGPIPE, &defaultAction, nullptr) != 0)
+        throw SysError("resetting SIGPIPE");
+}
+
+
+/* Phase action: when ctx.signalSetupSuccess is set, write a single newline
+   to stderr. */
+void setupSuccessAction(SpawnContext & ctx)
+{
+    if(!ctx.signalSetupSuccess) return;
+    writeFull(STDERR_FILENO, "\n");
+}
+
+
+/* Phase action: replace the process image with ctx.program, passing ctx.args
+   as its argument vector.  With ctx.inheritEnv, the entries of ctx.env are
+   set on top of the current environment; otherwise the new environment is
+   made of the entries of ctx.env only.  Throws SysError if setenv or the
+   exec itself fails. */
+void execAction(SpawnContext & ctx)
+{
+    /* These two must outlive the exec call: `envp' may point into them. */
+    Strings envStrs;
+    std::vector<char *> envPtrs;
+    char **envp;
+
+    if(!ctx.inheritEnv) {
+        for(const auto & var : ctx.env)
+            envStrs.push_back(var.first + "=" + var.second);
+        envPtrs = stringsToCharPtrs(envStrs);
+        envp = envPtrs.data();
+    } else {
+        for(const auto & var : ctx.env)
+            if(setenv(var.first.c_str(), var.second.c_str(), 1) == -1)
+                throw SysError("setenv");
+        envp = environ;
+    }
+
+    auto argv = stringsToCharPtrs(ctx.args);
+    if(execvpe(ctx.program.c_str(), argv.data(), envp) == -1)
+        throw SysError(format("executing `%1%'") % ctx.program);
+}
+
+
+/* Return the default, ordered list of phases for spawning a child without
+   clone-specific setup; the last phase executes the program. */
+Phases getBasicSpawnPhases()
+{
+    Phases phases;
+    appendPhase(phases, "reset_writeToStderr", reset_writeToStderrAction);
+    appendPhase(phases, "restoreAffinity", restoreAffinityAction);
+    appendPhase(phases, "setsid", setsidAction);
+    appendPhase(phases, "earlyIOSetup", earlyIOSetupAction);
+    appendPhase(phases, "dropAmbientCapabilities", dropAmbientCapabilitiesAction);
+    appendPhase(phases, "chroot", chrootAction);
+    appendPhase(phases, "chdir", chdirAction);
+    appendPhase(phases, "closeMostFDs", closeMostFDsAction);
+    appendPhase(phases, "setPersonality", setPersonalityAction);
+    appendPhase(phases, "oomSacrifice", oomSacrificeAction);
+    appendPhase(phases, "setIDs", setIDsAction);
+    appendPhase(phases, "restoreSIGPIPE", restoreSIGPIPEAction);
+    appendPhase(phases, "setupSuccess", setupSuccessAction);
+    appendPhase(phases, "exec", execAction);
+    return phases;
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWUSER, close
+   ctx.earlyCloseFDs and then block until the message "go\n" arrives on
+   ctx.setupFD.  SCTX must actually be a CloneSpawnContext.  Does nothing
+   when clone support is compiled out. */
+void usernsInitSyncAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWUSER) == 0) return;
+
+    /* These have to be closed before anything is read. */
+    for(const auto & fdToClose : ctx.earlyCloseFDs)
+        if(close(fdToClose) == -1)
+            throw SysError("closing fd");
+    /* Forget them so that no later phase closes them a second time. */
+    ctx.earlyCloseFDs.clear();
+
+    /* The parent process sets up the UID/GID mapping of our user namespace
+       and tells us when it is done. */
+    waitForMessage(ctx.setupFD, "go\n");
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWUSER, switch to the
+   group ctx.usernsGroup (if ctx.usernsSetgid) and to the user ctx.usernsUser
+   (if ctx.usernsSetuid) inside the new user namespace.  SCTX must actually
+   be a CloneSpawnContext.  Throws SysError if setgid or setuid fails.  Does
+   nothing when clone support is compiled out. */
+void usernsSetIDsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWUSER) == 0) return;
+
+    /* Change the group first, in the same order as setIDsAction.  Changing
+       the user may drop capabilities (including CAP_SETGID), after which
+       setgid would fail with EPERM; changing the group never gets in the
+       way of the following setuid. */
+    if(ctx.usernsSetgid)
+        /* Since we presumably have CAP_SETGID, this sets the real,
+           effective, saved, and filesystem gids. */
+        if(setgid(ctx.usernsGroup) != 0)
+            throw SysError("setgid");
+
+    /* Note: 'man capabilities' says that a transition from zero to nonzero
+       uids causes capabilities to be lost, but doesn't say what happens
+       when a transition from an unmapped (possibly zero) uid to a nonzero
+       uid happens. */
+    if(ctx.usernsSetuid)
+        /* Ditto but with uids, presuming CAP_SETUID. */
+        if(setuid(ctx.usernsUser) != 0)
+            throw SysError("setuid");
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWNET and
+   ctx.initLoopback is set, bring up the "lo" interface.  SCTX must actually
+   be a CloneSpawnContext.  Throws SysError on failure.  Does nothing when
+   clone support is compiled out. */
+void initLoopbackAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNET) == 0 || !ctx.initLoopback) return;
+
+    AutoCloseFD sock(socket(PF_INET, SOCK_DGRAM, IPPROTO_IP));
+    if(sock == -1)
+        throw SysError("cannot open IP socket");
+
+    struct ifreq request;
+    strcpy(request.ifr_name, "lo");
+    request.ifr_flags = IFF_UP | IFF_LOOPBACK | IFF_RUNNING;
+    if(ioctl(sock, SIOCSIFFLAGS, &request) == -1)
+        throw SysError("cannot set loopback interface flags");
+
+    sock.close();
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWUTS, set the host
+   name to ctx.hostname and the domain name to ctx.domainname.  SCTX must
+   actually be a CloneSpawnContext.  Throws SysError on failure.  Does
+   nothing when clone support is compiled out. */
+void setHostAndDomainAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWUTS) == 0) return;
+
+    const char * host = ctx.hostname.c_str();
+    if(sethostname(host, strlen(host)) == -1)
+        throw SysError("cannot set host name");
+
+    const char * domain = ctx.domainname.c_str();
+    if(setdomainname(domain, strlen(domain)) == -1)
+        throw SysError("cannot set domain name");
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWNS, remount `/'
+   recursively with MS_PRIVATE.  SCTX must actually be a CloneSpawnContext.
+   Throws SysError on failure.  Does nothing when the needed mount support
+   is compiled out. */
+void makeFilesystemsPrivateAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_REC) && defined(MS_PRIVATE)
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0) return;
+
+    if(mount(0, "/", 0, MS_REC|MS_PRIVATE, 0) == -1)
+        throw SysError("unable to make `/' private mount");
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWNS and ctx.doChroot
+   is set, turn ctx.chrootRootDir into a mount point of its own, which
+   pivot_root needs.  This is done either by mounting a tmpfs on it
+   (ctx.mountTmpfsOnChroot) or by bind-mounting the directory onto itself.
+   SCTX must actually be a CloneSpawnContext.  Throws SysError on failure. */
+void makeChrootSeparateFilesystemAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0 || !ctx.doChroot) return;
+
+    const char * root = ctx.chrootRootDir.c_str();
+
+    if(!ctx.mountTmpfsOnChroot) {
+        /* Self bind mount: same contents, but a filesystem distinct
+           from `/'. */
+        if(mount(root, root, 0, MS_BIND, 0) == -1)
+            throw SysError(format("unable to bind mount ‘%1%’") % ctx.chrootRootDir);
+        return;
+    }
+
+    if(mount("none", root, "tmpfs", 0, 0) == -1)
+        throw SysError(format("unable to mount tmpfs on `%1%'") % ctx.chrootRootDir);
+#endif
+}
+
+
+/* Translate the ST_* bits found in F_FLAGS (as reported by statvfs) into the
+   corresponding MS_* mount flags.  Only pairs for which both macros are
+   defined are translated.  Throws Error where <sys/mount.h> or statvfs is
+   unavailable. */
+static int statfsToMountFlags(int f_flags)
+{
+#if HAVE_SYS_MOUNT_H && HAVE_STATVFS
+    int mountFlags = 0;
+#if defined(ST_RDONLY) && defined(MS_RDONLY)
+    if(f_flags & ST_RDONLY) mountFlags |= MS_RDONLY;
+#endif
+#if defined(ST_NOSUID) && defined(MS_NOSUID)
+    if(f_flags & ST_NOSUID) mountFlags |= MS_NOSUID;
+#endif
+#if defined(ST_NODEV) && defined(MS_NODEV)
+    if(f_flags & ST_NODEV) mountFlags |= MS_NODEV;
+#endif
+#if defined(ST_NOEXEC) && defined(MS_NOEXEC)
+    if(f_flags & ST_NOEXEC) mountFlags |= MS_NOEXEC;
+#endif
+#if defined(ST_NOATIME) && defined(MS_NOATIME)
+    if(f_flags & ST_NOATIME) mountFlags |= MS_NOATIME;
+#endif
+#if defined(ST_NODIRATIME) && defined(MS_NODIRATIME)
+    if(f_flags & ST_NODIRATIME) mountFlags |= MS_NODIRATIME;
+#endif
+#if defined(ST_RELATIME) && defined(MS_RELATIME)
+    if(f_flags & ST_RELATIME) mountFlags |= MS_RELATIME;
+#endif
+    return mountFlags;
+#else
+    throw Error("statfsToMountFlags not supported on this platform");
+#endif
+}
+
+
+/* Make SOURCE visible at TARGET.  Directories and other non-symlink files
+   are recursively bind-mounted onto a freshly created directory or empty
+   file; a symlink is copied instead of mounted.  With READONLY, the bind
+   mount is then remounted read-only.  Throws SysError when a system call
+   fails.  Compiles to a no-op where MS_BIND is unavailable. */
+void bindMount(Path source, Path target, bool readOnly)
+{
+#if HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    struct stat sourceInfo;
+    if(lstat(source.c_str(), &sourceInfo) == -1)
+        throw SysError(format("getting attributes of path `%1%'") % source);
+
+    if(S_ISLNK(sourceInfo.st_mode)) {
+        /* A bind mount would follow the link and expose what it points to,
+           not the link itself, so recreate the link at the target. */
+        createDirs(dirOf(target));
+        createSymlink(readLink(source), target);
+        return;
+    }
+
+    /* Create the mount point: a directory for a directory, an empty file
+       for anything else. */
+    if(S_ISDIR(sourceInfo.st_mode))
+        createDirs(target);
+    else {
+        createDirs(dirOf(target));
+        writeFile(target, "");
+    }
+
+    /* MS_REC is needed: in an unprivileged mount namespace a non-recursive
+       bind mount may fail with EINVAL when it would reveal subtrees that
+       had been covered up. */
+    if(mount(source.c_str(), target.c_str(), 0, MS_BIND|MS_REC, 0) == -1)
+        throw SysError(format("bind mount from `%1%' to `%2%' failed") % source % target);
+
+    if(!readOnly) return;
+
+#if defined(MS_REMOUNT) && defined(MS_RDONLY)
+    /* Flags given along with the initial MS_BIND are ignored, hence this
+       separate MS_REMOUNT. */
+    unsigned long remountFlags = MS_BIND | MS_REMOUNT | MS_RDONLY;
+#if HAVE_STATVFS
+    /* MS_BIND | MS_REMOUNT sets all mount point flags, so carry over the
+       existing ones; otherwise we may get EPERM, for example because the
+       remount would try to clear the nosuid flag. */
+    struct statvfs targetInfo;
+    if(statvfs(target.c_str(), &targetInfo) == -1)
+        throw SysError(format("statvfs of `%1%'") % target);
+    remountFlags |= statfsToMountFlags(targetInfo.f_flag);
+#endif
+
+    if(mount(source.c_str(), target.c_str(), 0, remountFlags, 0) == -1)
+        throw SysError(format("read-only remount of `%1%' failed") % target);
+#else
+    throw Error("remounting read-only is not supported on this platform");
+#endif
+#endif
+}
+
+
+/* Bind-mount every entry of FILESINCHROOT into the chroot: each key is the
+   path inside the chroot (appended to CHROOTROOTDIR), each value the source
+   path outside.  Entries whose key is in READONLYFILES are mounted
+   read-only.  Throws Error where bind mounts are unavailable. */
+void mountIntoChroot(std::map<Path, Path> filesInChroot,
+                     set<Path> readOnlyFiles,
+                     Path chrootRootDir)
+{
+#if HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    for(const auto & entry : filesInChroot) {
+        const bool readOnly = readOnlyFiles.count(entry.first) != 0;
+        bindMount(entry.second, chrootRootDir + entry.first, readOnly);
+    }
+#else
+    throw Error("bind mounting not supported on this platform");
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWNS and ctx.doChroot
+   is set, bind-mount ctx.filesInChroot into ctx.chrootRootDir, read-only
+   for the entries listed in ctx.readOnlyFilesInChroot.  SCTX must actually
+   be a CloneSpawnContext. */
+void mountIntoChrootAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0 || !ctx.doChroot) return;
+
+    mountIntoChroot(ctx.filesInChroot, ctx.readOnlyFilesInChroot, ctx.chrootRootDir);
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWNS and ctx.mountProc
+   is set, mount a proc filesystem on /proc, below ctx.chrootRootDir if
+   ctx.doChroot is set.  SCTX must actually be a CloneSpawnContext.  Throws
+   SysError if the mount fails. */
+void mountProcAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0 || !ctx.mountProc) return;
+
+    Path procDir;
+    if(ctx.doChroot) procDir = ctx.chrootRootDir;
+    procDir += "/proc";
+
+    createDirs(procDir);
+    if(mount("none", procDir.c_str(), "proc", 0, 0) == -1)
+        throw SysError(format("mounting `%1%'") % procDir);
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWNS and
+   ctx.mountDevshm is set, mount a tmpfs on /dev/shm, below
+   ctx.chrootRootDir if ctx.doChroot is set.  SCTX must actually be a
+   CloneSpawnContext.  Throws SysError if the mount fails. */
+void mountDevshmAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0 || !ctx.mountDevshm) return;
+
+    Path shmDir;
+    if(ctx.doChroot) shmDir = ctx.chrootRootDir;
+    shmDir += "/dev/shm";
+
+    createDirs(shmDir);
+    if(mount("none", shmDir.c_str(), "tmpfs", 0, 0) == -1)
+        throw SysError(format("mounting `%1%'") % shmDir);
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWNS and
+   ctx.maybeMountDevpts is set, and no /dev/ptmx exists yet, mount a new
+   devpts instance on /dev/pts, make /dev/ptmx a symlink to /dev/pts/ptmx and
+   make the latter world-writable.  All paths are below ctx.chrootRootDir if
+   ctx.doChroot is set.  SCTX must actually be a CloneSpawnContext.  Throws
+   SysError on failure. */
+void mountDevptsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0 || !ctx.maybeMountDevpts) return;
+
+    Path root;
+    if(ctx.doChroot) root = ctx.chrootRootDir;
+    Path ptsDir = root + "/dev/pts";
+
+    /* Leave an already present ptmx alone. */
+    if(pathExists(root + "/dev/ptmx")) return;
+
+    createDirs(ptsDir);
+    if(mount("none", ptsDir.c_str(), "devpts", 0, "newinstance,mode=0620") == -1)
+        throw SysError(format("mounting `%1%'") % ptsDir);
+    createSymlink("/dev/pts/ptmx", root + "/dev/ptmx");
+
+    /* Some Linux versions create /dev/pts/ptmx with permissions 0; make
+       sure it is world-writable. */
+    Path ptmxNode = root + "/dev/pts/ptmx";
+    if(chmod(ptmxNode.c_str(), 0666) == -1)
+        throw SysError(format("setting permissions on `%1%'") % ptmxNode);
+#endif
+}
+
+
+/* Phase action: when the child was cloned with CLONE_NEWNS and ctx.doChroot
+   is set, make ctx.chrootRootDir the root of the mount namespace with
+   pivot_root, then detach and remove the old root.  SCTX must actually be a
+   CloneSpawnContext.  Throws SysError as soon as one step fails. */
+void pivotRootAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0 || !ctx.doChroot) return;
+
+    /* Everything below is relative to the new root directory. */
+    if(chdir(ctx.chrootRootDir.c_str()) == -1)
+        throw SysError(format("cannot change directory to '%1%'") % ctx.chrootRootDir);
+
+    /* Temporary directory that receives the old root. */
+    if(mkdir("real-root", 0) == -1)
+        throw SysError("cannot create real-root directory");
+
+    if(pivot_root(".", "real-root") == -1)
+        throw SysError(format("cannot pivot old root directory onto '%1%'") % (ctx.chrootRootDir + "/real-root"));
+
+    if(chroot(".") == -1)
+        throw SysError(format("cannot change root directory to '%1%'") % ctx.chrootRootDir);
+
+    /* Get rid of the old root and of its temporary mount point. */
+    if(umount2("real-root", MNT_DETACH) == -1)
+        throw SysError("cannot unmount real root filesystem");
+
+    if(rmdir("real-root") == -1)
+        throw SysError("cannot remove real-root directory");
+#endif
+}
+
+
+/* Turn the newline-separated id map MAP, whose lines consist of
+   space-separated fields, into a map where each line reads "FIRST FIRST
+   THIRD": the first field of the input line twice, followed by its third
+   field.  A line with fewer than three fields makes at() throw. */
+string idMapToIdentityMap(const string & map)
+{
+    string result;
+
+    for(const string & line : tokenizeString<std::vector<string> >(map, "\n")) {
+        const std::vector<string> fields =
+            tokenizeString<std::vector<string> >(line, " ");
+        const string & first = fields.at(0);
+        const string & third = fields.at(2);
+        result.append(first + " " + first + " " + third + "\n");
+    }
+
+    return result;
+}
+
+
+/* Initializing a user namespace with more than one id mapped requires
+ * capabilities in the *parent* user namespace, which may not even have any
+ * processes in it after unshare is called. So fork a child and have it do
+ * the initialization. */
+/* Call unshare(FLAGS) in the current process and have a forked helper write
+   UIDMAP, the setgroups policy ("allow" if ALLOWSETGROUPS, else "deny") and
+   GIDMAP to this process's /proc/PID/{uid_map,setgroups,gid_map}.  The two
+   processes synchronize over a pair of pipes: we send "ready\n" once
+   unshare has succeeded, the helper answers "go\n" once the maps are
+   written.  Throws SysError if fork, unshare or waitpid fails, and Error if
+   the helper does not terminate successfully.  Compiles to a no-op when
+   clone support is unavailable. */
+void unshareAndInitUserns(int flags, const string & uidMap,
+                          const string & gidMap, bool allowSetgroups)
+{
+#if CLONE_ENABLED
+    pid_t pid_ = getpid();
+    string pid = std::to_string(pid_);
+    Pipe toChild;
+    Pipe fromChild;
+    toChild.create();
+    fromChild.create();
+    pid_t child = fork();
+    if(child == -1)
+        throw SysError("creating child process");
+    if(child == 0) {
+        try {
+            toChild.writeSide.close();
+            fromChild.readSide.close();
+            /* Wait until the parent has entered its new namespaces. */
+            waitForMessage(toChild.readSide, "ready\n");
+            writeFile("/proc/" + pid + "/uid_map", uidMap);
+            writeFile("/proc/" + pid + "/setgroups",
+                      allowSetgroups ? "allow" : "deny");
+            writeFile("/proc/" + pid + "/gid_map", gidMap);
+            writeFull(fromChild.writeSide, (unsigned char*)"go\n", 3);
+        } catch(...) {
+            /* Don't unwind the stack in case of exception, halt
+             * immediately. */
+            _exit(1);
+        }
+        _exit(EXIT_SUCCESS);
+    } else {
+        toChild.readSide.close();
+        fromChild.writeSide.close();
+        if(unshare(flags) == -1)
+            throw SysError("unshare");
+        writeFull(toChild.writeSide, (unsigned char*)"ready\n", 6);
+        waitForMessage(fromChild.readSide, "go\n");
+        int status;
+        while(waitpid(child, &status, 0) == -1) {
+            if(errno != EINTR)
+                throw SysError("reaping userns init process");
+        }
+        /* WEXITSTATUS is only meaningful for a normal exit, so report a
+           death by signal separately instead of a bogus exit status. */
+        if(WIFSIGNALED(status))
+            throw Error(format("userns init child was killed by signal %1%") % WTERMSIG(status));
+        if(!(WIFEXITED(status) != 0 && WEXITSTATUS(status) == EXIT_SUCCESS))
+            throw Error(format("userns init child exited with status %1%") % WEXITSTATUS(status));
+    }
+#endif
+}
+
+
+/* Phase action: when ctx.lockMounts is set, enter a fresh user and mount
+   namespace so that the mounts inherited from the previous mount namespace
+   become locked together, then verify that they are.  With
+   ctx.lockMountsMapAll every range of the current id maps is mapped onto
+   itself; otherwise only the current uid and gid are mapped.  SCTX must
+   actually be a CloneSpawnContext.  Throws Error or SysError if the
+   verification fails. */
+void lockMountsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if(!ctx.lockMounts) return;
+
+    string uidMap;
+    string gidMap;
+    if(ctx.lockMountsMapAll) {
+        string oldUidMap = readFile("/proc/self/uid_map", true);
+        string oldGidMap = readFile("/proc/self/gid_map", true);
+        uidMap = idMapToIdentityMap(oldUidMap);
+        gidMap = idMapToIdentityMap(oldGidMap);
+    } else {
+        string uid = std::to_string(getuid());
+        string gid = std::to_string(getgid());
+        uidMap = uid + " " + uid + " 1";
+        gidMap = gid + " " + gid + " 1";
+    }
+    unshareAndInitUserns(CLONE_NEWNS | CLONE_NEWUSER,
+                         uidMap, gidMap, ctx.lockMountsAllowSetgroups);
+
+    /* Check that mounts inherited in our new mount namespace are "locked"
+       together and cannot be separated from within our mount namespace.
+       Since umount(2) is documented to fail with EINVAL when attempting to
+       unmount one of the mounts that are locked together, check that this
+       is what we get.  This is a security check, so it must not be an
+       assert(), which is compiled out when NDEBUG is defined. */
+    int ret = umount("/proc");
+    if(ret != -1)
+        throw Error("unmounting `/proc' unexpectedly succeeded: mounts are not locked");
+    if(errno != EINVAL)
+        throw SysError("checking that mounts are locked by unmounting `/proc'");
+#endif
+}
+
+
+/* Return the default, ordered list of phases for spawning a child through
+   clone, including the namespace, mount and chroot setup phases; the last
+   phase executes the program.  Throws Error when clone support is
+   unavailable. */
+Phases getCloneSpawnPhases()
+{
+#if CLONE_ENABLED
+    Phases phases;
+    appendPhase(phases, "reset_writeToStderr", reset_writeToStderrAction);
+    appendPhase(phases, "usernsInitSync", usernsInitSyncAction);
+    appendPhase(phases, "usernsSetIDs", usernsSetIDsAction);
+    appendPhase(phases, "restoreAffinity", restoreAffinityAction);
+    appendPhase(phases, "setsid", setsidAction);
+    appendPhase(phases, "earlyIOSetup", earlyIOSetupAction);
+    appendPhase(phases, "dropAmbientCapabilities", dropAmbientCapabilitiesAction);
+    appendPhase(phases, "initLoopback", initLoopbackAction);
+    appendPhase(phases, "setHostAndDomain", setHostAndDomainAction);
+    appendPhase(phases, "makeFilesystemsPrivate", makeFilesystemsPrivateAction);
+    appendPhase(phases, "makeChrootSeparateFilesystem", makeChrootSeparateFilesystemAction);
+    appendPhase(phases, "mountIntoChroot", mountIntoChrootAction);
+    appendPhase(phases, "mountProc", mountProcAction);
+    appendPhase(phases, "mountDevshm", mountDevshmAction);
+    appendPhase(phases, "mountDevpts", mountDevptsAction);
+    /* The "chroot" phase is carried out with pivot_root here. */
+    appendPhase(phases, "chroot", pivotRootAction);
+    appendPhase(phases, "chdir", chdirAction);
+    appendPhase(phases, "closeMostFDs", closeMostFDsAction);
+    appendPhase(phases, "setPersonality", setPersonalityAction);
+    appendPhase(phases, "oomSacrifice", oomSacrificeAction);
+    /* Being put in a user namespace with only the current ids mapped would
+       tend to prevent switching to other ones, but if this comes after
+       setIDs then the per-process "dumpable" flag may be reset, which will
+       cause /proc/self to become root-owned, making /proc/self/uid_map
+       inaccessible.  If you need lockMounts to preserve the id mappings,
+       and you have the necessary capabilities in the parent user namespace,
+       set CloneSpawnContext.lockMountsMapAll = true. */
+    appendPhase(phases, "lockMounts", lockMountsAction);
+    appendPhase(phases, "setIDs", setIDsAction);
+    appendPhase(phases, "restoreSIGPIPE", restoreSIGPIPEAction);
+    appendPhase(phases, "setupSuccess", setupSuccessAction);
+    appendPhase(phases, "exec", execAction);
+    return phases;
+#else
+    throw Error("clone not supported on this platform");
+#endif
+}
+
+
+/* Run the actions of ctx.phases one after the other, starting with the
+   first one and keeping ctx.currentPhase up to date.  This function does
+   not return: if an action (or running past the last phase) raises a
+   std::exception, a diagnostic naming the current phase is written to
+   stderr and the process exits with status 1. */
+void runChildSetup(SpawnContext & ctx)
+{
+    ctx.currentPhase = 0;
+    try {
+        /* One of the phases is expected to replace the process image, so
+           this loop is not left regularly. */
+        for(;;) {
+            ctx.phases.at(ctx.currentPhase).action(ctx);
+            ctx.currentPhase++;
+        }
+    } catch (std::exception & e) {
+        try {
+            string message = "while setting up the child process: ";
+            if(ctx.currentPhase < (ssize_t)ctx.phases.size())
+                message += "in phase " + ctx.phases[ctx.currentPhase].label + ": ";
+            message += string(e.what()) + "\n";
+            writeFull(STDERR_FILENO, message);
+        } catch (std::exception &) {
+            /* Reporting failed as well; all that is left is to exit. */
+            _exit(1);
+        }
+        _exit(1);
+    }
+    abort(); /* Should never be reached */
+}
+
+
+/* Entry point with the signature expected by clone(): DATA points to the
+   SpawnContext to run.  runChildSetup is not expected to return; 1 is
+   returned should it ever do so. */
+int runChildSetupEntry(void *data)
+{
+    SpawnContext * ctx = static_cast<SpawnContext *>(data);
+    runChildSetup(*ctx);
+    return 1;
+}
+
+
+/* Create a child with clone() using ctx.cloneFlags; the child runs
+   runChildSetupEntry on CTX, on a 32 KiB stack buffer local to this
+   function.  Returns the value returned by clone().  Throws SysError if
+   clone fails. */
+int cloneChild(CloneSpawnContext & ctx)
+{
+    char stack[32 * 1024];
+    /* The stack pointer handed to clone() is near the top of the buffer and
+       must be properly aligned; aarch64 requires 16 bytes. */
+    const uintptr_t nearTop = (uintptr_t)stack + sizeof(stack) - 8;
+    char *alignedStack = (char *)(nearTop & ~(uintptr_t)0xf);
+
+    int child = clone(runChildSetupEntry, alignedStack, ctx.cloneFlags, (void *) &ctx);
+    if(child == -1)
+        throw SysError("clone");
+    return child;
+}
+
+}