summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorReepca Russelstein <reepca@russelstein.xyz>2025-04-17 23:32:03 -0500
committerJohn Kehayias <john.kehayias@protonmail.com>2025-06-24 10:07:56 -0400
commitbe8aca065118aa4485c02f991c51bea89034defa (patch)
tree79331f4087c66b62b47aa98100d57cb43e604163
parent7173c2c0cad8afc9d8d1ad26f345b5a04f47716a (diff)
daemon: add and use spawn.cc and spawn.hh.
This adds a mechanism for manipulating and running "spawn phases" similarly to how builder-side code manipulates "build phases". The main difference is that spawn phases take a (reference to a) single structure that they can both read from and write to, with their writes being visible to subsequent phases. The base structure type for this is SpawnContext. It also adds some predefined phase sequences, namely basicSpawnPhases and cloneSpawnPhases, and exposes each of the actions performed by these phases. Finally, it modifies build.cc to replace runChild() with use of this new code. * nix/libutil/util.cc (keepOnExec, waitForMessage): new functions. * nix/libutil/util.hh (keepOnExec, waitForMessage): add prototypes. * nix/libutil/spawn.cc, nix/libutil/spawn.hh: new files. (addPhaseAfter, addPhaseBefore, prependPhase, appendPhase, deletePhase, replacePhase, reset_writeToStderrAction, restoreAffinityAction, setsidAction, earlyIOSetupAction, dropAmbientCapabilitiesAction, chrootAction, chdirAction, closeMostFDsAction, setPersonalityAction, oomSacrificeAction, setIDsAction, restoreSIGPIPEAction, setupSuccessAction, execAction, getBasicSpawnPhases, usernsInitSyncAction, usernsSetIDsAction, initLoopbackAction, setHostAndDomainAction, makeFilesystemsPrivateAction, makeChrootSeparateFilesystemAction, statfsToMountFlags, bindMount, mountIntoChroot, mountIntoChrootAction, mountProcAction, mountDevshmAction, mountDevptsAction, pivotRootAction, lockMountsAction, getCloneSpawnPhases, runChildSetup, runChildSetupEntry, cloneChild, idMapToIdentityMap, unshareAndInitUserns): new procedures. * nix/local.mk (libutil_a_SOURCES): add spawn.cc. (libutil_headers): add spawn.hh. * nix/libstore/build.cc (restoreSIGPIPE, DerivationGoal::runChild, childEntry): removed procedures. (DerivationGoal::{dirsInChroot,env,readiness}): removed. (execBuilderOrBuiltin, execBuilderOrBuiltinAction, clearRootWritePermsAction): new procedures. 
(DerivationGoal::startBuilder): modified to use a CloneSpawnContext if chroot builds are available, otherwise a SpawnContext. Change-Id: Ifd50110de077378ee151502eda62b99973d083bf Change-Id: I76e10d3f928cc30566e1e6ca79077196972349f8 spawn.cc, util.cc, util.hh changes Change-Id: I287320e63197cb4f65665ee5b3fdb3a0e125ebac Signed-off-by: John Kehayias <john.kehayias@protonmail.com>
-rw-r--r--nix/libstore/build.cc788
-rw-r--r--nix/libutil/spawn.cc829
-rw-r--r--nix/libutil/spawn.hh164
-rw-r--r--nix/libutil/util.cc19
-rw-r--r--nix/libutil/util.hh5
-rw-r--r--nix/local.mk6
6 files changed, 1303 insertions, 508 deletions
diff --git a/nix/libstore/build.cc b/nix/libstore/build.cc
index afdcd9518b..51f5aed106 100644
--- a/nix/libstore/build.cc
+++ b/nix/libstore/build.cc
@@ -9,6 +9,7 @@
#include "archive.hh"
#include "affinity.hh"
#include "builtins.hh"
+#include "spawn.hh"
#include <map>
#include <sstream>
@@ -402,22 +403,6 @@ void Goal::trace(const format & f)
}
-
-//////////////////////////////////////////////////////////////////////
-
-
-/* Restore default handling of SIGPIPE, otherwise some programs will
- randomly say "Broken pipe". */
-static void restoreSIGPIPE()
-{
- struct sigaction act, oact;
- act.sa_handler = SIG_DFL;
- act.sa_flags = 0;
- sigemptyset(&act.sa_mask);
- if (sigaction(SIGPIPE, &act, &oact)) throw SysError("resetting SIGPIPE");
-}
-
-
//////////////////////////////////////////////////////////////////////
@@ -679,12 +664,6 @@ private:
typedef void (DerivationGoal::*GoalState)();
GoalState state;
- /* Stuff we need to pass to runChild(). */
- typedef map<Path, Path> DirsInChroot; // maps target path to source path
- DirsInChroot dirsInChroot;
- typedef map<string, string> Environment;
- Environment env;
-
/* Hash rewriting. */
HashRewrites rewritesToTmp, rewritesFromTmp;
typedef map<Path, Path> RedirectedOutputs;
@@ -753,14 +732,9 @@ private:
/* Start building a derivation. */
void startBuilder();
- /* Run the builder's process. */
- void runChild();
-
- friend int childEntry(void *);
+ void execBuilderOrBuiltin(SpawnContext &);
- /* Pipe to notify readiness to the child process when using unprivileged
- user namespaces. */
- Pipe readiness;
+ friend void execBuilderOrBuiltinAction(SpawnContext &);
/* Check that the derivation outputs all exist and register them
as valid. */
@@ -1630,13 +1604,6 @@ void chmod_(const Path & path, mode_t mode)
}
-int childEntry(void * arg)
-{
- ((DerivationGoal *) arg)->runChild();
- return 1;
-}
-
-
/* UID and GID of the build user inside its own user namespace. */
static const uid_t guestUID = 30001;
static const gid_t guestGID = 30000;
@@ -1655,6 +1622,105 @@ static void initializeUserNamespace(pid_t child,
(format("%d %d 1") % guestGID % hostGID).str());
}
+#if CHROOT_ENABLED
+/* Spawn phase that clears every write permission bit on "/" by setting its
+ mode to 0555; a SysError is thrown if chmod() fails.  startBuilder()
+ inserts it under the name "clearRootWritePerms" right after the "chroot"
+ phase.  SCTX is not consulted.  chmod() is used rather than a read-only
+ remount so that writes fail with EACCES and not EROFS, which some test
+ suites expect. */
+void clearRootWritePermsAction(SpawnContext & sctx)
+{
+ if(chmod("/", 0555) == -1)
+ throw SysError("changing mode of chroot root directory");
+}
+
+#endif /* CHROOT_ENABLED */
+
+/* Return true if SYSTEM1 and SYSTEM2 name the same operating system kernel,
+ i.e. if they agree from their first hyphen onward ("-linux" in system
+ types such as "i686-linux").  A system type without any hyphen has no
+ kernel part to compare, so in that case the two strings must be equal as
+ a whole; this avoids calling substr() with npos, which would throw
+ std::out_of_range. */
+static bool sameOperatingSystemKernel(const std::string& system1, const std::string& system2)
+{
+ const auto hyphen1 = system1.find('-');
+ const auto hyphen2 = system2.find('-');
+ if (hyphen1 == std::string::npos || hyphen2 == std::string::npos)
+ return system1 == system2;
+ return system1.compare(hyphen1, std::string::npos,
+ system2, hyphen2, std::string::npos) == 0;
+}
+
+
+/* Run the builder of DRV; startBuilder() installs this (via
+ execBuilderOrBuiltinAction) in place of the "exec" spawn phase.
+
+ For a built-in builder, the builtin is run in this process, which then
+ terminates with _exit(): status 0 on success, status 1 after writing the
+ error message to stderr.  Otherwise CTX.program is canonicalized, checked
+ to be inside the store and, if DRV targets this machine's operating
+ system kernel, passed to execAction().
+
+ This never returns normally: every path ends in _exit(), in a successful
+ exec, or in a thrown exception. */
+void DerivationGoal::execBuilderOrBuiltin(SpawnContext & ctx)
+{
+ if(isBuiltin(drv)) {
+ /* Note: must not return from this block */
+ try {
+ logType = ltFlat;
+
+ auto buildDrv = lookupBuiltinBuilder(drv.builder);
+ if (buildDrv != NULL) {
+ /* Check what the output file name is. When doing a 'bmCheck'
+ build, the output file name is different from that
+ specified in DRV due to hash rewriting. */
+ Path output = drv.outputs["out"].path;
+ auto redirected = redirectedOutputs.find(output);
+ if (redirected != redirectedOutputs.end())
+ output = redirected->second;
+
+ buildDrv(drv, drvPath, output);
+ }
+ else
+ /* The name reported is DRV.builder from offset 8 onward; presumably
+ this drops a "builtin:" prefix (8 characters) -- TODO confirm. */
+ throw Error(format("unsupported builtin function '%1%'") % string(drv.builder, 8));
+ _exit(0);
+ } catch (std::exception & e) {
+ writeFull(STDERR_FILENO, "error: " + string(e.what()) + "\n");
+ _exit(1);
+ }
+ }
+ /* Ensure that the builder is within the store. This prevents users from
+ using /proc/self/exe (or a symlink to it) as their builder, which could
+ allow them to overwrite the guix-daemon binary (CVE-2019-5736).
+
+ This attack is possible even if the target of /proc/self/exe is outside
+ the chroot (it's as if it were a hard link), though it requires that
+ its ELF interpreter and dependencies be in the chroot.
+
+ Note: 'canonPath' throws if 'ctx.program' cannot be resolved within the
+ chroot. */
+ ctx.program = canonPath(ctx.program, true);
+ if(!isInStore(ctx.program))
+ throw Error(format("derivation builder `%1%' is outside the store") % ctx.program);
+ /* If DRV targets the same operating system kernel, try to execute it:
+ there might be binfmt_misc set up for user-land emulation of other
+ architectures. However, if it targets a different operating
+ system--e.g., "i586-gnu" vs. "x86_64-linux"--do not try executing it:
+ the ELF file for that OS is likely indistinguishable from a native ELF
+ binary and it would just crash at run time. */
+ int error;
+ if (sameOperatingSystemKernel(drv.platform, settings.thisSystem)) {
+ try {
+ execAction(ctx);
+ /* Only reached if execAction() came back without throwing, i.e. the
+ program was not executed; keep the error code it left in errno. */
+ error = errno;
+ } catch(SysError & e) {
+ error = e.errNo;
+ }
+ } else {
+ error = ENOEXEC;
+ }
+ /* Right platform? Check this after we've tried 'execve' to allow for
+ transparent emulation of different platforms with binfmt_misc handlers
+ that invoke QEMU. */
+ if (error == ENOEXEC && !canBuildLocally(drv.platform)) {
+ if (settings.printBuildTrace)
+ printMsg(lvlError, format("@ unsupported-platform %1% %2%") % drvPath % drv.platform);
+ throw Error(format("a `%1%' is required to build `%3%', but I am a `%2%'")
+ % drv.platform % settings.thisSystem % drvPath);
+ }
+
+ /* Make the saved error code current again before building the SysError,
+ which presumably picks up errno when constructed -- TODO confirm. */
+ errno = error;
+ throw SysError(format("executing `%1%'") % drv.builder);
+}
+
+/* Spawn-phase entry point for execBuilderOrBuiltin(): CTX.extraData is cast
+ back to the DerivationGoal that startBuilder() stored there, and that
+ goal's execBuilderOrBuiltin() is called with CTX.  startBuilder() installs
+ this in place of the "exec" phase. */
+void execBuilderOrBuiltinAction(SpawnContext & ctx)
+{
+ ((DerivationGoal *)ctx.extraData)->execBuilderOrBuiltin(ctx);
+}
+
+
void DerivationGoal::startBuilder()
{
auto f = format(
@@ -1665,6 +1731,57 @@ void DerivationGoal::startBuilder()
f.exceptions(boost::io::all_error_bits ^ boost::io::too_many_args_bit);
startNest(nest, lvlInfo, f % showPaths(missingPaths) % curRound % nrRounds);
+ /* A CloneSpawnContext reference can be passed to procedures expecting a
+ SpawnContext reference */
+#if CHROOT_ENABLED
+ CloneSpawnContext ctx;
+#else
+ SpawnContext ctx;
+#endif
+
+ ctx.extraData = (void *) this;
+ ctx.setsid = true;
+ ctx.oomSacrifice = true;
+ ctx.signalSetupSuccess = true;
+ ctx.setStdin = true;
+ ctx.stdinFile = "/dev/null";
+ ctx.closeMostFDs = true;
+ ctx.program = drv.builder;
+ ctx.args = drv.args;
+ if(!isBuiltin(drv))
+ ctx.args.insert(ctx.args.begin(), baseNameOf(drv.builder));
+
+#if __linux__
+ ctx.dropAmbientCapabilities = true;
+ ctx.persona = PER_LINUX; /* default */
+ ctx.setPersona = true;
+ /* Change the personality to 32-bit if we're doing an
+ i686-linux build on an x86_64-linux machine. */
+ struct utsname utsbuf;
+ uname(&utsbuf);
+ if (drv.platform == "i686-linux" &&
+ (settings.thisSystem == "x86_64-linux" ||
+ (!strcmp(utsbuf.sysname, "Linux") && !strcmp(utsbuf.machine, "x86_64")))) {
+ ctx.persona = PER_LINUX32;
+ }
+
+ if (drv.platform == "armhf-linux" &&
+ (settings.thisSystem == "aarch64-linux" ||
+ (!strcmp(utsbuf.sysname, "Linux") && !strcmp(utsbuf.machine, "aarch64")))) {
+ ctx.persona = PER_LINUX32;
+ }
+
+ /* Impersonate a Linux 2.6 machine to get some determinism in
+ builds that depend on the kernel version. */
+ if ((drv.platform == "i686-linux" || drv.platform == "x86_64-linux") && settings.impersonateLinux26) {
+ ctx.persona |= 0x0020000; /* == UNAME26 */
+ }
+
+ /* Disable address space randomization for improved determinism. */
+ ctx.persona |= ADDR_NO_RANDOMIZE;
+
+#endif
+
/* Note: built-in builders are *not* running in a chroot environment so
that we can easily implement them in Guile without having it as a
derivation input (they are running under a separate build user,
@@ -1672,12 +1789,13 @@ void DerivationGoal::startBuilder()
useChroot = settings.useChroot && !isBuiltin(drv);
/* Construct the environment passed to the builder. */
- env.clear();
+ ctx.env.clear();
+ ctx.inheritEnv = false;
/* Most shells initialise PATH to some default (/bin:/usr/bin:...) when
PATH is not set. We don't want this, so we fill it in with some dummy
value. */
- env["PATH"] = "/path-not-set";
+ ctx.env["PATH"] = "/path-not-set";
/* Set HOME to a non-existing path to prevent certain programs from using
/etc/passwd (or NIS, or whatever) to locate the home directory (for
@@ -1686,20 +1804,20 @@ void DerivationGoal::startBuilder()
they are looking for does not exist if HOME is set but points to some
non-existing path. */
Path homeDir = "/homeless-shelter";
- env["HOME"] = homeDir;
+ ctx.env["HOME"] = homeDir;
/* Tell the builder where the store is. Usually they
shouldn't care, but this is useful for purity checking (e.g.,
the compiler or linker might only want to accept paths to files
in the store or in the build directory). */
- env["NIX_STORE"] = settings.nixStore;
+ ctx.env["NIX_STORE"] = settings.nixStore;
/* The maximum number of cores to utilize for parallel building. */
- env["NIX_BUILD_CORES"] = (format("%d") % settings.buildCores).str();
+ ctx.env["NIX_BUILD_CORES"] = (format("%d") % settings.buildCores).str();
/* Add all bindings specified in the derivation. */
for (auto& i : drv.env)
- env[i.first] = i.second;
+ ctx.env[i.first] = i.second;
/* Create a temporary directory where the build will take
place. */
@@ -1728,18 +1846,20 @@ void DerivationGoal::startBuilder()
directory. */
tmpDirInSandbox = useChroot ? canonPath("/tmp", true) + "/guix-build-" + drvName + "-0" : tmpDir;
+ ctx.setcwd = true;
+ ctx.cwd = tmpDirInSandbox;
/* For convenience, set an environment pointing to the top build
directory. */
- env["NIX_BUILD_TOP"] = tmpDirInSandbox;
+ ctx.env["NIX_BUILD_TOP"] = tmpDirInSandbox;
/* Also set TMPDIR and variants to point to this directory. */
- env["TMPDIR"] = env["TEMPDIR"] = env["TMP"] = env["TEMP"] = tmpDirInSandbox;
+ ctx.env["TMPDIR"] = ctx.env["TEMPDIR"] = ctx.env["TMP"] = ctx.env["TEMP"] = tmpDirInSandbox;
/* Explicitly set PWD to prevent problems with chroot builds. In
particular, dietlibc cannot figure out the cwd because the
inode of the current directory doesn't appear in .. (because
getdents returns the inode of the mount point). */
- env["PWD"] = tmpDirInSandbox;
+ ctx.env["PWD"] = tmpDirInSandbox;
/* *Only* if this is a fixed-output derivation, propagate the
values of the environment variables specified in the
@@ -1752,7 +1872,7 @@ void DerivationGoal::startBuilder()
already know the cryptographic hash of the output). */
if (fixedOutput) {
Strings varNames = tokenizeString<Strings>(get(drv.env, "impureEnvVars"));
- for (auto& i : varNames) env[i] = getEnv(i);
+ for (auto& i : varNames) ctx.env[i] = getEnv(i);
}
/* The `exportReferencesGraph' feature allows the references graph
@@ -1816,10 +1936,20 @@ void DerivationGoal::startBuilder()
/* Change ownership of the temporary build directory. */
if (chown(tmpDir.c_str(), buildUser.getUID(), buildUser.getGID()) == -1)
throw SysError(format("cannot change ownership of '%1%'") % tmpDir);
+
+ ctx.setuid = true;
+ ctx.user = buildUser.getUID();
+ ctx.setgid = true;
+ ctx.group = buildUser.getGID();
+ ctx.setSupplementaryGroups = true;
+ ctx.supplementaryGroups = buildUser.getSupplementaryGIDs();
}
if (useChroot) {
#if CHROOT_ENABLED
+ ctx.phases = getCloneSpawnPhases();
+ addPhaseAfter(ctx.phases, "chroot", "clearRootWritePerms",
+ clearRootWritePermsAction);
/* Create a temporary directory in which we set up the chroot
environment using bind-mounts. Put it in the store to ensure it
can be atomically moved to the store. */
@@ -1830,6 +1960,22 @@ void DerivationGoal::startBuilder()
/* Clean up the chroot directory automatically. */
autoDelChroot = std::shared_ptr<AutoDelete>(new AutoDelete(chrootRootTop));
+ ctx.doChroot = true;
+ ctx.chrootRootDir = chrootRootDir;
+ ctx.cloneFlags = CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWIPC | CLONE_NEWUTS | SIGCHLD;
+
+ if(!fixedOutput) {
+ ctx.initLoopback = true;
+ ctx.cloneFlags |= CLONE_NEWNET;
+ }
+
+ if(!buildUser.enabled())
+ ctx.cloneFlags |= CLONE_NEWUSER;
+
+ /* Set the hostname etc. to fixed values. */
+ ctx.hostname = "localhost";
+ ctx.domainname = "(none)"; /* kernel default */
+
printMsg(lvlChatty, format("setting up chroot environment in `%1%'") % chrootRootDir);
if (mkdir(chrootRootTop.c_str(), 0750) == -1)
@@ -1865,9 +2011,21 @@ void DerivationGoal::startBuilder()
(format("nixbld:!:%1%:\n")
% (buildUser.enabled() ? buildUser.getGID() : guestGID)).str());
- /* Create /etc/hosts with localhost entry. */
- if (!fixedOutput)
+ if (fixedOutput) {
+ /* Fixed-output derivations typically need to access the network,
+ so give them access to /etc/resolv.conf and so on. */
+ auto files = { "/etc/resolv.conf", "/etc/nsswitch.conf",
+ "/etc/services", "/etc/hosts" };
+ for (auto & file: files) {
+ if (pathExists(file)) {
+ ctx.filesInChroot[file] = file;
+ ctx.readOnlyFilesInChroot.insert(file);
+ }
+ }
+ } else {
+ /* Create /etc/hosts with localhost entry. */
writeFile(chrootRootDir + "/etc/hosts", "127.0.0.1 localhost\n");
+ }
/* Bind-mount a user-configurable set of directories from the
host file system. */
@@ -1877,11 +2035,11 @@ void DerivationGoal::startBuilder()
for (auto & i : dirs) {
size_t p = i.find('=');
if (p == string::npos)
- dirsInChroot[i] = i;
+ ctx.filesInChroot[i] = i;
else
- dirsInChroot[string(i, 0, p)] = string(i, p + 1);
+ ctx.filesInChroot[string(i, 0, p)] = string(i, p + 1);
}
- dirsInChroot[tmpDirInSandbox] = tmpDir;
+ ctx.filesInChroot[tmpDirInSandbox] = tmpDir;
/* Create the fake store. */
Path chrootStoreDir = chrootRootDir + settings.nixStore;
@@ -1897,22 +2055,8 @@ void DerivationGoal::startBuilder()
the whole store. This prevents any access to undeclared
dependencies. */
for (auto& i : inputPaths) {
- struct stat st;
- if (lstat(i.c_str(), &st))
- throw SysError(format("getting attributes of path `%1%'") % i);
-
- if (S_ISLNK(st.st_mode)) {
- /* Since bind-mounts follow symlinks, thus representing their
- target and not the symlink itself, special-case
- symlinks. XXX: When running unprivileged, TARGET can be
- deleted by the build process. Use 'open_tree' & co. when
- it's more widely available. */
- Path target = chrootRootDir + i;
- if (symlink(readLink(i).c_str(), target.c_str()) == -1)
- throw SysError(format("failed to create symlink '%1%' to '%2%'") % target % readLink(i));
- }
- else
- dirsInChroot[i] = i;
+ ctx.filesInChroot[i] = i;
+ ctx.readOnlyFilesInChroot.insert(i);
}
/* If we're repairing, checking or rebuilding part of a
@@ -1921,14 +2065,56 @@ void DerivationGoal::startBuilder()
(typically the dependencies of /bin/sh). Throw them
out. */
for (auto & i : drv.outputs)
- dirsInChroot.erase(i.second.path);
+ ctx.filesInChroot.erase(i.second.path);
+ /* Set up a nearly empty /dev, unless the user asked to bind-mount the
+ host /dev. */
+ Strings ss;
+ if(ctx.filesInChroot.find("/dev") == ctx.filesInChroot.end()) {
+ createDirs(chrootRootDir + "/dev/shm");
+ createDirs(chrootRootDir + "/dev/pts");
+ ss.push_back("/dev/full");
+#ifdef __linux__
+ if (pathExists("/dev/kvm"))
+ ss.push_back("/dev/kvm");
+#endif
+ ss.push_back("/dev/null");
+ ss.push_back("/dev/random");
+ ss.push_back("/dev/tty");
+ ss.push_back("/dev/urandom");
+ ss.push_back("/dev/zero");
+ createSymlink("/proc/self/fd", chrootRootDir + "/dev/fd");
+ createSymlink("/proc/self/fd/0", chrootRootDir + "/dev/stdin");
+ createSymlink("/proc/self/fd/1", chrootRootDir + "/dev/stdout");
+ createSymlink("/proc/self/fd/2", chrootRootDir + "/dev/stderr");
+ }
+
+ for (auto & i : ss) ctx.filesInChroot[i] = i;
+
+ ctx.mountProc = true;
+ ctx.mountDevshm = true;
+ /* Mount a new devpts on /dev/pts. Note that this requires the kernel
+ to be compiled with CONFIG_DEVPTS_MULTIPLE_INSTANCES=y (which is
+ the case if /dev/pts/ptmx exists). */
+ ctx.maybeMountDevpts =
+ pathExists("/dev/pts/ptmx") &&
+ ctx.filesInChroot.find("/dev/pts") == ctx.filesInChroot.end();
+ ctx.lockMounts = !buildUser.enabled();
+
+ for (auto & i : ctx.filesInChroot) {
+ /* Failsafe: If the source is in the store, it should be
+ read-only */
+ if(i.second.compare(0, settings.nixStore.length(), settings.nixStore) == 0) {
+ ctx.readOnlyFilesInChroot.insert(i.first);
+ }
+ }
#else
throw Error("chroot builds are not supported on this platform");
#endif
}
else {
+ ctx.phases = getBasicSpawnPhases();
if (pathExists(homeDir))
throw Error(format("directory `%1%' exists; please remove it") % homeDir);
@@ -1956,6 +2142,7 @@ void DerivationGoal::startBuilder()
}
}
+ replacePhase(ctx.phases, "exec", execBuilderOrBuiltinAction);
/* Run the builder. */
printMsg(lvlChatty, format("executing builder `%1%'") % drv.builder);
@@ -1966,6 +2153,9 @@ void DerivationGoal::startBuilder()
/* Create a pipe to get the output of the builder. */
builderOut.create();
+ ctx.logFD = builderOut.writeSide;
+ ctx.earlyCloseFDs.insert(builderOut.readSide);
+
/* Fork a child to build the package. Note that while we
currently use forks to run and wait for the children, it
shouldn't be hard to use threads for this on systems where
@@ -1997,43 +2187,35 @@ void DerivationGoal::startBuilder()
*/
#if __linux__
if (useChroot) {
- char stack[32 * 1024];
- int flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | SIGCHLD;
- if (!fixedOutput) {
- flags |= CLONE_NEWNET;
- }
- if (!buildUser.enabled() || getuid() != 0) {
- flags |= CLONE_NEWUSER;
- readiness.create();
- }
+ int fds[2];
+ AutoCloseFD parentSetupSocket;
+ AutoCloseFD childSetupSocket;
- /* Ensure proper alignment on the stack. On aarch64, it has to be 16
- bytes. */
- pid = clone(childEntry,
- (char *)(((uintptr_t)stack + sizeof(stack) - 8) & ~(uintptr_t)0xf),
- flags, this);
- if (pid == -1) {
- if ((flags & CLONE_NEWUSER) != 0 && getuid() != 0)
- /* 'clone' fails with EPERM on distros where unprivileged user
- namespaces are disabled. Error out instead of giving up on
- isolation. */
- throw SysError("cannot create process in unprivileged user namespace");
- else
- throw SysError("cloning builder process");
- }
+ if(((ctx.cloneFlags & CLONE_NEWUSER) != 0)) {
+ if (socketpair(AF_LOCAL, SOCK_STREAM, 0, fds))
+ throw SysError("creating setup socket");
+ parentSetupSocket = fds[0];
+ closeOnExec(parentSetupSocket);
+ ctx.earlyCloseFDs.insert(parentSetupSocket);
+ childSetupSocket = fds[1];
+ closeOnExec(childSetupSocket);
+ ctx.setupFD = childSetupSocket;
+ }
- readiness.readSide.close();
- if ((flags & CLONE_NEWUSER) != 0) {
- /* Initialize the UID/GID mapping of the child process. */
- initializeUserNamespace(pid);
- writeFull(readiness.writeSide, (unsigned char*)"go\n", 3);
- }
- readiness.writeSide.close();
+ pid = cloneChild(ctx);
+
+ if(childSetupSocket >= 0) childSetupSocket.close();
+
+ if ((ctx.cloneFlags & CLONE_NEWUSER) != 0) {
+ /* Initialize the UID/GID mapping of the builder. */
+ initializeUserNamespace(pid);
+ writeFull(parentSetupSocket, (unsigned char*)"go\n", 3);
+ }
} else
#endif
{
pid = fork();
- if (pid == 0) runChild();
+ if (pid == 0) runChildSetup(ctx);
}
if (pid == -1) throw SysError("unable to fork");
@@ -2049,415 +2231,9 @@ void DerivationGoal::startBuilder()
if (settings.printBuildTrace) {
printMsg(lvlError, format("@ build-started %1% - %2% %3% %4%")
- % drvPath % drv.platform % logFile % pid);
- }
-
-}
-
-/* Return true if the operating system kernel part of SYSTEM1 and SYSTEM2 (the
- bit that comes after the hyphen in system types such as "i686-linux") is
- the same. */
-static bool sameOperatingSystemKernel(const std::string& system1, const std::string& system2)
-{
- auto os1 = system1.substr(system1.find("-"));
- auto os2 = system2.substr(system2.find("-"));
- return os1 == os2;
-}
-
-void DerivationGoal::runChild()
-{
- /* Warning: in the child we should absolutely not make any SQLite
- calls! */
-
- try { /* child */
-
- _writeToStderr = 0;
-
- if (readiness.writeSide >= 0) readiness.writeSide.close();
-
- if (readiness.readSide >= 0) {
- /* Wait for the parent process to initialize the UID/GID mapping
- of our user namespace. */
- char str[20] = { '\0' };
- readFull(readiness.readSide, (unsigned char*)str, 3);
- readiness.readSide.close();
- if (strcmp(str, "go\n") != 0)
- throw Error("failed to initialize process in unprivileged user namespace");
- }
-
- restoreAffinity();
-
- commonChildInit(builderOut);
-
-#if CHROOT_ENABLED
- if (useChroot) {
-# if HAVE_SYS_PRCTL_H
- /* Drop ambient capabilities such as CAP_CHOWN that might have
- been granted when starting guix-daemon. */
- prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0);
-# endif
-
- if (!fixedOutput) {
- /* Initialise the loopback interface. */
- AutoCloseFD fd(socket(PF_INET, SOCK_DGRAM, IPPROTO_IP));
- if (fd == -1) throw SysError("cannot open IP socket");
-
- struct ifreq ifr;
- strcpy(ifr.ifr_name, "lo");
- ifr.ifr_flags = IFF_UP | IFF_LOOPBACK | IFF_RUNNING;
- if (ioctl(fd, SIOCSIFFLAGS, &ifr) == -1)
- throw SysError("cannot set loopback interface flags");
-
- fd.close();
- }
-
- /* Set the hostname etc. to fixed values. */
- char hostname[] = "localhost";
- if (sethostname(hostname, sizeof(hostname)) == -1)
- throw SysError("cannot set host name");
- char domainname[] = "(none)"; // kernel default
- if (setdomainname(domainname, sizeof(domainname)) == -1)
- throw SysError("cannot set domain name");
-
- /* Make all filesystems private. This is necessary
- because subtrees may have been mounted as "shared"
- (MS_SHARED). (Systemd does this, for instance.) Even
- though we have a private mount namespace, mounting
- filesystems on top of a shared subtree still propagates
- outside of the namespace. Making a subtree private is
- local to the namespace, though, so setting MS_PRIVATE
- does not affect the outside world. */
- if (mount(0, "/", 0, MS_REC|MS_PRIVATE, 0) == -1) {
- throw SysError("unable to make ‘/’ private mount");
- }
-
- /* Bind-mount chroot directory to itself, to treat it as a
- different filesystem from /, as needed for pivot_root. */
- if (mount(chrootRootDir.c_str(), chrootRootDir.c_str(), 0, MS_BIND, 0) == -1)
- throw SysError(format("unable to bind mount ‘%1%’") % chrootRootDir);
-
- /* Set up a nearly empty /dev, unless the user asked to
- bind-mount the host /dev. */
- Strings ss;
- if (dirsInChroot.find("/dev") == dirsInChroot.end()) {
- createDirs(chrootRootDir + "/dev/shm");
- createDirs(chrootRootDir + "/dev/pts");
- ss.push_back("/dev/full");
-#ifdef __linux__
- if (pathExists("/dev/kvm"))
- ss.push_back("/dev/kvm");
-#endif
- ss.push_back("/dev/null");
- ss.push_back("/dev/random");
- ss.push_back("/dev/tty");
- ss.push_back("/dev/urandom");
- ss.push_back("/dev/zero");
- createSymlink("/proc/self/fd", chrootRootDir + "/dev/fd");
- createSymlink("/proc/self/fd/0", chrootRootDir + "/dev/stdin");
- createSymlink("/proc/self/fd/1", chrootRootDir + "/dev/stdout");
- createSymlink("/proc/self/fd/2", chrootRootDir + "/dev/stderr");
- }
-
- /* Fixed-output derivations typically need to access the
- network, so give them access to /etc/resolv.conf and so
- on. */
- if (fixedOutput) {
- auto files = { "/etc/resolv.conf", "/etc/nsswitch.conf",
- "/etc/services", "/etc/hosts" };
- for (auto & file: files) {
- if (pathExists(file)) ss.push_back(file);
- }
- }
-
- for (auto & i : ss) dirsInChroot[i] = i;
-
- /* Make new mounts for the store and for /tmp. That way, when
- 'chrootRootDir' is made read-only below, these two mounts will
- remain writable (the store needs to be writable so derivation
- outputs can be written to it, and /tmp is writable by
- convention). */
- auto chrootStoreDir = chrootRootDir + settings.nixStore;
- if (mount(chrootStoreDir.c_str(), chrootStoreDir.c_str(), 0, MS_BIND, 0) == -1)
- throw SysError(format("read-write mount of store '%1%' failed") % chrootStoreDir);
- auto chrootTmpDir = chrootRootDir + "/tmp";
- if (mount(chrootTmpDir.c_str(), chrootTmpDir.c_str(), 0, MS_BIND, 0) == -1)
- throw SysError(format("read-write mount of temporary directory '%1%' failed") % chrootTmpDir);
-
- /* Bind-mount all the directories from the "host"
- filesystem that we want in the chroot
- environment. */
- for (auto& i : dirsInChroot) {
- struct stat st;
- Path source = i.second;
- Path target = chrootRootDir + i.first;
- if (source == "/proc") continue; // backwards compatibility
- if (stat(source.c_str(), &st) == -1)
- throw SysError(format("getting attributes of path `%1%'") % source);
- if (S_ISDIR(st.st_mode))
- createDirs(target);
- else {
- createDirs(dirOf(target));
- writeFile(target, "");
- }
-
- /* Extra flags passed with MS_BIND are ignored, hence the
- extra MS_REMOUNT. */
- if (mount(source.c_str(), target.c_str(), "", MS_BIND, 0) == -1)
- throw SysError(format("bind mount from `%1%' to `%2%' failed") % source % target);
- if (source.compare(0, settings.nixStore.length(), settings.nixStore) == 0) {
- if (mount(source.c_str(), target.c_str(), "", MS_BIND | MS_REMOUNT | MS_RDONLY, 0) == -1)
- throw SysError(format("read-only remount of `%1%' failed") % target);
- }
- }
-
- /* Bind a new instance of procfs on /proc to reflect our
- private PID namespace. */
- createDirs(chrootRootDir + "/proc");
- if (mount("none", (chrootRootDir + "/proc").c_str(), "proc", 0, 0) == -1)
- throw SysError("mounting /proc");
-
- /* Mount a new tmpfs on /dev/shm to ensure that whatever
- the builder puts in /dev/shm is cleaned up automatically. */
- if (pathExists("/dev/shm") && mount("none", (chrootRootDir + "/dev/shm").c_str(), "tmpfs", 0, 0) == -1)
- throw SysError("mounting /dev/shm");
-
- /* Mount a new devpts on /dev/pts. Note that this
- requires the kernel to be compiled with
- CONFIG_DEVPTS_MULTIPLE_INSTANCES=y (which is the case
- if /dev/ptx/ptmx exists). */
- if (pathExists("/dev/pts/ptmx") &&
- !pathExists(chrootRootDir + "/dev/ptmx")
- && dirsInChroot.find("/dev/pts") == dirsInChroot.end())
- {
- if (mount("none", (chrootRootDir + "/dev/pts").c_str(), "devpts", 0, "newinstance,mode=0620") == -1)
- throw SysError("mounting /dev/pts");
- createSymlink("/dev/pts/ptmx", chrootRootDir + "/dev/ptmx");
-
- /* Make sure /dev/pts/ptmx is world-writable. With some
- Linux versions, it is created with permissions 0. */
- chmod_(chrootRootDir + "/dev/pts/ptmx", 0666);
- }
-
- /* Do the chroot(). */
- if (chdir(chrootRootDir.c_str()) == -1)
- throw SysError(format("cannot change directory to '%1%'") % chrootRootDir);
-
- if (mkdir("real-root", 0) == -1)
- throw SysError("cannot create real-root directory");
-
- if (pivot_root(".", "real-root") == -1)
- throw SysError(format("cannot pivot old root directory onto '%1%'") % (chrootRootDir + "/real-root"));
-
- if (chroot(".") == -1)
- throw SysError(format("cannot change root directory to '%1%'") % chrootRootDir);
-
- if (umount2("real-root", MNT_DETACH) == -1)
- throw SysError("cannot unmount real root filesystem");
-
- if (rmdir("real-root") == -1)
- throw SysError("cannot remove real-root directory");
-
- /* Make the root read-only.
-
- When build users are disabled, the build process could make it
- world-accessible, but that's OK: since 'chrootRootTop' is *not*
- world-accessible, a world-accessible 'chrootRootDir' cannot be
- used to grant access to the build environment to external
- processes.
-
- Remounting the root as read-only was rejected because it makes
- write access fail with EROFS instead of EACCES, which goes
- against what some test suites expect (Go, Ruby, SCons,
- Shepherd, to name a few). */
- chmod_("/", 0555);
-
- if (getuid() != 0) {
- /* Create a new mount namespace to "lock" previous mounts.
- See mount_namespaces(7). */
- auto uid = getuid();
- auto gid = getgid();
-
- if (unshare(CLONE_NEWNS | CLONE_NEWUSER) == -1)
- throw SysError(format("creating new user and mount namespaces"));
-
- initializeUserNamespace(getpid(), uid, gid);
-
- /* Check that mounts within the build environment are "locked"
- together and cannot be separated from within the build
- environment namespace. Since
- umount(2) is documented to fail with EINVAL when attempting
- to unmount one of the mounts that are locked together,
- check that this is what we get. */
- int ret = umount(tmpDirInSandbox.c_str());
- assert(ret == -1 && errno == EINVAL);
- }
- }
-#endif
-
- if (chdir(tmpDirInSandbox.c_str()) == -1)
- throw SysError(format("changing into `%1%'") % tmpDir);
-
- /* Close all other file descriptors. */
- closeMostFDs(set<int>());
-
-#if __linux__
- /* Change the personality to 32-bit if we're doing an
- i686-linux build on an x86_64-linux machine. */
- struct utsname utsbuf;
- uname(&utsbuf);
- if (drv.platform == "i686-linux" &&
- (settings.thisSystem == "x86_64-linux" ||
- (!strcmp(utsbuf.sysname, "Linux") && !strcmp(utsbuf.machine, "x86_64")))) {
- if (personality(PER_LINUX32) == -1)
- throw SysError("cannot set i686-linux personality");
- }
-
- if (drv.platform == "armhf-linux" &&
- (settings.thisSystem == "aarch64-linux" ||
- (!strcmp(utsbuf.sysname, "Linux") && !strcmp(utsbuf.machine, "aarch64")))) {
- if (personality(PER_LINUX32) == -1)
- throw SysError("cannot set armhf-linux personality");
- }
-
- /* Impersonate a Linux 2.6 machine to get some determinism in
- builds that depend on the kernel version. */
- if ((drv.platform == "i686-linux" || drv.platform == "x86_64-linux") && settings.impersonateLinux26) {
- int cur = personality(0xffffffff);
- if (cur != -1) personality(cur | 0x0020000 /* == UNAME26 */);
- }
-
- /* Disable address space randomization for improved
- determinism. */
- int cur = personality(0xffffffff);
- if (cur != -1) personality(cur | ADDR_NO_RANDOMIZE);
-
- /* Ask the kernel to eagerly kill us & our children if it runs out of
- memory, regardless of blame, to preserve ‘real’ user data & state. */
- try {
- writeFile("/proc/self/oom_score_adj", "1000"); // 100%
- } catch (...) { ignoreException(); }
-#endif
-
- /* Fill in the environment. */
- Strings envStrs;
- for (const auto& i : env)
- envStrs.push_back(rewriteHashes(i.first + "=" + i.second, rewritesToTmp));
-
- /* If we are running in `build-users' mode, then switch to the
- user we allocated above. Make sure that we drop all root
- privileges. Note that above we have closed all file
- descriptors except std*, so that's safe. Also note that
- setuid() when run as root sets the real, effective and
- saved UIDs. */
- if (buildUser.enabled()) {
- /* Preserve supplementary groups of the build user, to allow
- admins to specify groups such as "kvm". */
- if (setgroups(buildUser.getSupplementaryGIDs().size(),
- buildUser.getSupplementaryGIDs().data()) == -1)
- throw SysError("cannot set supplementary groups of build user");
-
- if (setgid(buildUser.getGID()) == -1 ||
- getgid() != buildUser.getGID() ||
- getegid() != buildUser.getGID())
- throw SysError("setgid failed");
-
- if (setuid(buildUser.getUID()) == -1 ||
- getuid() != buildUser.getUID() ||
- geteuid() != buildUser.getUID())
- throw SysError("setuid failed");
- }
-
- restoreSIGPIPE();
-
- /* Indicate that we managed to set up the build environment. */
- writeFull(STDERR_FILENO, "\n");
-
- /* Execute the program. This should not return. */
- string builderBasename;
- if (isBuiltin(drv)) {
- try {
- logType = ltFlat;
-
- auto buildDrv = lookupBuiltinBuilder(drv.builder);
- if (buildDrv != NULL) {
- /* Check what the output file name is. When doing a
- 'bmCheck' build, the output file name is different from
- that specified in DRV due to hash rewriting. */
- Path output = drv.outputs["out"].path;
- auto redirected = redirectedOutputs.find(output);
- if (redirected != redirectedOutputs.end())
- output = redirected->second;
-
- buildDrv(drv, drvPath, output);
- }
- else
- throw Error(format("unsupported builtin function '%1%'") % string(drv.builder, 8));
- _exit(0);
- } catch (std::exception & e) {
- writeFull(STDERR_FILENO, "error: " + string(e.what()) + "\n");
- _exit(1);
- }
- } else {
- /* Ensure that the builder is within the store. This prevents
- users from using /proc/self/exe (or a symlink to it) as their
- builder, which could allow them to overwrite the guix-daemon
- binary (CVE-2019-5736).
-
- This attack is possible even if the target of /proc/self/exe is
- outside the chroot (it's as if it were a hard link), though it
- requires that its ELF interpreter and dependencies be in the
- chroot.
-
- Note: 'canonPath' throws if 'drv.builder' cannot be resolved
- within the chroot. */
- builderBasename = baseNameOf(drv.builder);
- drv.builder = canonPath(drv.builder, true);
-
- if (!isInStore(drv.builder))
- throw Error(format("derivation builder '%1%' is outside the store") % drv.builder);
- }
-
- /* Fill in the arguments. */
- Strings args;
- args.push_back(builderBasename);
- for (auto& i : drv.args)
- args.push_back(rewriteHashes(i, rewritesToTmp));
-
- /* If DRV targets the same operating system kernel, try to execute it:
- there might be binfmt_misc set up for user-land emulation of other
- architectures. However, if it targets a different operating
- system--e.g., "i586-gnu" vs. "x86_64-linux"--do not try executing
- it: the ELF file for that OS is likely indistinguishable from a
- native ELF binary and it would just crash at run time. */
- int error;
- if (sameOperatingSystemKernel(drv.platform, settings.thisSystem)) {
- execve(drv.builder.c_str(), stringsToCharPtrs(args).data(),
- stringsToCharPtrs(envStrs).data());
- error = errno;
- } else {
- error = ENOEXEC;
- }
-
- /* Right platform? Check this after we've tried 'execve' to allow for
- transparent emulation of different platforms with binfmt_misc
- handlers that invoke QEMU. */
- if (error == ENOEXEC && !canBuildLocally(drv.platform)) {
- if (settings.printBuildTrace)
- printMsg(lvlError, format("@ unsupported-platform %1% %2%") % drvPath % drv.platform);
- throw Error(
- format("a `%1%' is required to build `%3%', but I am a `%2%'")
- % drv.platform % settings.thisSystem % drvPath);
- }
-
- errno = error;
- throw SysError(format("executing `%1%'") % drv.builder);
-
- } catch (std::exception & e) {
- writeFull(STDERR_FILENO, "while setting up the build environment: " + string(e.what()) + "\n");
- _exit(1);
+ % drvPath % drv.platform % logFile % pid);
}
- abort(); /* never reached */
}
diff --git a/nix/libutil/spawn.cc b/nix/libutil/spawn.cc
new file mode 100644
index 0000000000..93bab9f59e
--- /dev/null
+++ b/nix/libutil/spawn.cc
@@ -0,0 +1,829 @@
+/* GNU Guix --- Functional package management for GNU
+ Copyright (C) 2025 Caleb Ristvedt <reepca@russelstein.xyz>
+
+ This file is part of GNU Guix.
+
+ GNU Guix is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or (at
+ your option) any later version.
+
+ GNU Guix is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. */
+
+/* Process spawning and setup code. */
+
+#include <spawn.hh>
+#include <util.hh>
+#include <affinity.hh>
+#include <stddef.h>
+#include <unistd.h>
+#include <grp.h>
+#include <limits.h>
+#include <sys/wait.h>
+#include <cstring>
+#include <cstdlib>
+
+#if HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#if HAVE_SCHED_H
+#include <sched.h>
+#endif
+
+#if HAVE_STATVFS
+#include <sys/statvfs.h>
+#endif
+
+#if HAVE_SYS_SYSCALL_H
+#include <sys/syscall.h>
+#endif
+
+#if HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#ifdef __linux__
+#include <sys/personality.h>
+#endif
+
+#if defined(SYS_pivot_root)
+#define pivot_root(new_root, put_old) (syscall(SYS_pivot_root, new_root,put_old))
+#endif
+
+
+/* Whether the Linux namespace flags used with clone(2)/unshare(2) are
+   available.  This expands to a plain 0 or 1: a `defined' operator that is
+   produced by macro expansion inside an #if condition has undefined
+   behavior, and GCC warns about it with -Wexpansion-to-defined.  <sched.h>
+   has already been included above when HAVE_SCHED_H is set, so CLONE_NEWNS
+   is visible at this point on platforms that provide it. */
+#if defined(CLONE_NEWNS)
+#define CLONE_ENABLED 1
+#else
+#define CLONE_ENABLED 0
+#endif
+
+#if CLONE_ENABLED
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#endif
+
+namespace nix {
+
+
+/* Insert a phase labelled ADDLABEL that runs ADDACTION immediately after the
+   first phase of PHASES whose label equals AFTERLABEL.  Throws Error when no
+   phase carries AFTERLABEL. */
+void addPhaseAfter(Phases & phases, string afterLabel, string addLabel, Action addAction)
+{
+    auto pos = phases.begin();
+    while(pos != phases.end() && !(pos->label == afterLabel))
+        ++pos;
+    if(pos == phases.end())
+        throw Error(format("label `%1%' not found in phases") % afterLabel);
+
+    Phase added;
+    added.label = addLabel;
+    added.action = addAction;
+    /* std::vector::insert places the new element before the given position,
+       so step one past the match. */
+    phases.insert(pos + 1, added);
+}
+
+
+/* Insert a phase labelled ADDLABEL that runs ADDACTION directly in front of
+   the first phase of PHASES whose label equals BEFORELABEL.  Throws Error
+   when no phase carries BEFORELABEL. */
+void addPhaseBefore(Phases & phases, string beforeLabel, string addLabel, Action addAction)
+{
+    auto pos = phases.begin();
+    while(pos != phases.end() && !(pos->label == beforeLabel))
+        ++pos;
+    if(pos == phases.end())
+        throw Error(format("label `%1%' not found in phases") % beforeLabel);
+
+    Phase added;
+    added.label = addLabel;
+    added.action = addAction;
+    phases.insert(pos, added);
+}
+
+
+/* Put a phase labelled ADDLABEL that runs ADDACTION at the very front of
+   PHASES. */
+void prependPhase(Phases & phases, string addLabel, Action addAction)
+{
+    Phase first;
+    first.action = addAction;
+    first.label = addLabel;
+    phases.insert(phases.begin(), first);
+}
+
+
+/* Put a phase labelled ADDLABEL that runs ADDACTION at the very end of
+   PHASES. */
+void appendPhase(Phases & phases, string addLabel, Action addAction)
+{
+    Phase last;
+    last.action = addAction;
+    last.label = addLabel;
+    phases.push_back(last);
+}
+
+
+/* Remove the first phase of PHASES whose label equals DELLABEL.  Throws
+   Error when no phase carries DELLABEL. */
+void deletePhase(Phases & phases, string delLabel)
+{
+    auto pos = phases.begin();
+    while(pos != phases.end() && !(pos->label == delLabel))
+        ++pos;
+    if(pos == phases.end())
+        throw Error(format("label `%1%' not found in phases") % delLabel);
+    phases.erase(pos);
+}
+
+
+/* Replace the action of the first phase of PHASES whose label equals
+   REPLACELABEL with NEWACTION; the phase keeps its label and position.
+   Throws Error when no phase carries REPLACELABEL. */
+void replacePhase(Phases & phases, string replaceLabel, Action newAction)
+{
+    for(auto & phase : phases)
+        if(phase.label == replaceLabel) {
+            phase.action = newAction;
+            return;
+        }
+    /* The placeholder must be spelled `%1%', as in the sibling phase
+       helpers; the previous `%1' lacked the closing `%' and so was not a
+       valid placeholder. */
+    throw Error(format("label `%1%' not found in phases") % replaceLabel);
+}
+
+
+/* A curated selection of predefined actions */
+
+/* Phase action: set the global _writeToStderr to 0.  CTX is not used. */
+void reset_writeToStderrAction(SpawnContext & ctx)
+{
+ _writeToStderr = 0;
+}
+
+
+/* Phase action: call restoreAffinity().  CTX is not used. */
+void restoreAffinityAction(SpawnContext & ctx)
+{
+ restoreAffinity();
+}
+
+
+/* Phase action: when ctx.setsid is set, move the process into a new session.
+   Throws SysError if setsid(2) fails. */
+void setsidAction(SpawnContext & ctx)
+{
+    if(!ctx.setsid) return;
+
+    /* A new session implies a new process group, so signals directed at the
+       parent's group no longer reach this process.  The session starts out
+       without a controlling terminal, so terminal signals are not delivered
+       either and /dev/tty cannot be opened. */
+    if(setsid() == (pid_t)-1)
+        throw SysError("creating a new session");
+}
+
+
+/* Phase action performing early file-descriptor setup: close every
+   descriptor in ctx.earlyCloseFDs; when ctx.logFD is not -1, duplicate it
+   onto stdout and stderr; when ctx.setStdin is set, replace stdin with
+   ctx.stdinFD or, if that is -1, with ctx.stdinFile opened read-write.
+   Throws SysError on any failure. */
+void earlyIOSetupAction(SpawnContext & ctx)
+{
+    for(int unwanted : ctx.earlyCloseFDs)
+        if(close(unwanted) == -1)
+            throw SysError("closing fd");
+
+    if(ctx.logFD != -1) {
+        if(dup2(ctx.logFD, STDOUT_FILENO) == -1)
+            throw SysError("cannot dup2 log fd into stdout fd");
+        if(dup2(ctx.logFD, STDERR_FILENO) == -1)
+            throw SysError("cannot dup2 log fd into stderr fd");
+    }
+
+    if(!ctx.setStdin) return;
+
+    if(ctx.stdinFD == -1) {
+        /* Opening stdin writable makes little sense, but it is kept for
+           compatibility. */
+        AutoCloseFD opened = open(ctx.stdinFile.c_str(), O_RDWR);
+        if(opened == -1)
+            throw SysError(format("cannot open `%1%'") % ctx.stdinFile);
+        if(dup2(opened, STDIN_FILENO) == -1)
+            throw SysError("cannot dup2 fd into stdin fd");
+        return;
+    }
+
+    if(dup2(ctx.stdinFD, STDIN_FILENO) == -1)
+        throw SysError("cannot dup2 fd into stdin fd");
+}
+
+
+/* Phase action: when ctx.dropAmbientCapabilities is set, clear all ambient
+   capabilities with prctl(2).  Without HAVE_SYS_PRCTL_H the request throws
+   Error instead. */
+void dropAmbientCapabilitiesAction(SpawnContext & ctx)
+{
+ /* Drop ambient capabilities such as CAP_CHOWN that might have been granted
+ when starting guix-daemon. */
+ if(ctx.dropAmbientCapabilities)
+#if HAVE_SYS_PRCTL_H
+ /* Note: the return value of prctl is not checked here. */
+ prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0);
+#else
+ throw Error("dropping ambient capabilities is not supported on this system");
+#endif
+}
+
+
+/* Phase action: when ctx.doChroot is set, chroot(2) into ctx.chrootRootDir.
+   Throws SysError if chroot fails, or Error when built without
+   HAVE_CHROOT. */
+void chrootAction(SpawnContext & ctx)
+{
+ if(ctx.doChroot)
+#if HAVE_CHROOT
+ if(chroot(ctx.chrootRootDir.c_str()) == -1)
+ throw SysError(format("cannot change root directory to '%1%'") % ctx.chrootRootDir);
+#else
+ throw Error("chroot is not supported on this system");
+#endif
+}
+
+
+/* Phase action: when ctx.setcwd is set, change the working directory to
+   ctx.cwd.  Throws SysError if chdir(2) fails. */
+void chdirAction(SpawnContext & ctx)
+{
+    if(!ctx.setcwd) return;
+
+    if(chdir(ctx.cwd.c_str()) == -1)
+        throw SysError(format("changing into `%1%'") % ctx.cwd);
+}
+
+
+/* Phase action: when ctx.closeMostFDs is set, call closeMostFDs() with
+   ctx.preserveFDs as the exceptions.  In every case, pass each descriptor in
+   ctx.preserveFDs to keepOnExec(). */
+void closeMostFDsAction(SpawnContext & ctx)
+{
+    if(ctx.closeMostFDs)
+        closeMostFDs(ctx.preserveFDs);
+
+    for(int kept : ctx.preserveFDs)
+        keepOnExec(kept);
+}
+
+
+/* Phase action: when ctx.setPersona is set, call personality(2) with
+   ctx.persona.  Throws SysError if the call fails, or Error when not built
+   for Linux. */
+void setPersonalityAction(SpawnContext & ctx)
+{
+ if(ctx.setPersona)
+#ifdef __linux__
+ if(personality(ctx.persona) == -1)
+ throw SysError("cannot set personality");
+#else
+ throw Error("setting the personality is not supported on this system");
+#endif
+}
+
+
+/* Phase action (Linux only): when ctx.oomSacrifice is set, write "1000" to
+   /proc/self/oom_score_adj.  This is best effort: any exception from the
+   write is passed to ignoreException() rather than propagated.  On other
+   systems the function does nothing. */
+void oomSacrificeAction(SpawnContext & ctx)
+{
+#ifdef __linux__
+ if(ctx.oomSacrifice)
+ /* Ask the kernel to eagerly kill us & our children if it runs out of
+ memory, regardless of blame, to preserve ‘real’ user data &
+ state. */
+ try {
+ writeFile("/proc/self/oom_score_adj", "1000"); // 100%
+ } catch(...) { ignoreException(); }
+#endif
+}
+
+
+/* Phase action switching credentials, in this order: supplementary groups
+   (if ctx.setSupplementaryGroups), then group id (if ctx.setgid), then user
+   id (if ctx.setuid).  After setgid/setuid the real and effective ids are
+   read back and compared with the requested value.  Throws SysError on any
+   failure or mismatch. */
+void setIDsAction(SpawnContext & ctx)
+{
+    if(ctx.setSupplementaryGroups &&
+       setgroups(ctx.supplementaryGroups.size(),
+                 ctx.supplementaryGroups.data()) == -1)
+        throw SysError("cannot set supplementary groups");
+
+    if(ctx.setgid &&
+       (setgid(ctx.group) == -1 ||
+        getgid() != ctx.group ||
+        getegid() != ctx.group))
+        throw SysError("setgid failed");
+
+    if(ctx.setuid &&
+       (setuid(ctx.user) == -1 ||
+        getuid() != ctx.user ||
+        geteuid() != ctx.user))
+        throw SysError("setuid failed");
+}
+
+
+/* Phase action: reinstall the default disposition for SIGPIPE; without this
+   some programs will randomly say "Broken pipe".  CTX is not used.  Throws
+   SysError if sigaction(2) fails. */
+void restoreSIGPIPEAction(SpawnContext & ctx)
+{
+    struct sigaction defaults, previous;
+    sigemptyset(&defaults.sa_mask);
+    defaults.sa_flags = 0;
+    defaults.sa_handler = SIG_DFL;
+    if(sigaction(SIGPIPE, &defaults, &previous) != 0)
+        throw SysError("resetting SIGPIPE");
+}
+
+
+/* Phase action: when ctx.signalSetupSuccess is set, write a single newline
+   to stderr. */
+void setupSuccessAction(SpawnContext & ctx)
+{
+    if(!ctx.signalSetupSuccess) return;
+    writeFull(STDERR_FILENO, "\n");
+}
+
+
+/* Phase action: execute ctx.program with ctx.args through execvpe(3).  With
+   ctx.inheritEnv set, the entries of ctx.env are applied to the current
+   environment (overwriting existing values) and the result is used;
+   otherwise the new environment consists of exactly the entries of ctx.env.
+   Throws SysError if setenv or execvpe fails. */
+void execAction(SpawnContext & ctx)
+{
+    /* These two are declared at function scope because ENV may point into
+       them and must stay valid until execvpe is called. */
+    Strings envStrs;
+    std::vector<char *> envPtrs;
+    char **env;
+
+    if(!ctx.inheritEnv) {
+        for(const auto & binding : ctx.env)
+            envStrs.push_back(binding.first + "=" + binding.second);
+        envPtrs = stringsToCharPtrs(envStrs);
+        env = envPtrs.data();
+    } else {
+        for(const auto & binding : ctx.env)
+            if(setenv(binding.first.c_str(), binding.second.c_str(), 1) == -1)
+                throw SysError("setenv");
+        env = environ;
+    }
+
+    if(execvpe(ctx.program.c_str(), stringsToCharPtrs(ctx.args).data(), env) == -1)
+        throw SysError(format("executing `%1%'") % ctx.program);
+}
+
+
+/* Return the predefined phase sequence that does not rely on clone(2)
+   namespaces.  Each entry pairs a label, usable with the add/delete/replace
+   phase helpers, with its action; the phases are listed in the order they
+   are meant to run, ending with "exec". */
+Phases getBasicSpawnPhases()
+{
+ return { { "reset_writeToStderr", reset_writeToStderrAction },
+ { "restoreAffinity", restoreAffinityAction },
+ { "setsid", setsidAction },
+ { "earlyIOSetup", earlyIOSetupAction },
+ { "dropAmbientCapabilities", dropAmbientCapabilitiesAction },
+ { "chroot", chrootAction },
+ { "chdir", chdirAction },
+ { "closeMostFDs", closeMostFDsAction },
+ { "setPersonality", setPersonalityAction },
+ { "oomSacrifice", oomSacrificeAction },
+ { "setIDs", setIDsAction },
+ { "restoreSIGPIPE", restoreSIGPIPEAction },
+ { "setupSuccess", setupSuccessAction },
+ { "exec", execAction } };
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWUSER is
+   among ctx.cloneFlags, close and then clear ctx.earlyCloseFDs, and block
+   until the message "go\n" arrives on ctx.setupFD.  Throws SysError if a
+   close fails.  Does nothing when CLONE_ENABLED is false. */
+void usernsInitSyncAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if((ctx.cloneFlags & CLONE_NEWUSER) != 0) {
+ /* Close the earlyCloseFDs before we try reading anything */
+ for(auto i = ctx.earlyCloseFDs.begin(); i != ctx.earlyCloseFDs.end(); i++)
+ if(close(*i) == -1)
+ throw SysError("closing fd");
+ /* Don't try closing them again later */
+ ctx.earlyCloseFDs.clear();
+ /* Wait for the parent process to initialize the UID/GID mapping of
+ our user namespace. */
+ waitForMessage(ctx.setupFD, "go\n");
+ }
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWUSER is
+   among ctx.cloneFlags, call setuid(ctx.usernsUser) if ctx.usernsSetuid is
+   set, then setgid(ctx.usernsGroup) if ctx.usernsSetgid is set -- in that
+   order.  Throws SysError if either call fails.  Does nothing when
+   CLONE_ENABLED is false. */
+void usernsSetIDsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if((ctx.cloneFlags & CLONE_NEWUSER) != 0) {
+ /* Note: 'man capabilities' says that a transition from zero to
+ nonzero uids causes capabilities to be lost, but doesn't say what
+ happens when a transition from an unmapped (possibly zero) uid to a
+ nonzero uid happens. */
+ if(ctx.usernsSetuid)
+ /* Since we presumably have CAP_SETUID, this sets the real,
+ effective, saved, and filesystem uids */
+ if(setuid(ctx.usernsUser) != 0)
+ throw SysError("setuid");
+ if(ctx.usernsSetgid)
+ /* Ditto but with gids */
+ if(setgid(ctx.usernsGroup) != 0)
+ throw SysError("setgid");
+ }
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWNET is among
+   ctx.cloneFlags and ctx.initLoopback is set, set the flags
+   IFF_UP | IFF_LOOPBACK | IFF_RUNNING on the interface named "lo" through a
+   SIOCSIFFLAGS ioctl on a freshly opened datagram socket.  Throws SysError
+   if the socket cannot be opened or the ioctl fails.  Does nothing when
+   CLONE_ENABLED is false. */
+void initLoopbackAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if(((ctx.cloneFlags & CLONE_NEWNET) != 0) && ctx.initLoopback) {
+ AutoCloseFD fd(socket(PF_INET, SOCK_DGRAM, IPPROTO_IP));
+ if (fd == -1) throw SysError("cannot open IP socket");
+
+ /* Only the name and flags fields of the request are filled in. */
+ struct ifreq ifr;
+ strcpy(ifr.ifr_name, "lo");
+ ifr.ifr_flags = IFF_UP | IFF_LOOPBACK | IFF_RUNNING;
+ if (ioctl(fd, SIOCSIFFLAGS, &ifr) == -1)
+ throw SysError("cannot set loopback interface flags");
+
+ fd.close();
+ }
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWUTS is among
+   ctx.cloneFlags, set the host name to ctx.hostname and the domain name to
+   ctx.domainname.  The lengths are taken with strlen, so only the part of
+   each string before its first NUL byte is used.  Throws SysError if either
+   call fails.  Does nothing when CLONE_ENABLED is false. */
+void setHostAndDomainAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if((ctx.cloneFlags & CLONE_NEWUTS) != 0) {
+ if (sethostname(ctx.hostname.c_str(),
+ strlen(ctx.hostname.c_str())) == -1)
+ throw SysError("cannot set host name");
+ if (setdomainname(ctx.domainname.c_str(),
+ strlen(ctx.domainname.c_str())) == -1)
+ throw SysError("cannot set domain name");
+ }
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWNS is among
+   ctx.cloneFlags, remount "/" with MS_REC|MS_PRIVATE.  Throws SysError if
+   the mount call fails.  Compiled to a no-op unless CLONE_ENABLED,
+   HAVE_SYS_MOUNT_H, MS_REC and MS_PRIVATE are all available. */
+void makeFilesystemsPrivateAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_REC) && defined(MS_PRIVATE)
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if((ctx.cloneFlags & CLONE_NEWNS) != 0) {
+ if(mount(0, "/", 0, MS_REC|MS_PRIVATE, 0) == -1)
+ throw SysError("unable to make `/' private mount");
+ }
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWNS is among
+   ctx.cloneFlags and ctx.doChroot is set, turn ctx.chrootRootDir into a
+   mount point: a new tmpfs if ctx.mountTmpfsOnChroot is set, otherwise a
+   bind mount of the directory onto itself.  Throws SysError if the mount
+   fails.  Compiled to a no-op unless CLONE_ENABLED, HAVE_SYS_MOUNT_H and
+   MS_BIND are all available. */
+void makeChrootSeparateFilesystemAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_BIND)
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if(((ctx.cloneFlags & CLONE_NEWNS) != 0) && ctx.doChroot) {
+ /* Bind-mount chroot directory to itself, to treat it as a different
+ filesystem from /, as needed for pivot_root. Alternatively, mount
+ a tmpfs on it. */
+ if(ctx.mountTmpfsOnChroot) {
+ if(mount("none", ctx.chrootRootDir.c_str(), "tmpfs", 0, 0) == -1)
+ throw SysError(format("unable to mount tmpfs on `%1%'") % ctx.chrootRootDir);
+ }
+ else {
+ if(mount(ctx.chrootRootDir.c_str(), ctx.chrootRootDir.c_str(), 0, MS_BIND, 0) == -1)
+ throw SysError(format("unable to bind mount ‘%1%’") % ctx.chrootRootDir);
+ }
+ }
+#endif
+}
+
+
+/* Translate the ST_* bits set in F_FLAGS into the corresponding MS_* mount
+   flags and return their union.  Only pairs for which both the ST_* and the
+   MS_* macro are defined at compile time are translated; all other bits are
+   dropped.  Throws Error when built without HAVE_SYS_MOUNT_H or
+   HAVE_STATVFS. */
+static int statfsToMountFlags(int f_flags)
+{
+#if HAVE_SYS_MOUNT_H && HAVE_STATVFS
+ int ret = 0;
+#if defined(ST_RDONLY) && defined(MS_RDONLY)
+ if((f_flags & ST_RDONLY) != 0) ret |= MS_RDONLY;
+#endif
+#if defined(ST_NOSUID) && defined(MS_NOSUID)
+ if((f_flags & ST_NOSUID) != 0) ret |= MS_NOSUID;
+#endif
+#if defined(ST_NODEV) && defined(MS_NODEV)
+ if((f_flags & ST_NODEV) != 0) ret |= MS_NODEV;
+#endif
+#if defined(ST_NOEXEC) && defined(MS_NOEXEC)
+ if((f_flags & ST_NOEXEC) != 0) ret |= MS_NOEXEC;
+#endif
+#if defined(ST_NOATIME) && defined(MS_NOATIME)
+ if((f_flags & ST_NOATIME) != 0) ret |= MS_NOATIME;
+#endif
+#if defined(ST_NODIRATIME) && defined(MS_NODIRATIME)
+ if((f_flags & ST_NODIRATIME) != 0) ret |= MS_NODIRATIME;
+#endif
+#if defined(ST_RELATIME) && defined(MS_RELATIME)
+ if((f_flags & ST_RELATIME) != 0) ret |= MS_RELATIME;
+#endif
+ return ret;
+#else
+ throw Error("statfsToMountFlags not supported on this platform");
+#endif
+}
+
+
+/* Make SOURCE available at TARGET.
+
+   The mount point is created first, according to the type of SOURCE as
+   reported by lstat: a directory for a directory, an empty file for anything
+   else.  A symbolic link is not mounted at all; a link with the same content
+   is created at TARGET and the function returns.  Otherwise SOURCE is
+   bind-mounted recursively onto TARGET and, if READONLY is set, the mount is
+   then remounted read-only.
+
+   Throws SysError if lstat, mount or statvfs fails, and Error if READONLY is
+   requested but MS_REMOUNT or MS_RDONLY is unavailable.  Note that when
+   HAVE_SYS_MOUNT_H or MS_BIND is missing, the whole body is compiled out and
+   the function silently does nothing. */
+void bindMount(Path source, Path target, bool readOnly)
+{
+#if HAVE_SYS_MOUNT_H && defined(MS_BIND)
+ struct stat st;
+ if (lstat(source.c_str(), &st) == -1)
+ throw SysError(format("getting attributes of path `%1%'") % source);
+
+ if(S_ISDIR(st.st_mode))
+ createDirs(target);
+ else if(S_ISLNK(st.st_mode)) {
+ /* bind-mounts follow symlinks, thus representing their target and not
+ the symlink itself. Create a copy of the symlink instead.*/
+ createDirs(dirOf(target));
+ createSymlink(readLink(source), target);
+ return;
+ }
+ else {
+ /* Anything else gets an empty file to serve as the mount point. */
+ createDirs(dirOf(target));
+ writeFile(target, "");
+ }
+
+ /* This may fail with EINVAL unless we specify MS_REC, specifically if we
+ are in an unprivileged mount namespace and not specifying MS_REC would
+ reveal subtrees that had been covered up. */
+ if (mount(source.c_str(), target.c_str(), 0, MS_BIND|MS_REC, 0) == -1)
+ throw SysError(format("bind mount from `%1%' to `%2%' failed") % source % target);
+ if(readOnly) {
+#if defined(MS_REMOUNT) && defined(MS_RDONLY)
+ /* Extra flags passed with MS_BIND are ignored, hence the extra
+ MS_REMOUNT. */
+ unsigned long mount_flags = MS_BIND | MS_REMOUNT | MS_RDONLY;
+ /* MS_BIND | MS_REMOUNT sets all mountpoint flags, so we may get EPERM
+ unless we preserve the other flags (for example because it would
+ result in trying to clear the nosuid flag). */
+#if HAVE_STATVFS
+ struct statvfs stvfs;
+ if(statvfs(target.c_str(), &stvfs) == -1)
+ throw SysError(format("statvfs of `%1%'") % target);
+ mount_flags |= statfsToMountFlags(stvfs.f_flag);
+#endif
+
+ if (mount(source.c_str(), target.c_str(), 0, mount_flags, 0) == -1)
+ throw SysError(format("read-only remount of `%1%' failed") % target);
+#else
+ throw Error("remounting read-only is not supported on this platform");
+#endif
+ }
+#endif
+}
+
+
+/* For every entry of FILESINCHROOT, bind-mount the entry's value (a path
+   outside the chroot) at CHROOTROOTDIR followed by the entry's key.  Entries
+   whose key is a member of READONLYFILES are mounted read-only.  Throws
+   Error when built without HAVE_SYS_MOUNT_H or MS_BIND. */
+void mountIntoChroot(std::map<Path, Path> filesInChroot,
+                     set<Path> readOnlyFiles,
+                     Path chrootRootDir)
+{
+#if HAVE_SYS_MOUNT_H && defined(MS_BIND)
+    for(const auto & entry : filesInChroot) {
+        const Path & inside = entry.first;
+        bool wantReadOnly = readOnlyFiles.count(inside) != 0;
+        bindMount(entry.second, chrootRootDir + inside, wantReadOnly);
+    }
+#else
+    throw Error("bind mounting not supported on this platform");
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWNS is among
+   ctx.cloneFlags and ctx.doChroot is set, call mountIntoChroot() with
+   ctx.filesInChroot, ctx.readOnlyFilesInChroot and ctx.chrootRootDir.
+   Compiled to a no-op unless CLONE_ENABLED, HAVE_SYS_MOUNT_H and MS_BIND are
+   all available. */
+void mountIntoChrootAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H && defined(MS_BIND)
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if((ctx.cloneFlags & CLONE_NEWNS) != 0 && ctx.doChroot) {
+ mountIntoChroot(ctx.filesInChroot, ctx.readOnlyFilesInChroot, ctx.chrootRootDir);
+ }
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWNS is among
+   ctx.cloneFlags and ctx.mountProc is set, create "/proc" (below
+   ctx.chrootRootDir if ctx.doChroot is set) and mount a proc filesystem on
+   it.  Throws SysError if the mount fails.  Compiled to a no-op unless
+   CLONE_ENABLED and HAVE_SYS_MOUNT_H hold. */
+void mountProcAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0 || !ctx.mountProc) return;
+
+    Path target = ctx.doChroot ? ctx.chrootRootDir : "";
+    target += "/proc";
+    createDirs(target);
+    if(mount("none", target.c_str(), "proc", 0, 0) == -1)
+        throw SysError(format("mounting `%1%'") % target);
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWNS is among
+   ctx.cloneFlags and ctx.mountDevshm is set, create "/dev/shm" (below
+   ctx.chrootRootDir if ctx.doChroot is set) and mount a tmpfs on it.  Throws
+   SysError if the mount fails.  Compiled to a no-op unless CLONE_ENABLED and
+   HAVE_SYS_MOUNT_H hold. */
+void mountDevshmAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+    CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+    if((ctx.cloneFlags & CLONE_NEWNS) == 0 || !ctx.mountDevshm) return;
+
+    Path target = ctx.doChroot ? ctx.chrootRootDir : "";
+    target += "/dev/shm";
+    createDirs(target);
+    if(mount("none", target.c_str(), "tmpfs", 0, 0) == -1)
+        throw SysError(format("mounting `%1%'") % target);
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWNS is among
+   ctx.cloneFlags and ctx.maybeMountDevpts is set, and "/dev/ptmx" does not
+   already exist (below ctx.chrootRootDir if ctx.doChroot is set): mount a
+   new devpts instance on "/dev/pts", create "/dev/ptmx" as a symlink to
+   "/dev/pts/ptmx", and chmod "/dev/pts/ptmx" to 0666.  Throws SysError if
+   the mount or chmod fails.  Compiled to a no-op unless CLONE_ENABLED and
+   HAVE_SYS_MOUNT_H hold. */
+void mountDevptsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if((ctx.cloneFlags & CLONE_NEWNS) != 0 && ctx.maybeMountDevpts) {
+ Path chroot = (ctx.doChroot ? ctx.chrootRootDir : "");
+ Path target = chroot + "/dev/pts";
+ /* An existing /dev/ptmx means there is nothing to do. */
+ if(pathExists(chroot + "/dev/ptmx")) return;
+ createDirs(target);
+ if(mount("none", target.c_str(), "devpts", 0, "newinstance,mode=0620") == -1)
+ throw SysError(format("mounting `%1%'") % target);
+ createSymlink("/dev/pts/ptmx", chroot + "/dev/ptmx");
+ /* Make sure /dev/pts/ptmx is world-writable. With some Linux
+ versions, it is created with permissions 0. */
+ Path targetPtmx = chroot + "/dev/pts/ptmx";
+ if (chmod(targetPtmx.c_str(), 0666) == -1)
+ throw SysError(format("setting permissions on `%1%'") % targetPtmx);
+ }
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When CLONE_NEWNS is among
+   ctx.cloneFlags and ctx.doChroot is set, make ctx.chrootRootDir the root
+   directory through the following steps, each of which throws SysError on
+   failure: chdir into it, create a "real-root" directory, pivot_root with
+   the old root placed on "real-root", chroot to ".", lazily unmount
+   "real-root", and remove the directory again.  Compiled to a no-op unless
+   CLONE_ENABLED and HAVE_SYS_MOUNT_H hold. */
+void pivotRootAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if((ctx.cloneFlags & CLONE_NEWNS) != 0 && ctx.doChroot) {
+ if (chdir(ctx.chrootRootDir.c_str()) == -1)
+ throw SysError(format("cannot change directory to '%1%'") % ctx.chrootRootDir);
+
+ /* The directory is created with mode 0; it only serves as a mount
+ point for the old root. */
+ if (mkdir("real-root", 0) == -1)
+ throw SysError("cannot create real-root directory");
+
+ if (pivot_root(".", "real-root") == -1)
+ throw SysError(format("cannot pivot old root directory onto '%1%'") % (ctx.chrootRootDir + "/real-root"));
+
+ if (chroot(".") == -1)
+ throw SysError(format("cannot change root directory to '%1%'") % ctx.chrootRootDir);
+
+ if (umount2("real-root", MNT_DETACH) == -1)
+ throw SysError("cannot unmount real root filesystem");
+
+ if (rmdir("real-root") == -1)
+ throw SysError("cannot remove real-root directory");
+ }
+#endif
+}
+
+
+/* Rewrite MAP line by line: every line is split on spaces, and a line made
+   of its first field, its first field again, and its third field is emitted,
+   each followed by a newline.  The caller feeds this the contents of
+   /proc/self/uid_map and gid_map.  A line with fewer than three fields makes
+   std::vector::at throw. */
+string idMapToIdentityMap(const string & map)
+{
+    string identity;
+    for(const string & line : tokenizeString<std::vector<string> >(map, "\n")) {
+        std::vector<string> fields =
+            tokenizeString<std::vector<string> >(line, " ");
+        const string & first = fields.at(0);
+        const string & count = fields.at(2);
+        identity += first + " " + first + " " + count + "\n";
+    }
+    return identity;
+}
+
+
+/* Call unshare(FLAGS) in the calling process and then install UIDMAP and
+   GIDMAP as its id maps, writing "allow" or "deny" to its setgroups file
+   according to ALLOWSETGROUPS.
+
+   Initializing a user namespace with more than one id mapped requires
+   capabilities in the *parent* user namespace, which may not even have any
+   processes in it after unshare is called.  So fork a helper child before
+   unsharing and have it do the initialization.
+
+   Throws SysError if fork, unshare or waitpid fails, and Error if the helper
+   is killed by a signal or exits unsuccessfully.  Does nothing when
+   CLONE_ENABLED is false. */
+void unshareAndInitUserns(int flags, const string & uidMap,
+                          const string & gidMap, bool allowSetgroups)
+{
+#if CLONE_ENABLED
+    pid_t pid_ = getpid();
+    string pid = std::to_string(pid_);
+    Pipe toChild;
+    Pipe fromChild;
+    toChild.create();
+    fromChild.create();
+    pid_t child = fork();
+    if(child == -1)
+        throw SysError("creating child process");
+    if(child == 0) {
+        try {
+            toChild.writeSide.close();
+            fromChild.readSide.close();
+            /* Wait until the parent has unshared before touching its maps. */
+            waitForMessage(toChild.readSide, "ready\n");
+            writeFile("/proc/" + pid + "/uid_map", uidMap);
+            writeFile("/proc/" + pid + "/setgroups",
+                      allowSetgroups ? "allow" : "deny");
+            writeFile("/proc/" + pid + "/gid_map", gidMap);
+            writeFull(fromChild.writeSide, (unsigned char*)"go\n", 3);
+        } catch(...) {
+            /* Don't unwind the stack in case of exception, halt
+             * immediately. */
+            _exit(1);
+        }
+        _exit(EXIT_SUCCESS);
+    } else {
+        toChild.readSide.close();
+        fromChild.writeSide.close();
+        if(unshare(flags) == -1)
+            throw SysError("unshare");
+        writeFull(toChild.writeSide, (unsigned char*)"ready\n", 6);
+        waitForMessage(fromChild.readSide, "go\n");
+        int status;
+        while(waitpid(child, &status, 0) == -1) {
+            if(errno != EINTR)
+                throw SysError("reaping userns init process");
+        }
+        /* WEXITSTATUS is only meaningful for a child that exited normally,
+           so report death by signal separately. */
+        if(WIFSIGNALED(status))
+            throw Error(format("userns init child was killed by signal %1%") % WTERMSIG(status));
+        if(!(WIFEXITED(status) != 0 && WEXITSTATUS(status) == EXIT_SUCCESS))
+            throw Error(format("userns init child exited with status %1%") % WEXITSTATUS(status));
+    }
+#endif
+}
+
+
+/* Phase action for clone-based spawning.  SCTX is cast to
+   CloneSpawnContext, so it must actually be one.  When ctx.lockMounts is
+   set, enter new user and mount namespaces through unshareAndInitUserns().
+   The id maps are either identity versions of the current
+   /proc/self/uid_map and gid_map (ctx.lockMountsMapAll) or a single-entry
+   identity map of the current uid and gid.  Afterwards, check that "/proc"
+   can no longer be unmounted.  Compiled to a no-op unless CLONE_ENABLED and
+   HAVE_SYS_MOUNT_H hold. */
+void lockMountsAction(SpawnContext & sctx)
+{
+#if CLONE_ENABLED && HAVE_SYS_MOUNT_H
+ CloneSpawnContext & ctx = (CloneSpawnContext &) sctx;
+ if(ctx.lockMounts) {
+ string uidMap;
+ string gidMap;
+ if(ctx.lockMountsMapAll) {
+ string oldUidMap = readFile("/proc/self/uid_map", true);
+ string oldGidMap = readFile("/proc/self/gid_map", true);
+ uidMap = idMapToIdentityMap(oldUidMap);
+ gidMap = idMapToIdentityMap(oldGidMap);
+ } else {
+ string uid = std::to_string(getuid());
+ string gid = std::to_string(getgid());
+ uidMap = uid + " " + uid + " 1";
+ gidMap = gid + " " + gid + " 1";
+ }
+ unshareAndInitUserns(CLONE_NEWNS | CLONE_NEWUSER,
+ uidMap, gidMap, ctx.lockMountsAllowSetgroups);
+ /* Check that mounts inherited in our new mount namespace are "locked"
+ together and cannot be separated from within our mount namespace.
+ Since umount(2) is documented to fail with EINVAL when attempting
+ to unmount one of the mounts that are locked together, check that
+ this is what we get. */
+ int ret = umount("/proc");
+ /* NOTE(review): this check is an assert, so it aborts the process on
+ failure and is compiled out entirely if NDEBUG is defined -- confirm
+ that the daemon is never built that way. */
+ assert(ret == -1 && errno == EINVAL);
+ }
+#endif
+}
+
+
+/* Return the predefined phase sequence for children created with clone(2).
+   Compared with the basic sequence it adds the user-namespace, network,
+   host name and mount phases, and registers pivotRootAction under the
+   "chroot" label.  The phases are listed in the order they are meant to run,
+   ending with "exec".  Throws Error when CLONE_ENABLED is false. */
+Phases getCloneSpawnPhases()
+{
+#if CLONE_ENABLED
+ return { { "reset_writeToStderr", reset_writeToStderrAction },
+ { "usernsInitSync", usernsInitSyncAction },
+ { "usernsSetIDs", usernsSetIDsAction },
+ { "restoreAffinity", restoreAffinityAction },
+ { "setsid", setsidAction },
+ { "earlyIOSetup", earlyIOSetupAction },
+ { "dropAmbientCapabilities", dropAmbientCapabilitiesAction },
+ { "initLoopback", initLoopbackAction },
+ { "setHostAndDomain", setHostAndDomainAction },
+ { "makeFilesystemsPrivate", makeFilesystemsPrivateAction },
+ { "makeChrootSeparateFilesystem", makeChrootSeparateFilesystemAction },
+ { "mountIntoChroot", mountIntoChrootAction },
+ { "mountProc", mountProcAction },
+ { "mountDevshm", mountDevshmAction },
+ { "mountDevpts", mountDevptsAction },
+ { "chroot", pivotRootAction },
+ { "chdir", chdirAction },
+ { "closeMostFDs", closeMostFDsAction },
+ { "setPersonality", setPersonalityAction },
+ { "oomSacrifice", oomSacrificeAction },
+ /* Being put in a user namespace with only the current ids mapped
+ would tend to prevent switching to other ones, but if this
+ comes after setIDs then the per-process "dumpable" flag may be
+ reset, which will cause /proc/self to become root-owned,
+ making /proc/self/uid_map inaccessible. If you need
+ lockMounts to preserve the id mappings, and you have the
+ necessary capabilities in the parent user namespace, set
+ CloneSpawnContext.lockMountsMapAll = true. */
+ { "lockMounts", lockMountsAction },
+ { "setIDs", setIDsAction },
+ { "restoreSIGPIPE", restoreSIGPIPEAction },
+ { "setupSuccess", setupSuccessAction },
+ { "exec", execAction }};
+#else
+ throw Error("clone not supported on this platform");
+#endif
+}
+
+
+/* Run the phases of CTX in order, starting from the first, keeping
+   ctx.currentPhase equal to the index of the phase being run.  This function
+   does not return: the loop only ends through an exception, either thrown by
+   a phase or by std::vector::at once the index runs past the last phase.
+   The exception's message, prefixed with the failing phase's label when the
+   index is still in range, is written to stderr and the process exits with
+   status 1. */
+void runChildSetup(SpawnContext & ctx)
+{
+ ctx.currentPhase = 0;
+ try {
+ /* Should not return regularly from this */
+ while(true) {
+ ctx.phases.at(ctx.currentPhase).action(ctx);
+ ctx.currentPhase++;
+ }
+ } catch (std::exception & e) {
+ try {
+ writeFull(STDERR_FILENO,
+ "while setting up the child process: " +
+ (ctx.currentPhase < (ssize_t)ctx.phases.size() ?
+ "in phase " + ctx.phases[ctx.currentPhase].label + ": " : "") +
+ string(e.what()) + "\n");
+ } catch (std::exception & e2) {
+ /* Reporting the error failed as well; just exit. */
+ _exit(1);
+ }
+ _exit(1);
+ }
+ abort(); /* Should never be reached */
+}
+
+
+/* Entry point with the int (*)(void *) signature that clone(2) expects.
+   DATA must point to a SpawnContext, which is handed to runChildSetup().
+   The return statement is only reached if runChildSetup() returns. */
+int runChildSetupEntry(void *data)
+{
+ runChildSetup(* (SpawnContext *)data);
+ return 1;
+}
+
+
+/* Create a child with clone(2) using ctx.cloneFlags; the child runs
+   runChildSetupEntry() with a pointer to CTX as its argument.  The child's
+   stack is a 32 KiB array local to this function, and the address passed to
+   clone is near its upper end, rounded down to a multiple of 16.  Returns
+   the value returned by clone; throws SysError if that is -1. */
+int cloneChild(CloneSpawnContext & ctx)
+{
+ char stack[32 * 1024];
+ /* Ensure proper alignment on the stack. On aarch64, it has to be 16
+ bytes. */
+ char *alignedStack = (char *)(((uintptr_t)stack + sizeof(stack) - 8) & ~(uintptr_t)0xf);
+ int ret = clone(runChildSetupEntry, alignedStack, ctx.cloneFlags, (void *) &ctx);
+ if(ret == -1)
+ throw SysError("clone");
+ return ret;
+}
+
+}
diff --git a/nix/libutil/spawn.hh b/nix/libutil/spawn.hh
new file mode 100644
index 0000000000..edc528312d
--- /dev/null
+++ b/nix/libutil/spawn.hh
@@ -0,0 +1,164 @@
+#pragma once
+
+#include <util.hh>
+#include <map>
+#include <stddef.h>
+
+namespace nix {
+/* Only declared here; the definition follows further down. */
+struct SpawnContext;
+
+/* Function type shared by all phase actions.  The context is passed by
+   reference, so an action can both read it and modify it. */
+using Action = void (SpawnContext & ctx);
+
+/* A single setup step: a label identifying it, and the action to run. */
+struct Phase {
+    string label;
+    Action * action;
+};
+
+/* An ordered sequence of phases. */
+using Phases = std::vector<Phase>;
+
+/* Common structure read from / written to by setup phases in a newly-spawned
+   child process.  Configure this to determine which per-process or
+   per-thread attributes should be set.
+
+   Value members that come paired with a boolean flag (persona, user,
+   group, ...) are presumably only read when that flag is true -- confirm in
+   spawn.cc.  They are nevertheless given a defined default, so that a
+   freshly constructed context can be copied or inspected without reading
+   indeterminate values.  */
+struct SpawnContext {
+    ssize_t currentPhase = 0; /* Index into phases of the phase being run;
+                               * maintained by runChildSetup. */
+    Phases phases;
+    Strings args; /* Will be passed as-is to execve, does not implicitly add
+                   * program basename as argv[0]! */
+    Path program;
+    bool inheritEnv = true; /* True to use the current environment after env
+                             * has been applied to it, false to use strictly
+                             * env. */
+    std::map<string, string> env;
+    bool setPersona = false;
+    int persona = 0;
+    int logFD = -1; /* -1 to keep stdout and stderr */
+    set<int> earlyCloseFDs; /* Typically for closing inherited unused pipe or
+                             * socket ends to prevent hangs when reading or
+                             * writing. */
+    bool closeMostFDs = false;
+    set<int> preserveFDs; /* 0, 1, and 2 are always implicitly preserved. */
+    bool setStdin = false;
+    int stdinFD = -1; /* fd or -1 */
+    Path stdinFile; /* used if stdinFD == -1 */
+    bool setuid = false;
+    uid_t user = (uid_t) -1; /* -1 is not a usable id, so a forgotten
+                              * assignment cannot silently select one. */
+    bool setgid = false;
+    gid_t group = (gid_t) -1; /* Same convention as user. */
+    bool setSupplementaryGroups = false;
+    std::vector<gid_t> supplementaryGroups;
+    bool setsid = false;
+    bool oomSacrifice = false; /* Whether to attempt to offer the child
+                                * process to the OOM killer if possible. */
+    bool setcwd = false;
+    Path cwd;
+    bool signalSetupSuccess = false; /* Whether the parent is waiting for a
+                                      * message that setup succeeded.  By
+                                      * default success is signaled by
+                                      * writing a single newline to stderr. */
+    bool dropAmbientCapabilities = false; /* Whether to drop ambient
+                                           * capabilities if on a system that
+                                           * supports them. */
+    bool doChroot = false;
+    Path chrootRootDir;
+    void * extraData = nullptr; /* Extra user data */
+};
+
+/* Like SpawnContext, but with extra fields for setting up Linux namespaces,
+   as created by clone or unshare.  usernsUser and usernsGroup get a defined
+   default even though they come paired with a flag, so that a freshly
+   constructed context never holds indeterminate values.  */
+struct CloneSpawnContext : SpawnContext {
+    int cloneFlags = 0; /* Passed to clone by cloneChild. */
+    std::map<Path, Path> filesInChroot; /* map from path inside chroot to
+                                         * path outside of chroot */
+    set<Path> readOnlyFilesInChroot;
+    bool mountTmpfsOnChroot = false; /* req. CLONE_NEWNS and doChroot */
+    bool mountProc = false;
+    bool mountDevshm = false;
+    bool maybeMountDevpts = false; /* Only mounted if /dev/ptmx doesn't exist
+                                    * after any chroot, if applicable. */
+    bool lockMounts = false; /* Whether to lock mounts by creating a fresh
+                              * user and mount namespace, see
+                              * mount_namespaces(7). */
+    bool lockMountsMapAll = false; /* Whether to map all currently-mapped
+                                      users and groups when locking mounts or
+                                      only the current ones. */
+    bool lockMountsAllowSetgroups = false;
+    int setupFD = -1; /* Used for userns init sync and other stuff */
+    string hostname; /* Requires CLONE_NEWUTS */
+    string domainname; /* Same */
+    bool initLoopback = false; /* Also requires CLONE_NEWNET in cloneFlags */
+    /* These may be used if CLONE_NEWUSER in cloneFlags.  These are to be
+       used when an id other than the current uid/gid has been mapped into the
+       child's user namespace, and it now needs to setuid/setgid to an id
+       that is mapped. */
+    bool usernsSetuid = false;
+    uid_t usernsUser = (uid_t) -1; /* -1 is not a usable id, so a forgotten
+                                    * assignment cannot silently select one. */
+    bool usernsSetgid = false;
+    gid_t usernsGroup = (gid_t) -1; /* Same convention as usernsUser. */
+};
+
+/* Helpers for editing a phase list in place.  Phases are identified by
+   their label.  An Action parameter is a function type, so callers simply
+   pass the name of an action function.
+   NOTE(review): what happens when a label is missing or occurs more than
+   once is decided in spawn.cc -- check there before relying on it.  */
+
+/* Presumably inserts the phase (ADDLABEL, ADDACTION) right after the phase
+   labelled AFTERLABEL -- confirm in spawn.cc. */
+void addPhaseAfter(Phases & phases, string afterLabel, string addLabel, Action addAction);
+
+/* Presumably inserts the phase (ADDLABEL, ADDACTION) right before the phase
+   labelled BEFORELABEL -- confirm in spawn.cc. */
+void addPhaseBefore(Phases & phases, string beforeLabel, string addLabel, Action addAction);
+
+/* Presumably adds the phase (ADDLABEL, ADDACTION) at the front of PHASES
+   -- confirm in spawn.cc. */
+void prependPhase(Phases & phases, string addLabel, Action addAction);
+
+/* Presumably adds the phase (ADDLABEL, ADDACTION) at the end of PHASES
+   -- confirm in spawn.cc. */
+void appendPhase(Phases & phases, string addLabel, Action addAction);
+
+/* Presumably removes the phase labelled DELLABEL -- confirm in spawn.cc. */
+void deletePhase(Phases & phases, string delLabel);
+
+/* Presumably keeps the phase labelled REPLACELABEL in place but swaps its
+   action for NEWACTION -- confirm in spawn.cc. */
+void replacePhase(Phases & phases, string replaceLabel, Action newAction);
+
+/* Actions performed by the predefined phase sequences, exposed individually
+   so that they can be used in custom phase lists.  Each line declares a
+   function of type Action, i.e. one taking a SpawnContext & and returning
+   void. */
+Action reset_writeToStderrAction;
+Action restoreAffinityAction;
+Action setsidAction;
+Action earlyIOSetupAction;
+Action dropAmbientCapabilitiesAction;
+Action chrootAction;
+Action chdirAction;
+Action closeMostFDsAction;
+Action setPersonalityAction;
+Action oomSacrificeAction;
+Action setIDsAction;
+Action restoreSIGPIPEAction;
+Action setupSuccessAction;
+Action execAction;
+
+/* Return the predefined basic phase sequence.
+   NOTE(review): its contents and order are defined in spawn.cc. */
+Phases getBasicSpawnPhases();
+
+/* Presumably bind-mounts SOURCE onto TARGET, read-only if READONLY is true
+   -- confirm in spawn.cc. */
+void bindMount(Path source, Path target, bool readOnly);
+
+/* Presumably makes the entries of FILESINCHROOT available under
+   CHROOTROOTDIR, treating those listed in READONLYFILES as read-only
+   -- confirm in spawn.cc.  CloneSpawnContext::filesInChroot documents the
+   map as going from the path inside the chroot to the path outside of it.
+   Note that all three arguments are taken by value, so each call copies
+   them. */
+void mountIntoChroot(std::map<Path, Path> filesInChroot,
+ set<Path> readOnlyFiles,
+ Path chrootRootDir);
+
+/* Additional actions for children set up with Linux namespaces.  They take
+   the context as a SpawnContext &, like every Action.
+   NOTE(review): they presumably expect it to actually be a
+   CloneSpawnContext -- confirm in spawn.cc. */
+Action usernsInitSyncAction;
+Action usernsSetIDsAction;
+Action initLoopbackAction;
+Action setHostAndDomainAction;
+Action makeFilesystemsPrivateAction;
+Action makeChrootSeparateFilesystemAction;
+Action mountIntoChrootAction;
+Action mountProcAction;
+Action mountDevshmAction;
+Action mountDevptsAction;
+Action pivotRootAction;
+Action lockMountsAction;
+
+/* Return the predefined phase sequence for clone-based spawning.  The
+   sequence ends with the phases "lockMounts", "setIDs", "restoreSIGPIPE",
+   "setupSuccess" and "exec", in that order.  On platforms where clone
+   support is compiled out, this throws Error instead. */
+Phases getCloneSpawnPhases();
+
+/* Helpers */
+
+/* NOTE(review): presumably takes text in the format of /proc/PID/uid_map or
+   gid_map and returns a map in which every id maps to itself -- confirm in
+   spawn.cc. */
+string idMapToIdentityMap(const string & map);
+
+/* NOTE(review): presumably calls unshare with FLAGS and then installs
+   UIDMAP and GIDMAP as the id mappings of the new user namespace, with
+   ALLOWSETGROUPS controlling whether setgroups stays permitted -- confirm
+   in spawn.cc. */
+void unshareAndInitUserns(int flags, const string & uidMap,
+ const string & gidMap, bool allowSetgroups);
+
+/* Run the phases of ctx in order, starting from the first.  A
+ * std::exception thrown by a phase is caught and reported on stderr,
+ * together with the label of the failing phase, after which the process
+ * exits with status 1.  This function is not expected to return. */
+void runChildSetup(SpawnContext & ctx);
+
+/* Helper to call runChildSetup that can be passed to the variant of clone
+ * that expects a callback.  data must point to the SpawnContext to use.
+ * Returns 1 in the unexpected case that runChildSetup returns. */
+int runChildSetupEntry(void *data);
+
+/* Create a new process using clone that will immediately call runChildSetup
+ * with the provided CloneSpawnContext.  Return the pid of the new process.
+ * Throws SysError if clone fails. */
+int cloneChild(CloneSpawnContext & ctx);
+}
diff --git a/nix/libutil/util.cc b/nix/libutil/util.cc
index c406325cdc..e71e6c170a 100644
--- a/nix/libutil/util.cc
+++ b/nix/libutil/util.cc
@@ -724,6 +724,18 @@ string drainFD(int fd)
}
+/* Wait on FD until MESSAGE has been read.  Exactly strlen(MESSAGE) bytes
+   are requested from FD; if what was read differs from MESSAGE, throw
+   Error. */
+void waitForMessage(int fd, const char *message)
+{
+    size_t size = strlen(message);
+    /* Use a string as the buffer rather than a variable-length array: VLAs
+       are not standard C++, some compilers refuse to initialize them, and
+       an empty MESSAGE would ask for a zero-length one. */
+    string received(size, '\0');
+    readFull(fd, (unsigned char *) &received[0], size);
+    /* received holds exactly SIZE bytes, so this is true only when every
+       byte matches MESSAGE. */
+    if (received != message)
+        throw Error(format("did not receive message '%1%' on file descriptor %2%")
+            % message % fd);
+}
+
+
//////////////////////////////////////////////////////////////////////
@@ -1140,6 +1152,13 @@ void closeOnExec(int fd)
throw SysError("setting close-on-exec flag");
}
+/* Clear FD_CLOEXEC on FD while leaving its other descriptor flags as they
+   are.  Throws SysError if the flags cannot be read or written. */
+void keepOnExec(int fd)
+{
+    int flags = fcntl(fd, F_GETFD, 0);
+    if (flags == -1)
+        throw SysError("clearing close-on-exec flag");
+    if (fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC) == -1)
+        throw SysError("clearing close-on-exec flag");
+}
//////////////////////////////////////////////////////////////////////
diff --git a/nix/libutil/util.hh b/nix/libutil/util.hh
index a07c3be6eb..ab2395e959 100644
--- a/nix/libutil/util.hh
+++ b/nix/libutil/util.hh
@@ -173,6 +173,8 @@ MakeError(EndOfFile, Error)
/* Read a file descriptor until EOF occurs. */
string drainFD(int fd);
+/* Read strlen(MESSAGE) bytes from FD and throw Error unless they are equal
+   to MESSAGE. */
+void waitForMessage(int fd, const char *message);
+
/* Automatic cleanup of resources. */
@@ -300,6 +302,9 @@ void closeMostFDs(const set<int> & exceptions);
/* Set the close-on-exec flag for the given file descriptor. */
void closeOnExec(int fd);
+/* Clear the close-on-exec flag for the given file descriptor.  Throws
+   SysError if the descriptor's flags cannot be read or updated. */
+void keepOnExec(int fd);
+
/* Common initialisation performed in child processes. */
void commonChildInit(Pipe & logPipe);
diff --git a/nix/local.mk b/nix/local.mk
index 54976a5741..9f21550af2 100644
--- a/nix/local.mk
+++ b/nix/local.mk
@@ -56,7 +56,8 @@ libutil_a_SOURCES = \
%D%/libutil/affinity.cc \
%D%/libutil/serialise.cc \
%D%/libutil/util.cc \
- %D%/libutil/hash.cc
+ %D%/libutil/hash.cc \
+ %D%/libutil/spawn.cc
libutil_headers = \
%D%/libutil/affinity.hh \
@@ -64,7 +65,8 @@ libutil_headers = \
%D%/libutil/serialise.hh \
%D%/libutil/util.hh \
%D%/libutil/archive.hh \
- %D%/libutil/types.hh
+ %D%/libutil/types.hh \
+ %D%/libutil/spawn.hh
libutil_a_CPPFLAGS = \
-I$(top_builddir)/nix \