-rw-r--r--  Makefile.am                          |   1
-rwxr-xr-x  build-aux/extract-syscall-ranges.sh  |  80
-rw-r--r--  config-daemon.ac                     |   4
-rw-r--r--  nix/libstore/build.cc                | 219
-rw-r--r--  nix/libutil/seccomp.cc               | 162
-rw-r--r--  nix/libutil/seccomp.hh               | 222
-rw-r--r--  nix/libutil/spawn.cc                 |  36
-rw-r--r--  nix/libutil/spawn.hh                 |  10
-rw-r--r--  nix/local.mk                         |   6
9 files changed, 738 insertions(+), 2 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index a4737fe9d5..8b33734f38 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -746,6 +746,7 @@ EXTRA_DIST += \
build-aux/compile-as-derivation.scm \
build-aux/config.rpath \
build-aux/convert-xref.scm \
+ build-aux/extract-syscall-ranges.sh \
build-aux/generate-authors.scm \
build-aux/git-version-gen \
build-aux/mdate-from-git.scm \
diff --git a/build-aux/extract-syscall-ranges.sh b/build-aux/extract-syscall-ranges.sh
new file mode 100755
index 0000000000..3826fc25fe
--- /dev/null
+++ b/build-aux/extract-syscall-ranges.sh
@@ -0,0 +1,80 @@
+#!/bin/sh
+
+if test "$#" -lt 1 || test "$#" -gt 2
+then
+ echo "Usage: extract-syscall-ranges.sh FILENAME [abiname_regex]"
+ exit 1
+fi
+
+numbers_to_ranges()
+{
+ if ! read number
+ then
+ printf '{}\n'
+ return
+ fi
+ low="$number"
+ high="$number"
+ while true
+ do
+ if read number
+ then
+ if test "$number" -eq "$((high + 1))"
+ then
+ high="$number"
+ else
+ break
+ fi
+ else
+ printf '{ {%d, %d} }\n' "$low" "$high"
+ return
+ fi
+ done
+ printf '{ {%d, %d}' "$low" "$high"
+ low="$number"
+ high="$number"
+ while true
+ do
+ if read number
+ then
+ if test "$number" -eq "$((high + 1))"
+ then
+ high="$number"
+ else
+ printf ', {%d, %d}' "$low" "$high"
+ low="$number"
+ high="$number"
+ fi
+ else
+ printf ', {%d, %d} }\n' "$low" "$high"
+ return
+ fi
+ done
+}
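+
+# A quick sanity check of numbers_to_ranges (hypothetical input; real input
+# comes from the kernel's *.tbl files):
+#   printf '0\n1\n2\n5\n7\n8\n' | numbers_to_ranges
+# prints:
+#   { {0, 2}, {5, 5}, {7, 8} }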
+
+if test "$#" -eq 2
+then
+ abi_regex="$2"
+ getnumbers()
+ {
+ # delete comment lines and space-only lines
+ sed -e '/^[[:space:]]*#/d ; /^[[:space:]]*$/d' |
+ # filter to only include lines with target abi or "common"
+ grep -E "^[0-9]+[[:space:]]+(common|(${abi_regex}))[[:space:]]" |
+ # limit to only syscall number
+ sed -e 's/\([0-9]\+\).*/\1/g'
+ }
+else
+ getnumbers()
+ {
+ # delete comment lines and space-only lines and limit to syscall number
+ sed -e '/^[[:space:]]*#/d ; /^[[:space:]]*$/d ; s/\([0-9]\+\).*/\1/g'
+ }
+fi
+
+getnumbers < "$1" |
+ sort -n |
+ uniq | # Yes, there are duplicate syscall entries...
+ numbers_to_ranges
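+
+# The printed initializer (for example "{ {0, 335}, {424, 466} }" in the
+# plain x86_64 case) is meant to be pasted into a NATIVE_SYSCALL_RANGES
+# definition in nix/libutil/seccomp.hh.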
+
+
diff --git a/config-daemon.ac b/config-daemon.ac
index fe73b893ec..2929664140 100644
--- a/config-daemon.ac
+++ b/config-daemon.ac
@@ -152,6 +152,10 @@ if test "x$guix_build_daemon" = "xyes"; then
AC_PATH_PROG([SLIRP4NETNS], [slirp4netns], [slirp4netns])
AC_DEFINE_UNQUOTED([SLIRP4NETNS], ["$SLIRP4NETNS"],
[Path to the slirp4netns program, if any.])
+
+ dnl needed for inspecting 64-bit system call arguments in seccomp's Berkeley
+ dnl Packet Filter VM, which only directly operates on 32-bit words.
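+ dnl (nix/libutil/seccomp.cc checks the resulting WORDS_BIGENDIAN define to
+ dnl decide which half of a 64-bit argument each 32-bit comparison expects.)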
+ AC_C_BIGENDIAN
fi
AM_CONDITIONAL([HAVE_LIBBZ2], [test "x$HAVE_LIBBZ2" = "xyes"])
diff --git a/nix/libstore/build.cc b/nix/libstore/build.cc
index 1a688f3b56..eee3a33a58 100644
--- a/nix/libstore/build.cc
+++ b/nix/libstore/build.cc
@@ -85,6 +85,13 @@
/* This header isn't documented in 'man netdevice', but there doesn't seem to
be any other way to get 'struct in6_ifreq'... */
#include <linux/ipv6.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <seccomp.hh>
+
+/* Set to 1 to debug the seccomp filter. */
+#define DEBUG_SECCOMP_FILTER 0
+
#endif
#endif
@@ -1815,6 +1822,7 @@ static void setupTap(int send_fd_socket, bool ipv6Enabled)
sendFD(send_fd_socket, tapfd);
}
+
struct ChrootBuildSpawnContext : CloneSpawnContext {
bool ipv6Enabled = false;
};
@@ -1933,6 +1941,212 @@ static void remapIdsTo0Action(SpawnContext & sctx)
}
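+
+/* Build the BPF program installed for slirp4netns: deny system calls from
+ * foreign architectures, restrict socket() and socketpair() to AF_INET and
+ * AF_INET6, block socketcall and ptrace, and, when NATIVE_SYSCALL_RANGES is
+ * available, deny any system call number outside those pinned ranges. */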
+static std::vector<struct sock_filter> slirpSeccompFilter()
+{
+ std::vector<struct sock_filter> out;
+ struct sock_filter allow = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW);
+ struct sock_filter deny = BPF_STMT(BPF_RET | BPF_K,
+ /* Could also use
+ * SECCOMP_RET_KILL_THREAD, but this
+ * gives nicer error messages. */
+ SECCOMP_RET_ERRNO | ENOSYS);
+ struct sock_filter silentDeny = BPF_STMT(BPF_RET | BPF_K,
+ SECCOMP_RET_ERRNO | 0);
+
+ /* instructions to check for AF_INET or AF_INET6 in the first argument */
+ std::vector<struct sock_filter> allowInet;
+ seccompMatchu64(allowInet,
+ AF_INET,
+ {allow},
+ offsetof(struct seccomp_data, args[0]));
+ seccompMatchu64(allowInet,
+ AF_INET6,
+ {allow},
+ offsetof(struct seccomp_data, args[0]));
+ /* ... and deny otherwise */
+ std::vector<struct sock_filter> denyNonInet;
+ denyNonInet.insert(denyNonInet.begin(), allowInet.begin(), allowInet.end());
+ denyNonInet.push_back(deny);
+
+ /* ... and silent variant. */
+ std::vector<struct sock_filter> silentDenyNonInet;
+
+ silentDenyNonInet.insert(silentDenyNonInet.begin(), allowInet.begin(), allowInet.end());
+ silentDenyNonInet.push_back(silentDeny);
+
+ /* accumulator <-- data.arch */
+ out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))));
+ /* Deny if non-native arch. This simplifies checks as we can now just use
+ * the __NR_* syscall numbers. */
+ out.push_back(BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
+ AUDIT_ARCH_NATIVE,
+ 1,
+ 0));
+ out.push_back(deny);
+
+ std::vector<Uint32RangeAction> specialCaseActions;
+
+#ifdef __NR_socket
+ Uint32RangeAction socketAction;
+ socketAction.low = __NR_socket;
+ socketAction.high = __NR_socket;
+ socketAction.instructions = denyNonInet;
+ specialCaseActions.push_back(socketAction);
+#endif
+
+#ifdef __NR_socketpair
+ /* socketpair can be used to create unix sockets. Presumably they can't
+ * be re-bound or reconnected to use the abstract unix socket namespace,
+ * since they're already connected, but let's not risk it - slirp4netns
+ * shouldn't have a reason to use any IPC anyway. */
+ Uint32RangeAction socketpairAction;
+ socketpairAction.low = __NR_socketpair;
+ socketpairAction.high = __NR_socketpair;
+ /* The silent variant is necessary for socketpair because slirp4netns
+ unconditionally creates a unix socket pair, meant for handing a tapfd
+ out via setns, even though it has no need for one here since we pass it
+ the tapfd directly. It refuses to start if socketpair returns anything
+ other than 0, so we have no choice but to pretend the call succeeded.
+ The would-be-returned socket fds are never used. */
+ socketpairAction.instructions = silentDenyNonInet;
+ specialCaseActions.push_back(socketpairAction);
+#endif
+
+#ifdef __NR_socketcall
+ /* Some architectures include a system call "socketcall" that multiplexes
+ * all the socket-related calls. It only accepts two arguments: a number
+ * indicating which socket-related system call to invoke, and a pointer
+ * to an array holding the arguments for it. Seccomp can't inspect the
+ * contents of memory, only the raw bits passed to the kernel, so there's
+ * no way to disallow only certain invocations of a socket-related system
+ * call. In the past decade, most Linux architectures that relied on
+ * "socketcall" have added dedicated system calls (socket, socketpair,
+ * connect, etc.) that can be used instead, and it was mostly uncommon
+ * architectures that relied on it in the first place, so we should be
+ * fine to just block it outright. */
+ Uint32RangeAction socketcallAction;
+ socketcallAction.low = __NR_socketcall;
+ socketcallAction.high = __NR_socketcall;
+ socketcallAction.instructions = {deny};
+ specialCaseActions.push_back(socketcallAction);
+#endif
+
+ /* Kernels before 4.8 allow a process to bypass seccomp restrictions by
+ * spawning another process to ptrace it and modify a system call after
+ * the seccomp check. */
+ Uint32RangeAction ptraceAction;
+ ptraceAction.low = __NR_ptrace;
+ ptraceAction.high = __NR_ptrace;
+ ptraceAction.instructions = { deny };
+ specialCaseActions.push_back(ptraceAction);
+
+ std::vector<struct sock_filter> specialCases =
+ rangeActionsToFilter(specialCaseActions);
+
+ /* accumulator <-- data.nr */
+ out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))));
+
+ out.insert(out.end(), specialCases.begin(), specialCases.end());
+
+ /* accumulator <-- data.nr again */
+ out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))));
+
+ std::vector<Uint32RangeAction> pinnedSyscallRanges = NATIVE_SYSCALL_RANGES;
+ if(pinnedSyscallRanges.size() != 0) {
+ for(auto & i : pinnedSyscallRanges) {
+ i.instructions.push_back(allow);
+ }
+ std::vector<struct sock_filter> pinnedWhitelist = rangeActionsToFilter(pinnedSyscallRanges);
+ out.insert(out.end(), pinnedWhitelist.begin(), pinnedWhitelist.end());
+ out.push_back(deny);
+ }
+ else {
+ /* Couldn't determine pinned system calls, resort to allowing by
+ * default. */
+ out.push_back(allow);
+ }
+ return out;
+}
+
+
+#if DEBUG_SECCOMP_FILTER
+
+/* Note: this handles only the BPF subset we actually generate, makes
+ * various assumptions, and is not general-purpose. */
+static void writeSeccompFilterDot(std::vector<struct sock_filter> filter, FILE *f)
+{
+ fprintf(f, "digraph filter { \n");
+ for(size_t j = 0; j < filter.size(); j++) {
+ switch(BPF_CLASS(filter[j].code)) {
+ case BPF_LD:
+ fprintf(f, "\"%zu\" [label=\"load into accumulator from offset %u\"];\n",
+ j, filter[j].k);
+ fprintf(f, "\"%zu\" -> \"%zu\";\n", j, j + 1);
+ break;
+ case BPF_JMP:
+ switch(BPF_OP(filter[j].code)) {
+ case BPF_JA:
+ fprintf(f, "\"%zu\" [label=\"unconditional jump\"];\n", j);
+ fprintf(f, "\"%zu\" -> \"%zu\";\n", j, j + filter[j].k + 1);
+ break;
+ case BPF_JEQ:
+ fprintf(f, "\"%zu\" [label=\"jump if accumulator = %u\"];\n", j,
+ filter[j].k);
+ fprintf(f, "\"%zu\" -> \"%zu\" [label=\"true\"];\n", j,
+ j + filter[j].jt + 1);
+ fprintf(f, "\"%zu\" -> \"%zu\" [label=\"false\"];\n", j,
+ j + filter[j].jf + 1);
+ break;
+ case BPF_JGT:
+ fprintf(f, "\"%zu\" [label=\"jump if accumulator > %u\"];\n", j,
+ filter[j].k);
+ fprintf(f, "\"%zu\" -> \"%zu\" [label=\"true\"];\n", j,
+ j + filter[j].jt + 1);
+ fprintf(f, "\"%zu\" -> \"%zu\" [label=\"false\"];\n", j,
+ j + filter[j].jf + 1);
+ break;
+ case BPF_JGE:
+ fprintf(f, "\"%zu\" [label=\"jump if accumulator >= %u\"];\n", j,
+ filter[j].k);
+ fprintf(f, "\"%zu\" -> \"%zu\" [label=\"true\"];\n", j,
+ j + filter[j].jt + 1);
+ fprintf(f, "\"%zu\" -> \"%zu\" [label=\"false\"];\n", j,
+ j + filter[j].jf + 1);
+ break;
+ default:
+ fprintf(stderr, "unrecognized jump operation at %zu: %d\n", j, BPF_OP(filter[j].code));
+ }
+ break;
+ case BPF_RET:
+ switch(filter[j].k & SECCOMP_RET_ACTION_FULL) {
+ case SECCOMP_RET_KILL_PROCESS:
+ fprintf(f, "\"%zu\" [label=\"kill the process\"];\n", j);
+ break;
+ case SECCOMP_RET_KILL_THREAD:
+ fprintf(f, "\"%zu\" [label=\"kill the thread\"];\n", j);
+ break;
+ case SECCOMP_RET_ERRNO:
+ fprintf(f, "\"%zu\" [label=\"return errno for \\\"%s\\\"\"];\n",
+ j, strerror(filter[j].k & SECCOMP_RET_DATA));
+ break;
+ case SECCOMP_RET_ALLOW:
+ fprintf(f, "\"%zu\" [label=\"allow system call\"];\n", j);
+ break;
+ default:
+ fprintf(stderr, "unrecognized return operation at %zu: %d\n", j, filter[j].k);
+ break;
+ }
+ break;
+ default:
+ fprintf(stderr, "unrecognized bpf class at %zu: %d\n", j, BPF_CLASS(filter[j].code));
+ }
+ }
+ fprintf(f, "}\n");
+}
+
+#endif
+
/* Spawn 'slirp4netns' in separate namespaces as the given user and group;
'tapfd' must correspond to a /dev/net/tun connection. Configure it to
write to 'notifyReadyFD' once it's up and running. */
@@ -2016,6 +2230,11 @@ static pid_t spawnSlirp4netns(int tapfd, int notifyReadyFD,
slirpCtx.logFD = devNullFd;
}
+#if DEBUG_SECCOMP_FILTER
+ writeSeccompFilterDot(slirpCtx.seccompFilter, stderr);
+ fflush(stderr);
+#endif
+
addPhaseAfter(slirpCtx.phases,
"makeChrootSeparateFilesystem",
"prepareSlirpChroot",
diff --git a/nix/libutil/seccomp.cc b/nix/libutil/seccomp.cc
new file mode 100644
index 0000000000..585442d70b
--- /dev/null
+++ b/nix/libutil/seccomp.cc
@@ -0,0 +1,162 @@
+#if __linux__
+#include <util.hh>
+#include <seccomp.hh>
+#include <algorithm>
+
+namespace nix {
+
+struct FilterInstruction {
+ struct sock_filter instruction;
+ bool fallthroughJt = false;
+ bool fallthroughJf = false;
+ bool fallthroughK = false;
+};
+
+/* Note: instructions in "out" should have already verified that sysno is
+ * >= ranges[lowIndex].low. The value to compare against should already be
+ * in the accumulator. */
+static void
+rangeActionsToFilter(std::vector<Uint32RangeAction> & ranges,
+ size_t lowIndex, /* Inclusive */
+ size_t end, /* Exclusive */
+ std::vector<FilterInstruction> & out)
+{
+ if(lowIndex >= end) return;
+
+ if(end == lowIndex + 1) {
+ FilterInstruction branch;
+ Uint32RangeAction range = ranges.at(lowIndex);
+ branch.instruction = BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K,
+ range.high,
+ /* To be fixed up */
+ 0,
+ 0);
+ branch.fallthroughJt = true;
+ out.push_back(branch);
+ for(auto & i : range.instructions) {
+ FilterInstruction f;
+ f.instruction = i;
+ out.push_back(f);
+ }
+ FilterInstruction fallthroughBranch;
+ fallthroughBranch.instruction = BPF_JUMP(BPF_JMP | BPF_JA | BPF_K,
+ /* To be fixed up */
+ 0,
+ 0,
+ 0);
+ fallthroughBranch.fallthroughK = true;
+ out.push_back(fallthroughBranch);
+ return;
+ }
+
+ size_t middle = lowIndex + ((end - lowIndex) / 2);
+ Uint32RangeAction range = ranges.at(middle);
+ FilterInstruction branch;
+ size_t branchIndex = out.size();
+ branch.instruction = BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K,
+ range.low,
+ 0,
+ /* To be fixed up a little farther down */
+ 0);
+ out.push_back(branch);
+ rangeActionsToFilter(ranges, middle, end, out);
+ size_t elseIndex = out.size();
+ out[branchIndex].instruction.jf = (elseIndex - branchIndex - 1);
+ rangeActionsToFilter(ranges, lowIndex, middle, out);
+}
+
+
+static bool compareRanges(Uint32RangeAction a, Uint32RangeAction b)
+{
+ return (a.low < b.low);
+}
+
+
+/* Produce a loop-unrolled binary search of RANGES for the u32 currently in
+ * the accumulator. If the binary search finds a range that contains it, it
+ * will execute the corresponding instructions. If these instructions fall
+ * through, or if no containing range is found, control resumes after the last
+ * instruction in the returned sequence. */
+std::vector<struct sock_filter>
+rangeActionsToFilter(std::vector<Uint32RangeAction> & ranges)
+{
+ if(ranges.size() == 0) return {};
+ std::sort(ranges.begin(), ranges.end(), compareRanges);
+ if(ranges.size() > 1) {
+ for(auto & i : ranges)
+ if(i.low > i.high)
+ throw Error("Invalid range in rangeActionsToFilter");
+ for(size_t j = 1; j < ranges.size(); j++)
+ if(ranges[j].low <= ranges[j - 1].high)
+ throw Error("Overlapping ranges in rangeActionsToFilter");
+ }
+ std::vector<FilterInstruction> out;
+ Uint32RangeAction first = ranges.at(0);
+ FilterInstruction branch;
+ /* Verify accumulator value is >= first.low, to satisfy initial invariant */
+ branch.instruction = BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K,
+ first.low,
+ 0,
+ /* to be fixed up */
+ 0);
+ branch.fallthroughJf = true;
+ out.push_back(branch);
+ rangeActionsToFilter(ranges, 0, ranges.size(), out);
+ size_t fallthrough = out.size();
+ std::vector<struct sock_filter> out2;
+ for(size_t j = 0; j < out.size(); j++) {
+ if(out[j].fallthroughJt) out[j].instruction.jt = (fallthrough - j - 1);
+ if(out[j].fallthroughJf) out[j].instruction.jf = (fallthrough - j - 1);
+ if(out[j].fallthroughK) out[j].instruction.k = (fallthrough - j - 1);
+ out2.push_back(out[j].instruction);
+ }
+ return out2;
+}
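+
+/* For example (illustrative only): given the single range
+ * { low = 10, high = 20, instructions = { allow } }, with the value to test
+ * already in the accumulator, the generated sequence runs "allow" for
+ * values 10 through 20 and transfers control just past its own last
+ * instruction for everything else. */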
+
+
+/* If the uint64 at offset OFFSET has value VALUE, run INSTRUCTIONS.
+ * Otherwise, or if INSTRUCTIONS falls through, continue past the last
+ * instruction of OUT at the time seccompMatchu64 returns. Clobbers
+ * accumulator! */
+std::vector<struct sock_filter> seccompMatchu64(std::vector<struct sock_filter> & out,
+ uint64_t value,
+ std::vector<struct sock_filter> instructions,
+ uint32_t offset)
+{
+ /* Note: this only works where uint64_t is laid out entirely big- or
+ * little-endian and uint32_t uses the same byte order. */
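+ /* For example (illustrative only): on little-endian, matching AF_INET (2)
+ * in args[0] compares the word at offsetof(struct seccomp_data, args[0])
+ * against 2 and the word 4 bytes later against 0; on big-endian the two
+ * constants are swapped. A mismatch on either comparison jumps past
+ * INSTRUCTIONS. */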
+ /* Load lower-addressed 32 bits */
+ out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offset));
+ size_t jmp1Index = out.size();
+
+ out.push_back(BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
+#ifdef WORDS_BIGENDIAN
+ (uint32_t)((value >> 32) & 0xffffffff),
+#else
+ (uint32_t)(value & 0xffffffff),
+#endif
+ 0,
+ /* To be fixed up */
+ 0));
+ /* Load higher-addressed 32 bits */
+ out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offset + (uint32_t)sizeof(uint32_t)));
+ size_t jmp2Index = out.size();
+ out.push_back(BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
+#ifdef WORDS_BIGENDIAN
+ (uint32_t)(value & 0xffffffff),
+#else
+ (uint32_t)((value >> 32) & 0xffffffff),
+#endif
+ 0,
+ /* To be fixed up */
+ 0));
+
+ out.insert(out.end(), instructions.begin(), instructions.end());
+ out[jmp1Index].jf = (out.size() - jmp1Index - 1);
+ out[jmp2Index].jf = (out.size() - jmp2Index - 1);
+ return out;
+}
+
+}
+
+#endif
diff --git a/nix/libutil/seccomp.hh b/nix/libutil/seccomp.hh
new file mode 100644
index 0000000000..634dfad5f8
--- /dev/null
+++ b/nix/libutil/seccomp.hh
@@ -0,0 +1,222 @@
+#pragma once
+
+#include "util.hh"
+#include <linux/audit.h> /* For AUDIT_ARCH_* */
+#include <linux/seccomp.h>
+#include <linux/filter.h>
+
+
+/* This file provides two preprocessor macros (among other things):
+ 1. AUDIT_ARCH_NATIVE, which evaluates to whichever of the AUDIT_ARCH_*
+ values best represents the target system. Linux's internal headers have
+ a SECCOMP_ARCH_NATIVE since 2020, but it's not user-visible. Detection
+ of this is based on src/arch.c in libseccomp.
+ 2. NATIVE_SYSCALL_RANGES, an array initializer for an array of two-element
+ objects, the first of which is an integral number representing the
+ start (inclusive) of a range of valid syscall numbers, and the second
+ of which is an integral number representing the end (inclusive) of that
+ range of valid syscall numbers. The ranges provided are all
+ non-overlapping and strictly ascending (that is, the start of a range is
+ strictly higher than any of the numbers in any of the ranges that
+ precede it). All numbers involved fit into a long.
+
+ These ranges were generated from the various syscall.tbl,
+ syscall_32.tbl, and syscall_64.tbl files lying around in the linux
+ kernel source. Some were derived from
+ include/uapi/asm-generic/unistd.h. The kernel source used was commit
+ b3ee1e460951 of https://github.com/torvalds/linux.git, read on
+ 2025-04-23. Not all of the gaps in the files have any comments pointing
+ them out, so I recommend using build-aux/extract-syscall-ranges.sh for
+ the *.tbl files.
+
+ The intent behind saving these ranges is to be able to use a
+ default-allow seccomp policy that nevertheless disallows future
+ syscalls. This ensures that our security analysis can work with a
+ static, well-defined set of system calls that won't grow in the future
+ unless someone explicitly revisits the system call tables to consider
+ the implications of the new additions. */
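+
+/* For example, on plain (non-x32) x86_64 this header defines
+ AUDIT_ARCH_NATIVE as AUDIT_ARCH_X86_64 and NATIVE_SYSCALL_RANGES as
+ { {0, 335}, {424, 466} }; slirpSeccompFilter in libstore/build.cc expands
+ that initializer into a std::vector<Uint32RangeAction> and appends a
+ SECCOMP_RET_ALLOW return to each range's instructions. */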
+
+/* Both ends are inclusive. Some of the .tbl files use strange entries for
+ * the "abi" field; check arch/$ARCH/kernel/Makefile.syscalls to see what it
+ * specifies for syscall_abis_32 and syscall_abis_64 in addition to 32 or 64
+ * and "common" (added in Makefile.asm-headers). Also check what, if
+ * anything, the makefile uses as the --offset flag to syscallhdr.sh. And
+ * look at arch/$ARCH/include/uapi/asm/unistd.h to see what value the offset
+ * takes in what configurations. */
+
+#ifndef AUDIT_ARCH_NATIVE
+
+#if __i386__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_I386
+#define NATIVE_SYSCALL_RANGES { {0, 221}, {224, 250}, {252, 284}, {286, 386}, \
+ {393, 414}, {416, 466} }
+#elif __x86_64__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_X86_64
+#ifdef __ILP32__
+#include <asm/unistd.h>
+#define X32RANGE(low, high) { (low | __X32_SYSCALL_BIT), (high | __X32_SYSCALL_BIT) }
+#define NATIVE_SYSCALL_RANGES \
+ { X32RANGE(0, 12), X32RANGE(14, 14), X32RANGE(17, 18), X32RANGE(21, 44), X32RANGE(48, 53), \
+ X32RANGE(56, 58), X32RANGE(60, 100), X32RANGE(102, 126), X32RANGE(130, 130), \
+ X32RANGE(132, 133), X32RANGE(135, 155), X32RANGE(157, 173), X32RANGE(175, 176), \
+ X32RANGE(179, 179), X32RANGE(181, 204), X32RANGE(207, 208), X32RANGE(210, 210), \
+ X32RANGE(212, 213), X32RANGE(216, 221), X32RANGE(223, 235), X32RANGE(237, 243), \
+ X32RANGE(245, 245), X32RANGE(248, 272), X32RANGE(275, 277), X32RANGE(280, 294), \
+ X32RANGE(298, 298), X32RANGE(300, 306), X32RANGE(308, 309), X32RANGE(312, 321), \
+ X32RANGE(323, 326), X32RANGE(329, 335), X32RANGE(424, 466), X32RANGE(512, 547) }
+#else
+#define NATIVE_SYSCALL_RANGES { {0, 335}, {424, 466} }
+#endif
+#elif __arm__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_ARM
+/* Note: there are at present 6 extra ARM syscall numbers not listed in
+ arch/arm/tools/syscall.tbl, namely __ARM_NR_breakpoint through
+ __ARM_NR_get_tls. */
+#ifdef __ARM_EABI__
+#include <asm/unistd.h>
+#define NATIVE_SYSCALL_RANGES \
+ { {0, 6}, {8, 12}, {14, 16}, {19, 21}, {23, 24}, {26, 26}, {29, 29}, \
+ {33, 34}, {36, 43}, {45, 47}, {49, 52}, {54, 55}, {57, 57}, {60, 67}, \
+ {70, 75}, {77, 81}, {83, 83}, {85, 88}, {91, 97}, {99, 100}, {103, 108}, \
+ {111, 111}, {114, 116}, {118, 122}, {124, 126}, {128, 129}, {131, 136}, \
+ {138, 165}, {168, 187}, {190, 221}, {224, 253}, {256, 401}, {403, 414}, \
+ {416, 446}, {448, 466}, {(__ARM_NR_BASE + 1), (__ARM_NR_BASE + 6)} }
+#else
+#include <asm/unistd.h>
+#define OABIRANGE(low, high) { (low | __NR_OABI_SYSCALL_BASE), (high | __NR_OABI_SYSCALL_BASE) }
+#define NATIVE_SYSCALL_RANGES \
+ { OABIRANGE(0, 6), OABIRANGE(8, 16), OABIRANGE(19, 27), OABIRANGE(29, 30), \
+ OABIRANGE(33, 34), OABIRANGE(36, 43), OABIRANGE(45, 47), OABIRANGE(49, 52), \
+ OABIRANGE(54, 55), OABIRANGE(57, 57), OABIRANGE(60, 67), OABIRANGE(70, 83), \
+ OABIRANGE(85, 97), OABIRANGE(99, 100), OABIRANGE(102, 108), \
+ OABIRANGE(111, 111), OABIRANGE(113, 122), OABIRANGE(124, 126), \
+ OABIRANGE(128, 129), OABIRANGE(131, 136), OABIRANGE(138, 165), \
+ OABIRANGE(168, 187), OABIRANGE(190, 221), OABIRANGE(224, 253), \
+ OABIRANGE(256, 401), OABIRANGE(403, 414), OABIRANGE(416, 446), \
+ OABIRANGE(448, 466), {(__ARM_NR_BASE + 1), (__ARM_NR_BASE + 6)} }
+#endif
+#elif __aarch64__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_AARCH64
+/* extract-syscall-ranges.sh $LINUXSOURCE/arch/arm64/tools/syscall_64.tbl \
+ '64|renameat|rlimit|memfd_secret'
+
+ the extra ABIs are taken from arch/arm64/kernel/Makefile.syscalls and
+ scripts/Makefile.asm-headers */
+#define NATIVE_SYSCALL_RANGES { {0, 243}, {260, 294}, {424, 466} }
+/* To my knowledge there is no x32 equivalent for aarch64 in mainline linux */
+#elif __mips__ && _MIPS_SIM == _MIPS_SIM_ABI32
+/* o32 abi in both endianness cases */
+#include <asm/unistd.h>
+#define SYSRANGE(low, high) {(low) + __NR_Linux, (high) + __NR_Linux}
+#define NATIVE_SYSCALL_RANGES \
+ { SYSRANGE(0, 278), SYSRANGE(280, 368), SYSRANGE(393, 414), \
+ SYSRANGE(416, 446), SYSRANGE(448, 466) }
+#if __MIPSEB__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPS
+#elif __MIPSEL__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPSEL
+#endif
+#elif __mips__ && _MIPS_SIM == _MIPS_SIM_ABI64
+/* n64 abi in both endianness cases */
+#include <asm/unistd.h>
+#define SYSRANGE(low, high) {(low) + __NR_Linux, (high) + __NR_Linux}
+#define NATIVE_SYSCALL_RANGES \
+ { SYSRANGE(0, 237), SYSRANGE(239, 328), SYSRANGE(424, 446), SYSRANGE(448, 466) }
+#if __MIPSEB__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPS64
+#elif __MIPSEL__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPSEL64
+#endif /* _MIPS_SIM_ABI64 */
+#elif __mips__ && _MIPS_SIM == _MIPS_SIM_NABI32
+/* n32 abi in both endianness cases */
+#include <asm/unistd.h>
+#define SYSRANGE(low, high) {(low) + __NR_Linux, (high) + __NR_Linux}
+#define NATIVE_SYSCALL_RANGES \
+ { SYSRANGE(0, 241), SYSRANGE(243, 332), SYSRANGE(403, 414), \
+ SYSRANGE(416, 446), SYSRANGE(448, 466) }
+#if __MIPSEB__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPS64N32
+#elif __MIPSEL__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPSEL64N32
+#endif /* _MIPS_SIM_NABI32 */
+#elif __hppa64__ /* hppa64 must be checked before hppa */
+#define NATIVE_SYSCALL_RANGES \
+ { {0, 101}, {103, 126}, {128, 129}, {131, 136}, {138, 166}, \
+ {168, 168}, {170, 195}, {198, 202}, {206, 212}, {215, 219}, \
+ {222, 262}, {264, 302}, {304, 356}, {424, 446}, {448, 466} }
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PARISC64
+#elif __hppa__
+#define NATIVE_SYSCALL_RANGES \
+ { {0, 101}, {103, 126}, {128, 129}, {131, 136}, {138, 166}, \
+ {168, 168}, {170, 195}, {198, 202}, {206, 212}, {215, 219}, \
+ {222, 262}, {264, 302}, {304, 356}, {403, 414}, {416, 446}, {448, 466} }
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PARISC
+#elif __PPC64__
+#define NATIVE_SYSCALL_RANGES \
+ { {0, 191}, {198, 203}, {205, 223}, {225, 225}, {227, 253}, \
+ {255, 256}, {258, 365}, {378, 388}, {392, 402}, {424, 446}, {448, 466} }
+#ifdef __BIG_ENDIAN__
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PPC64
+#else
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PPC64LE
+#endif
+#elif __PPC__
+#define NATIVE_SYSCALL_RANGES \
+ { {0, 223}, {225, 256}, {258, 365}, {378, 388}, {393, 414}, \
+ {416, 446}, {448, 466} }
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PPC
+#elif __s390x__ /* s390x must be checked before s390 */
+#define NATIVE_SYSCALL_RANGES \
+ { {1, 12}, {14, 15}, {19, 22}, {26, 27}, {29, 30}, {33, 34}, \
+ {36, 43}, {45, 45}, {48, 48}, {51, 52}, {54, 55}, {57, 57}, \
+ {60, 67}, {72, 75}, {77, 79}, {83, 83}, {85, 94}, {96, 97}, \
+ {99, 100}, {102, 108}, {110, 112}, {114, 122}, {124, 137}, \
+ {141, 163}, {167, 169}, {172, 181}, {183, 191}, {198, 220}, \
+ {222, 222}, {224, 241}, {243, 262}, {265, 386}, {392, 402}, {424, 466} }
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_S390X
+#elif __s390__
+#define NATIVE_SYSCALL_RANGES \
+ { {1, 16}, {19, 27}, {29, 30}, {33, 34}, {36, 43}, {45, 52}, \
+ {54, 55}, {57, 57}, {60, 67}, {70, 81}, {83, 83}, {85, 97}, \
+ {99, 108}, {110, 112}, {114, 122}, {124, 165}, {167, 241}, \
+ {243, 262}, {264, 386}, {393, 414}, {416, 466} }
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_S390
+#elif __riscv && __riscv_xlen == 64
+#define NATIVE_SYSCALL_RANGES { {0, 37}, {39, 243}, {258, 294}, {424, 466} }
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_RISCV64
+#elif __riscv && __riscv_xlen == 32
+#define NATIVE_SYSCALL_RANGES \
+ { {0, 3}, {5, 37}, {39, 71}, {74, 78}, {81, 85}, {89, 97}, {99, 100}, \
+ {102, 107}, {109, 109}, {111, 111}, {116, 126}, {128, 136}, {138, 162}, \
+ {165, 168}, {172, 181}, {184, 191}, {193, 242}, {258, 259}, {261, 265}, \
+ {267, 291}, {293, 294}, {403, 414}, {416, 466} }
+#define AUDIT_ARCH_NATIVE AUDIT_ARCH_RISCV32
+#else
+#error cannot determine which AUDIT_ARCH_* value to use for AUDIT_ARCH_NATIVE
+#endif
+
+#else
+#ifndef NATIVE_SYSCALL_RANGES
+/* Fall back to default-allow if the user specified (with
+ -DAUDIT_ARCH_NATIVE=...) an arch but not NATIVE_SYSCALL_RANGES */
+#define NATIVE_SYSCALL_RANGES {}
+#endif
+#endif /* #ifndef AUDIT_ARCH_NATIVE */
+
+namespace nix {
+
+struct Uint32RangeAction {
+ uint32_t low; /* inclusive */
+ uint32_t high; /* inclusive */
+ std::vector<struct sock_filter> instructions;
+};
+
+std::vector<struct sock_filter> rangeActionsToFilter(std::vector<Uint32RangeAction> & ranges);
+
+
+std::vector<struct sock_filter>
+seccompMatchu64(std::vector<struct sock_filter> & out,
+ uint64_t value,
+ std::vector<struct sock_filter> instructions,
+ uint32_t offset);
+}
diff --git a/nix/libutil/spawn.cc b/nix/libutil/spawn.cc
index 93bab9f59e..414849b6f0 100644
--- a/nix/libutil/spawn.cc
+++ b/nix/libutil/spawn.cc
@@ -51,6 +51,8 @@
#ifdef __linux__
#include <sys/personality.h>
+#include <linux/seccomp.h>
+#include <linux/filter.h>
#endif
#if defined(SYS_pivot_root)
@@ -281,6 +283,36 @@ void setIDsAction(SpawnContext & ctx)
throw SysError("setuid failed");
}
+void setNoNewPrivsAction(SpawnContext & ctx)
+{
+ if(ctx.setNoNewPrivs)
+#if __linux__ && defined(PR_SET_NO_NEW_PRIVS)
+ if(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
+ throw SysError("setting PR_SET_NO_NEW_PRIVS");
+#else
+ throw Error("setting PR_SET_NO_NEW_PRIVS not supported on this system");
+#endif
+}
+
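+/* Note: unless the process has CAP_SYS_ADMIN in its user namespace, the
+ * kernel refuses SECCOMP_MODE_FILTER until no_new_privs has been set, which
+ * is why the "setNoNewPrivs" phase is ordered before "addSeccompFilter" in
+ * the phase lists below. */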
+void addSeccompFilterAction(SpawnContext & ctx)
+{
+ if(ctx.addSeccompFilter) {
+#if __linux__ && defined(PR_SET_SECCOMP) && defined(SECCOMP_MODE_FILTER)
+ /* We use no extra functionality from the seccomp system call, so
+ * just use prctl. */
+ if(ctx.seccompFilter.size() > USHRT_MAX)
+ throw Error("seccomp filter too large");
+ struct sock_fprog prog;
+ prog.len = (unsigned short) ctx.seccompFilter.size();
+ prog.filter = ctx.seccompFilter.data();
+ if(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1)
+ throw SysError("installing seccomp filter");
+#else
+ throw Error("setting seccomp filter not supported on this system");
+#endif
+ }
+}
+
void restoreSIGPIPEAction(SpawnContext & ctx)
{
@@ -336,6 +368,8 @@ Phases getBasicSpawnPhases()
{ "setPersonality", setPersonalityAction },
{ "oomSacrifice", oomSacrificeAction },
{ "setIDs", setIDsAction },
+ { "setNoNewPrivs", setNoNewPrivsAction },
+ { "addSeccompFilter", addSeccompFilterAction },
{ "restoreSIGPIPE", restoreSIGPIPEAction },
{ "setupSuccess", setupSuccessAction },
{ "exec", execAction } };
@@ -773,6 +807,8 @@ Phases getCloneSpawnPhases()
CloneSpawnContext.lockMountsMapAll = true. */
{ "lockMounts", lockMountsAction },
{ "setIDs", setIDsAction },
+ { "setNoNewPrivs", setNoNewPrivsAction },
+ { "addSeccompFilter", addSeccompFilterAction },
{ "restoreSIGPIPE", restoreSIGPIPEAction },
{ "setupSuccess", setupSuccessAction },
{ "exec", execAction }};
diff --git a/nix/libutil/spawn.hh b/nix/libutil/spawn.hh
index edc528312d..5e75bcfb09 100644
--- a/nix/libutil/spawn.hh
+++ b/nix/libutil/spawn.hh
@@ -3,6 +3,9 @@
#include <util.hh>
#include <map>
#include <stddef.h>
+#ifdef __linux__
+#include <linux/filter.h>
+#endif
namespace nix {
struct SpawnContext; /* Forward declaration */
@@ -57,6 +60,11 @@ struct SpawnContext {
bool dropAmbientCapabilities = false; /* Whether to drop ambient
* capabilities if on a system that
* supports them. */
+ bool setNoNewPrivs = false;
+ bool addSeccompFilter = false;
+#if __linux__
+ std::vector<struct sock_filter> seccompFilter;
+#endif
bool doChroot = false;
Path chrootRootDir;
void * extraData; /* Extra user data */
@@ -118,6 +126,8 @@ Action closeMostFDsAction;
Action setPersonalityAction;
Action oomSacrificeAction;
Action setIDsAction;
+Action setNoNewPrivsAction;
+Action addSeccompFilterAction;
Action restoreSIGPIPEAction;
Action setupSuccessAction;
Action execAction;
diff --git a/nix/local.mk b/nix/local.mk
index 9f21550af2..7c1b81e9a6 100644
--- a/nix/local.mk
+++ b/nix/local.mk
@@ -57,7 +57,8 @@ libutil_a_SOURCES = \
%D%/libutil/serialise.cc \
%D%/libutil/util.cc \
%D%/libutil/hash.cc \
- %D%/libutil/spawn.cc
+ %D%/libutil/spawn.cc \
+ %D%/libutil/seccomp.cc
libutil_headers = \
%D%/libutil/affinity.hh \
@@ -66,7 +67,8 @@ libutil_headers = \
%D%/libutil/util.hh \
%D%/libutil/archive.hh \
%D%/libutil/types.hh \
- %D%/libutil/spawn.hh
+ %D%/libutil/spawn.hh \
+ %D%/libutil/seccomp.hh
libutil_a_CPPFLAGS = \
-I$(top_builddir)/nix \