diff options
-rw-r--r-- | Makefile.am | 1 | ||||
-rwxr-xr-x | build-aux/extract-syscall-ranges.sh | 80 | ||||
-rw-r--r-- | config-daemon.ac | 4 | ||||
-rw-r--r-- | nix/libstore/build.cc | 219 | ||||
-rw-r--r-- | nix/libutil/seccomp.cc | 162 | ||||
-rw-r--r-- | nix/libutil/seccomp.hh | 222 | ||||
-rw-r--r-- | nix/libutil/spawn.cc | 36 | ||||
-rw-r--r-- | nix/libutil/spawn.hh | 10 | ||||
-rw-r--r-- | nix/local.mk | 6 |
9 files changed, 738 insertions, 2 deletions
diff --git a/Makefile.am b/Makefile.am index a4737fe9d5..8b33734f38 100644 --- a/Makefile.am +++ b/Makefile.am @@ -746,6 +746,7 @@ EXTRA_DIST += \ build-aux/compile-as-derivation.scm \ build-aux/config.rpath \ build-aux/convert-xref.scm \ + build-aux/extract-syscall-ranges.sh \ build-aux/generate-authors.scm \ build-aux/git-version-gen \ build-aux/mdate-from-git.scm \ diff --git a/build-aux/extract-syscall-ranges.sh b/build-aux/extract-syscall-ranges.sh new file mode 100755 index 0000000000..3826fc25fe --- /dev/null +++ b/build-aux/extract-syscall-ranges.sh @@ -0,0 +1,80 @@ +#!/bin/sh + +if test "$#" -lt 1 || test "$#" -gt 2 +then + echo "Usage: extract-syscall-ranges.sh FILENAME [abiname_regex]" + exit 1 +fi + +numbers_to_ranges() +{ + if ! read number + then + printf '{}\n' + return + fi + low="$number" + high="$number" + while true + do + if read number + then + if test "$number" -eq "$((high + 1))" + then + high="$number" + else + break + fi + else + printf '{ {%d, %d} }\n' "$low" "$high" + return + fi + done + printf '{ {%d, %d}' "$low" "$high" + low="$number" + high="$number" + while true + do + if read number + then + if test "$number" -eq "$((high + 1))" + then + high="$number" + else + printf ', {%d, %d}' "$low" "$high" + low="$number" + high="$number" + fi + else + printf ', {%d, %d} }\n' "$low" "$high" + return + fi + done +} + +if test "$#" -eq 2 +then + abi_regex="$2" + getnumbers() + { + # delete comment lines and space-only lines + sed -e '/^[[:space:]]*#/d ; /^[[:space:]]*$/d' | + # filter to only include lines with target abi or "common" + grep -E "^[0-9]+[[:space:]]+(common|(${abi_regex}))[[:space:]]" | + # limit to only syscall number + sed -e 's/\([0-9]\+\).*/\1/g' + } +else + getnumbers() + { + # delete comment lines and space-only lines and limit to syscall number + sed -e '/^[[:space:]]*#/d ; /^[[:space:]]*$/d ; s/\([0-9]\+\).*/\1/g' + } +fi + +getnumbers < "$1" | + sort -n | + uniq | # Yes, there are duplicate syscall entries... 
+ numbers_to_ranges + + diff --git a/config-daemon.ac b/config-daemon.ac index fe73b893ec..2929664140 100644 --- a/config-daemon.ac +++ b/config-daemon.ac @@ -152,6 +152,10 @@ if test "x$guix_build_daemon" = "xyes"; then AC_PATH_PROG([SLIRP4NETNS], [slirp4netns], [slirp4netns]) AC_DEFINE_UNQUOTED([SLIRP4NETNS], ["$SLIRP4NETNS"], [Path to the slirp4netns program, if any.]) + + dnl needed for inspecting 64-bit system call arguments in seccomp's Berkeley + dnl Packet Filter VM, which only directly operates on 32-bit words. + AC_C_BIGENDIAN fi AM_CONDITIONAL([HAVE_LIBBZ2], [test "x$HAVE_LIBBZ2" = "xyes"]) diff --git a/nix/libstore/build.cc b/nix/libstore/build.cc index 1a688f3b56..eee3a33a58 100644 --- a/nix/libstore/build.cc +++ b/nix/libstore/build.cc @@ -85,6 +85,13 @@ /* This header isn't documented in 'man netdevice', but there doesn't seem to be any other way to get 'struct in6_ifreq'... */ #include <linux/ipv6.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <seccomp.hh> + +/* Set to 1 to debug the seccomp filter. */ +#define DEBUG_SECCOMP_FILTER 0 + #endif #endif @@ -1815,6 +1822,7 @@ static void setupTap(int send_fd_socket, bool ipv6Enabled) sendFD(send_fd_socket, tapfd); } + struct ChrootBuildSpawnContext : CloneSpawnContext { bool ipv6Enabled = false; }; @@ -1933,6 +1941,212 @@ static void remapIdsTo0Action(SpawnContext & sctx) } +static std::vector<struct sock_filter> slirpSeccompFilter() +{ + std::vector<struct sock_filter> out; + struct sock_filter allow = BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW); + struct sock_filter deny = BPF_STMT(BPF_RET | BPF_K, + /* Could also use + * SECCOMP_RET_KILL_THREAD, but this + * gives nicer error messages. 
*/ + SECCOMP_RET_ERRNO | ENOSYS); + struct sock_filter silentDeny = BPF_STMT(BPF_RET | BPF_K, + SECCOMP_RET_ERRNO | 0); + + /* instructions to check for AF_INET or AF_INET6 in the first argument */ + std::vector<struct sock_filter> allowInet; + seccompMatchu64(allowInet, + AF_INET, + {allow}, + offsetof(struct seccomp_data, args[0])); + seccompMatchu64(allowInet, + AF_INET6, + {allow}, + offsetof(struct seccomp_data, args[0])); + /* ... and deny otherwise */ + std::vector<struct sock_filter> denyNonInet; + denyNonInet.insert(denyNonInet.begin(), allowInet.begin(), allowInet.end()); + denyNonInet.push_back(deny); + + /* ... and silent variant. */ + std::vector<struct sock_filter> silentDenyNonInet; + + silentDenyNonInet.insert(silentDenyNonInet.begin(), allowInet.begin(), allowInet.end()); + silentDenyNonInet.push_back(silentDeny); + + /* accumulator <-- data.arch */ + out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch)))); + /* Deny if non-native arch. This simplifies checks as we can now just use + * the __NR_* syscall numbers. */ + out.push_back(BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, + AUDIT_ARCH_NATIVE, + 1, + 0)); + out.push_back(deny); + + std::vector<Uint32RangeAction> specialCaseActions; + +#ifdef __NR_socket + Uint32RangeAction socketAction; + socketAction.low = __NR_socket; + socketAction.high = __NR_socket; + socketAction.instructions = denyNonInet; + specialCaseActions.push_back(socketAction); +#endif + +#ifdef __NR_socketpair + /* socketpair can be used to create unix sockets. Presumably they can't + * be re-bound or reconnected to use the abstract unix socket namespace, + * since they're already connected, but let's not risk it - slirp4netns + * shouldn't have a reason to use any IPC anyway. 
*/ + Uint32RangeAction socketpairAction; + socketpairAction.low = __NR_socketpair; + socketpairAction.high = __NR_socketpair; + /* The silent variant is necessary for socketpair because slirp4netns + unconditionally creates a unix socket using socketpair for using setns + to exfiltrate a tapfd, despite not actually needing to do that at all + since we pass it the tapfd directly. It will refuse to start if + socketpair returns anything but 0, so we have no choice but to do that. + The would-be-returned socket fds are never used. */ + socketpairAction.instructions = silentDenyNonInet; + specialCaseActions.push_back(socketpairAction); +#endif + +#ifdef __NR_socketcall + /* Some architectures include a system call "socketcall" for multiplexing + * all the socket-related calls. This system call only accepts two + * arguments: a number to indicate which socket-related system call to + * invoke, and a pointer to an array holding the arguments for it. + * Seccomp can't inspect the contents of memory, only the raw bits passed + * to the kernel, so there's no way to only disallow certain invocations + * of a socket-related system call. In the past decade, most linux + * architectures which relied on "socketcall" have since added dedicated + * system calls (socket, socketpair, connect, etc) that can be used + * instead of socketcall, and it was mostly uncommon architectures that + * relied on it in the first place, so we should be fine to just block it + * outright. */ + Uint32RangeAction socketcallAction; + socketcallAction.low = __NR_socketcall; + socketcallAction.high = __NR_socketcall; + socketcallAction.instructions = {deny}; + specialCaseActions.push_back(socketcallAction); +#endif + + /* Kernels before 4.8 allow a process to bypass seccomp restrictions by + * spawning another process to ptrace it and modify a system call after + * the seccomp check. 
*/ + Uint32RangeAction ptraceAction; + ptraceAction.low = __NR_ptrace; + ptraceAction.high = __NR_ptrace; + ptraceAction.instructions = { deny }; + specialCaseActions.push_back(ptraceAction); + + std::vector<struct sock_filter> specialCases = + rangeActionsToFilter(specialCaseActions); + + /* accumulator <-- data.nr */ + out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr)))); + + out.insert(out.end(), specialCases.begin(), specialCases.end()); + + /* accumulator <-- data.nr again */ + out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr)))); + + std::vector<Uint32RangeAction> pinnedSyscallRanges = NATIVE_SYSCALL_RANGES; + if(pinnedSyscallRanges.size() != 0) { + for(auto & i : pinnedSyscallRanges) { + i.instructions.push_back(allow); + } + std::vector<struct sock_filter> pinnedWhitelist = rangeActionsToFilter(pinnedSyscallRanges); + out.insert(out.end(), pinnedWhitelist.begin(), pinnedWhitelist.end()); + out.push_back(deny); + } + else { + /* Couldn't determine pinned system calls, resort to allowing by + * default. */ + out.push_back(allow); + } + return out; +} + + +#if DEBUG_SECCOMP_FILTER + +/* Note: limited to only the subset we actually use, makes various + * assumptions, not general-purpose. 
*/ +static void writeSeccompFilterDot(std::vector<struct sock_filter> filter, FILE *f) +{ + fprintf(f, "digraph filter { \n"); + for(size_t j = 0; j < filter.size(); j++) { + switch(BPF_CLASS(filter[j].code)) { + case BPF_LD: + fprintf(f, "\"%zu\" [label=\"load into accumulator from offset %u\"];\n", + j, filter[j].k); + fprintf(f, "\"%zu\" -> \"%zu\";\n", j, j + 1); + break; + case BPF_JMP: + switch(BPF_OP(filter[j].code)) { + case BPF_JA: + fprintf(f, "\"%zu\" [label=\"unconditional jump\"];\n", j); + fprintf(f, "\"%zu\" -> \"%zu\";\n", j, j + filter[j].k + 1); + break; + case BPF_JEQ: + fprintf(f, "\"%zu\" [label=\"jump if accumulator = %u\"];\n", j, + filter[j].k); + fprintf(f, "\"%zu\" -> \"%zu\" [label=\"true\"];\n", j, + j + filter[j].jt + 1); + fprintf(f, "\"%zu\" -> \"%zu\" [label=\"false\"];\n", j, + j + filter[j].jf + 1); + break; + case BPF_JGT: + fprintf(f, "\"%zu\" [label=\"jump if accumulator > %u\"];\n", j, + filter[j].k); + fprintf(f, "\"%zu\" -> \"%zu\" [label=\"true\"];\n", j, + j + filter[j].jt + 1); + fprintf(f, "\"%zu\" -> \"%zu\" [label=\"false\"];\n", j, + j + filter[j].jf + 1); + break; + case BPF_JGE: + fprintf(f, "\"%zu\" [label=\"jump if accumulator >= %u\"];\n", j, + filter[j].k); + fprintf(f, "\"%zu\" -> \"%zu\" [label=\"true\"];\n", j, + j + filter[j].jt + 1); + fprintf(f, "\"%zu\" -> \"%zu\" [label=\"false\"];\n", j, + j + filter[j].jf + 1); + break; + default: + fprintf(stderr, "unrecognized jump operation at %zu: %d\n", j, BPF_OP(filter[j].code)); + } + break; + case BPF_RET: + switch(filter[j].k & SECCOMP_RET_ACTION_FULL) { + case SECCOMP_RET_KILL_PROCESS: + fprintf(f, "\"%zu\" [label=\"kill the process\"];\n", j); + break; + case SECCOMP_RET_KILL_THREAD: + fprintf(f, "\"%zu\" [label=\"kill the thread\"];\n", j); + break; + case SECCOMP_RET_ERRNO: + fprintf(f, "\"%zu\" [label=\"return errno for \\\"%s\\\"\"];\n", + j, strerror(filter[j].k & SECCOMP_RET_DATA)); + break; + case SECCOMP_RET_ALLOW: + fprintf(f, "\"%zu\" [label=\"allow 
system call\"];\n", j); + break; + default: + fprintf(stderr, "unrecognized return operation at %zu: %d\n", j, filter[j].k); + break; + } + break; + default: + fprintf(stderr, "unrecognized bpf class at %zu: %d\n", j, BPF_CLASS(filter[j].code)); + } + } + fprintf(f, "}\n"); +} + +#endif + /* Spawn 'slirp4netns' in separate namespaces as the given user and group; 'tapfd' must correspond to a /dev/net/tun connection. Configure it to write to 'notifyReadyFD' once it's up and running. */ @@ -2016,6 +2230,11 @@ static pid_t spawnSlirp4netns(int tapfd, int notifyReadyFD, slirpCtx.logFD = devNullFd; } +#if DEBUG_SECCOMP_FILTER + writeSeccompFilterDot(slirpCtx.seccompFilter, stderr); + fflush(stderr); +#endif + addPhaseAfter(slirpCtx.phases, "makeChrootSeparateFilesystem", "prepareSlirpChroot", diff --git a/nix/libutil/seccomp.cc b/nix/libutil/seccomp.cc new file mode 100644 index 0000000000..585442d70b --- /dev/null +++ b/nix/libutil/seccomp.cc @@ -0,0 +1,162 @@ +#if __linux__ +#include <util.hh> +#include <seccomp.hh> +#include <algorithm> + +namespace nix { + +struct FilterInstruction { + struct sock_filter instruction; + bool fallthroughJt = false; + bool fallthroughJf = false; + bool fallthroughK = false; +}; + +/* Note: instructions in "out" should have already verified that sysno is + * >= ranges[lowIndex].low. The value to compare against should already be + * in the accumulator. 
*/ +static void +rangeActionsToFilter(std::vector<Uint32RangeAction> & ranges, + size_t lowIndex, /* Inclusive */ + size_t end, /* Exclusive */ + std::vector<FilterInstruction> & out) +{ + if(lowIndex >= end) return; + + if(end == lowIndex + 1) { + FilterInstruction branch; + Uint32RangeAction range = ranges.at(lowIndex); + branch.instruction = BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, + range.high, + /* To be fixed up */ + 0, + 0); + branch.fallthroughJt = true; + out.push_back(branch); + for(auto & i : range.instructions) { + FilterInstruction f; + f.instruction = i; + out.push_back(f); + } + FilterInstruction fallthroughBranch; + fallthroughBranch.instruction = BPF_JUMP(BPF_JMP | BPF_JA | BPF_K, + /* To be fixed up */ + 0, + 0, + 0); + fallthroughBranch.fallthroughK = true; + out.push_back(fallthroughBranch); + return; + } + + size_t middle = lowIndex + ((end - lowIndex) / 2); + Uint32RangeAction range = ranges.at(middle); + FilterInstruction branch; + size_t branchIndex = out.size(); + branch.instruction = BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, + range.low, + 0, + /* To be fixed up a little farther down */ + 0); + out.push_back(branch); + rangeActionsToFilter(ranges, middle, end, out); + size_t elseIndex = out.size(); + out[branchIndex].instruction.jf = (elseIndex - branchIndex - 1); + rangeActionsToFilter(ranges, lowIndex, middle, out); +} + + +static bool compareRanges(Uint32RangeAction a, Uint32RangeAction b) +{ + return (a.low < b.low); +} + + +/* Produce a loop-unrolled binary search of RANGES for the u32 currently in + * the accumulator. If the binary search finds a range that contains it, it + * will execute the corresponding instructions. If these instructions fall + * through, or if no containing range is found, control resumes after the last + * instruction in the returned sequence. 
*/ +std::vector<struct sock_filter> +rangeActionsToFilter(std::vector<Uint32RangeAction> & ranges) +{ + if(ranges.size() == 0) return {}; + std::sort(ranges.begin(), ranges.end(), compareRanges); + /* Validate every range, even when there is only one: an inverted range + * (low > high) could never match and would be silently ignored. */ + for(auto & i : ranges) + if(i.low > i.high) + throw Error("Invalid range in rangeActionsToFilter"); + /* RANGES is now sorted by 'low', so comparing neighbours detects overlap. */ + for(size_t j = 1; j < ranges.size(); j++) + if(ranges[j].low <= ranges[j - 1].high) + throw Error("Overlapping ranges in rangeActionsToFilter"); + std::vector<FilterInstruction> out; + Uint32RangeAction first = ranges.at(0); + FilterInstruction branch; + /* Verify accumulator value is >= first.low, to satisfy initial invariant */ + branch.instruction = BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, + first.low, + 0, + /* to be fixed up */ + 0); + branch.fallthroughJf = true; + out.push_back(branch); + rangeActionsToFilter(ranges, 0, ranges.size(), out); + size_t fallthrough = out.size(); + std::vector<struct sock_filter> out2; + /* Resolve every "jump to fall-through" placeholder now that the total + * length of the generated sequence is known. */ + for(size_t j = 0; j < out.size(); j++) { + size_t distance = fallthrough - j - 1; + /* jt and jf are 8-bit fields in struct sock_filter; refuse to emit a + * silently truncated jump. k is 32 bits wide and needs no check. */ + if((out[j].fallthroughJt || out[j].fallthroughJf) && distance > 255) + throw Error("Filter too large for conditional jump in rangeActionsToFilter"); + if(out[j].fallthroughJt) out[j].instruction.jt = distance; + if(out[j].fallthroughJf) out[j].instruction.jf = distance; + if(out[j].fallthroughK) out[j].instruction.k = distance; + out2.push_back(out[j].instruction); + } + return out2; +} + + +/* If the uint64 at offset OFFSET has value VALUE, run INSTRUCTIONS. + * Otherwise, or if INSTRUCTIONS falls through, continue past the last + * instruction of OUT at the time seccompMatchu64 returns. Clobbers + * accumulator! */ +std::vector<struct sock_filter> seccompMatchu64(std::vector<struct sock_filter> & out, + uint64_t value, + std::vector<struct sock_filter> instructions, + uint32_t offset) +{ + /* Note: this only works where the order of bytes in uint64 is big or + * little endian, and the same order holds for uint32. 
*/ + /* Load lower-addressed 32 bits */ + out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offset)); + size_t jmp1Index = out.size(); + + out.push_back(BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, +#ifdef WORDS_BIGENDIAN + (uint32_t)((value >> 32) & 0xffffffff), +#else + (uint32_t)(value & 0xffffffff), +#endif + 0, + /* To be fixed up */ + 0)); + /* Load higher-addressed 32 bits */ + out.push_back(BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offset + (uint32_t)sizeof(uint32_t))); + size_t jmp2Index = out.size(); + out.push_back(BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, +#ifdef WORDS_BIGENDIAN + (uint32_t)(value & 0xffffffff), +#else + (uint32_t)((value >> 32) & 0xffffffff), +#endif + 0, + /* To be fixed up */ + 0)); + + out.insert(out.end(), instructions.begin(), instructions.end()); + out[jmp1Index].jf = (out.size() - jmp1Index - 1); + out[jmp2Index].jf = (out.size() - jmp2Index - 1); + return out; +} + +} + +#endif diff --git a/nix/libutil/seccomp.hh b/nix/libutil/seccomp.hh new file mode 100644 index 0000000000..634dfad5f8 --- /dev/null +++ b/nix/libutil/seccomp.hh @@ -0,0 +1,222 @@ +#pragma once + +#include "util.hh" +#include <linux/audit.h> /* For AUDIT_ARCH_* */ +#include <linux/seccomp.h> +#include <linux/filter.h> + + +/* This file provides two preprocessor macros (among other things): + 1. AUDIT_ARCH_NATIVE, which evaluates to whichever of the AUDIT_ARCH_* + values best represents the target system. Linux's internal headers have + a SECCOMP_ARCH_NATIVE since 2020, but it's not user-visible. Detection + of this is based on src/arch.c in libseccomp. + 2. NATIVE_SYSCALL_RANGES, an array initializer for an array of two-element + objects, the first of which is an integral number representing the + start (inclusive) of a range of valid syscall numbers, and the second + of which is an integral number representing the end (inclusive) of that + range of valid syscall numbers. 
The ranges provided are all + non-overlapping and strictly ascending (that is, the start of a range is + strictly higher than any of the numbers in any of the ranges that + precede it). All numbers involved fit into a long. + + These ranges were generated from the various syscall.tbl, + syscall_32.tbl, and syscall_64.tbl files lying around in the linux + kernel source. Some were derived from + include/uapi/asm-generic/unistd.h. The kernel source used was commit + b3ee1e460951 of https://github.com/torvalds/linux.git, read on + 2025-04-23. Not all of the gaps in the files have any comments pointing + them out, so I recommend using build-aux/extract-syscall-ranges.sh for + the *.tbl files. + + The intent behind saving these ranges is to be able to use a + default-allow seccomp policy that nevertheless disallows future + syscalls. This ensures that our security analysis can work with a + static, well-defined set of system calls that won't grow in the future + unless someone explicitly revisits the system call tables to consider + the implications of the new additions. */ + +/* Both ends are inclusive. Some of the .tbl files use strange entries for + * the "abi" field, check arch/$ARCH/kernel/Makefile.syscalls to see what it + * specifies for syscall_abis_32 and syscall_abis_64 in addition to 32 or 64 + * and "common" (added in Makefile.asm-headers). Also check what, if + * anything, the makefile uses as the --offset flag to syscallhdr.sh. And + * look at arch/$ARCH/include/uapi/asm/unistd.h to see what value the offset + * takes in what configurations. 
*/ + +#ifndef AUDIT_ARCH_NATIVE + +#if __i386__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_I386 +#define NATIVE_SYSCALL_RANGES { {0, 221}, {224, 250}, {252, 284}, {286, 386}, \ + {393, 414}, {416, 466} } +#elif __x86_64__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_X86_64 +#ifdef __ILP32__ +#include <asm/unistd.h> +#define X32RANGE(low, high) { (low | __X32_SYSCALL_BIT), (high | __X32_SYSCALL_BIT) } +#define NATIVE_SYSCALL_RANGES \ + { X32RANGE(0, 12), X32RANGE(14, 14), X32RANGE(17, 18), X32RANGE(21, 44), X32RANGE(48, 53), \ + X32RANGE(56, 58), X32RANGE(60, 100), X32RANGE(102, 126), X32RANGE(130, 130), \ + X32RANGE(132, 133), X32RANGE(135, 155), X32RANGE(157, 173), X32RANGE(175, 176), \ + X32RANGE(179, 179), X32RANGE(181, 204), X32RANGE(207, 208), X32RANGE(210, 210), \ + X32RANGE(212, 213), X32RANGE(216, 221), X32RANGE(223, 235), X32RANGE(237, 243), \ + X32RANGE(245, 245), X32RANGE(248, 272), X32RANGE(275, 277), X32RANGE(280, 294), \ + X32RANGE(298, 298), X32RANGE(300, 306), X32RANGE(308, 309), X32RANGE(312, 321), \ + X32RANGE(323, 326), X32RANGE(329, 335), X32RANGE(424, 466), X32RANGE(512, 547) } +#else +#define NATIVE_SYSCALL_RANGES { {0, 335}, {424, 466} } +#endif +#elif __arm__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_ARM +/* Note: there are at present 6 extra ARM syscall numbers not listed in + arch/arm/tools/syscall.tbl, namely __ARM_NR_breakpoint through + __ARM_NR_get_tls. 
*/ +#ifdef __ARM_EABI__ +#include <asm/unistd.h> +#define NATIVE_SYSCALL_RANGES \ + { {0, 6}, {8, 12}, {14, 16}, {19, 21}, {23, 24}, {26, 26}, {29, 29}, \ + {33, 34}, {36, 43}, {45, 47}, {49, 52}, {54, 55}, {57, 57}, {60, 67}, \ + {70, 75}, {77, 81}, {83, 83}, {85, 88}, {91, 97}, {99, 100}, {103, 108}, \ + {111, 111}, {114, 116}, {118, 122}, {124, 126}, {128, 129}, {131, 136}, \ + {138, 165}, {168, 187}, {190, 221}, {224, 253}, {256, 401}, {403, 414}, \ + {416, 446}, {448, 466}, {(__ARM_NR_BASE + 1), (__ARM_NR_BASE + 6)} } +#else +#include <asm/unistd.h> +#define OABIRANGE(low, high) { (low | __NR_OABI_SYSCALL_BASE), (high | __NR_OABI_SYSCALL_BASE) } +#define NATIVE_SYSCALL_RANGES \ + { OABIRANGE(0, 6), OABIRANGE(8, 16), OABIRANGE(19, 27), OABIRANGE(29, 30), \ + OABIRANGE(33, 34), OABIRANGE(36, 43), OABIRANGE(45, 47), OABIRANGE(49, 52), \ + OABIRANGE(54, 55), OABIRANGE(57, 57), OABIRANGE(60, 67), OABIRANGE(70, 83), \ + OABIRANGE(85, 97), OABIRANGE(99, 100), OABIRANGE(102, 108), \ + OABIRANGE(111, 111), OABIRANGE(113, 122), OABIRANGE(124, 126), \ + OABIRANGE(128, 129), OABIRANGE(131, 136), OABIRANGE(138, 165), \ + OABIRANGE(168, 187), OABIRANGE(190, 221), OABIRANGE(224, 253), \ + OABIRANGE(256, 401), OABIRANGE(403, 414), OABIRANGE(416, 446), \ + OABIRANGE(448, 466), {(__ARM_NR_BASE + 1), (__ARM_NR_BASE + 6)} } +#endif +#elif __aarch64__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_AARCH64 +/* extract-syscall-ranges.sh $LINUXSOURCE/arch/arm64/tools/syscall_64.tbl \ + '64|renameat|rlimit|memfd_secret' + + the extra ABIs are taken from arch/arm64/kernel/Makefile.syscalls and + scripts/Makefile.asm-headers */ +#define NATIVE_SYSCALL_RANGES { {0, 243}, {260, 294}, {424, 466} } +/* To my knowledge there is no x32 equivalent for aarch64 in mainline linux */ +#elif __mips__ && _MIPS_SIM == _MIPS_SIM_ABI32 +/* o32 abi in both endianness cases */ +#include <asm/unistd.h> +#define SYSRANGE(low, high) {(low) + __NR_Linux, (high) + __NR_Linux} +#define NATIVE_SYSCALL_RANGES \ + { 
SYSRANGE(0, 278), SYSRANGE(280, 368), SYSRANGE(393, 414), \ + SYSRANGE(416, 446), SYSRANGE(448, 466) } +#if __MIPSEB__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPS +#elif __MIPSEL__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPSEL +#endif +#elif __mips__ && _MIPS_SIM == _MIPS_SIM_ABI64 +/* n64 abi in both endianness cases */ +#include <asm/unistd.h> +#define SYSRANGE(low, high) {(low) + __NR_Linux, (high) + __NR_Linux} +#define NATIVE_SYSCALL_RANGES \ + { SYSRANGE(0, 237), SYSRANGE(239, 328), SYSRANGE(424, 446), SYSRANGE(448, 466) } +#if __MIPSEB__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPS64 +#elif __MIPSEL__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPSEL64 +#endif /* __MIPSEB__ / __MIPSEL__ (n64) */ +#elif __mips__ && _MIPS_SIM == _MIPS_SIM_NABI32 +/* n32 abi in both endianness cases */ +#include <asm/unistd.h> +#define SYSRANGE(low, high) {(low) + __NR_Linux, (high) + __NR_Linux} +#define NATIVE_SYSCALL_RANGES \ + { SYSRANGE(0, 241), SYSRANGE(243, 332), SYSRANGE(403, 414), \ + SYSRANGE(416, 446), SYSRANGE(448, 466) } +#if __MIPSEB__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPS64N32 +#elif __MIPSEL__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_MIPSEL64N32 +#endif /* __MIPSEB__ / __MIPSEL__ (n32) */ +#elif __hppa64__ /* hppa64 must be checked before hppa */ +#define NATIVE_SYSCALL_RANGES \ + { {0, 101}, {103, 126}, {128, 129}, {131, 136}, {138, 166}, \ + {168, 168}, {170, 195}, {198, 202}, {206, 212}, {215, 219}, \ + {222, 262}, {264, 302}, {304, 356}, {424, 446}, {448, 466} } +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PARISC64 +#elif __hppa__ +#define NATIVE_SYSCALL_RANGES \ + { {0, 101}, {103, 126}, {128, 129}, {131, 136}, {138, 166}, \ + {168, 168}, {170, 195}, {198, 202}, {206, 212}, {215, 219}, \ + {222, 262}, {264, 302}, {304, 356}, {403, 414}, {416, 446}, {448, 466} } +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PARISC +#elif __PPC64__ +#define NATIVE_SYSCALL_RANGES \ + { {0, 191}, {198, 203}, {205, 223}, {225, 225}, {227, 253}, \ + {255, 256}, {258, 365}, {378, 388}, {392, 402}, {424, 446}, {448, 466} } 
+#ifdef __BIG_ENDIAN__ +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PPC64 +#else +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PPC64LE +#endif +#elif __PPC__ +#define NATIVE_SYSCALL_RANGES \ + { {0, 223}, {225, 256}, {258, 365}, {378, 388}, {393, 414}, \ + {416, 446}, {448, 466} } +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_PPC +#elif __s390x__ /* s390x must be checked before s390 */ +#define NATIVE_SYSCALL_RANGES \ + { {1, 12}, {14, 15}, {19, 22}, {26, 27}, {29, 30}, {33, 34}, \ + {36, 43}, {45, 45}, {48, 48}, {51, 52}, {54, 55}, {57, 57}, \ + {60, 67}, {72, 75}, {77, 79}, {83, 83}, {85, 94}, {96, 97}, \ + {99, 100}, {102, 108}, {110, 112}, {114, 122}, {124, 137}, \ + {141, 163}, {167, 169}, {172, 181}, {183, 191}, {198, 220}, \ + {222, 222}, {224, 241}, {243, 262}, {265, 386}, {392, 402}, {424, 466} } +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_S390X +#elif __s390__ +#define NATIVE_SYSCALL_RANGES \ + { {1, 16}, {19, 27}, {29, 30}, {33, 34}, {36, 43}, {45, 52}, \ + {54, 55}, {57, 57}, {60, 67}, {70, 81}, {83, 83}, {85, 97}, \ + {99, 108}, {110, 112}, {114, 122}, {124, 165}, {167, 241}, \ + {243, 262}, {264, 386}, {393, 414}, {416, 466} } +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_S390 +#elif __riscv && __riscv_xlen == 64 +#define NATIVE_SYSCALL_RANGES { {0, 37}, {39, 243}, {258, 294}, {424, 466} } +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_RISCV64 +#elif __riscv && __riscv_xlen == 32 +#define NATIVE_SYSCALL_RANGES \ + { {0, 3}, {5, 37}, {39, 71}, {74, 78}, {81, 85}, {89, 97}, {99, 100}, \ + {102, 107}, {109, 109}, {111, 111}, {116, 126}, {128, 136}, {138, 162}, \ + {165, 168}, {172, 181}, {184, 191}, {193, 242}, {258, 259}, {261, 265}, \ + {267, 291}, {293, 294}, {403, 414}, {416, 466} } +#define AUDIT_ARCH_NATIVE AUDIT_ARCH_RISCV32 +#else +#error cannot determine which AUDIT_ARCH_* value to use for AUDIT_ARCH_NATIVE +#endif + +#else +#ifndef NATIVE_SYSCALL_RANGES +/* Fall back to default-allow if the user specified (with + -DAUDIT_ARCH_NATIVE=...) 
an arch but not NATIVE_SYSCALL_RANGES */ +#define NATIVE_SYSCALL_RANGES {} +#endif +#endif /* #ifndef AUDIT_ARCH_NATIVE */ + +namespace nix { + +struct Uint32RangeAction { + uint32_t low; /* inclusive */ + uint32_t high; /* inclusive */ + std::vector<struct sock_filter> instructions; +}; + +std::vector<struct sock_filter> rangeActionsToFilter(std::vector<Uint32RangeAction> & ranges); + + +std::vector<struct sock_filter> +seccompMatchu64(std::vector<struct sock_filter> & out, + uint64_t value, + std::vector<struct sock_filter> instructions, + uint32_t offset); +} diff --git a/nix/libutil/spawn.cc b/nix/libutil/spawn.cc index 93bab9f59e..414849b6f0 100644 --- a/nix/libutil/spawn.cc +++ b/nix/libutil/spawn.cc @@ -51,6 +51,8 @@ #ifdef __linux__ #include <sys/personality.h> +#include <linux/seccomp.h> +#include <linux/filter.h> #endif #if defined(SYS_pivot_root) @@ -281,6 +283,36 @@ void setIDsAction(SpawnContext & ctx) throw SysError("setuid failed"); } +void setNoNewPrivsAction(SpawnContext & ctx) +{ + if(ctx.setNoNewPrivs) /* no-op unless the caller asked for no_new_privs */ +#if __linux__ && defined(PR_SET_NO_NEW_PRIVS) + if(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) /* arg2 must be 1 to enable the attribute; any other value fails with EINVAL */ + throw SysError("setting PR_SET_NO_NEW_PRIVS"); +#else + throw Error("setting PR_SET_NO_NEW_PRIVS not supported on this system"); +#endif +} + +void addSeccompFilterAction(SpawnContext & ctx) +{ + if(ctx.addSeccompFilter) { +#if __linux__ && defined(PR_SET_SECCOMP) && defined(SECCOMP_MODE_FILTER) + /* We use no extra functionality from the seccomp system call, so + * just use prctl. 
*/ + if(ctx.seccompFilter.size() > USHRT_MAX) + throw Error("seccomp filter too large"); + struct sock_fprog prog; + prog.len = (unsigned short) ctx.seccompFilter.size(); + prog.filter = ctx.seccompFilter.data(); + if(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1) + throw SysError("installing seccomp filter"); +#else + throw Error("setting seccomp filter not supported on this system"); +#endif + } +} + void restoreSIGPIPEAction(SpawnContext & ctx) { @@ -336,6 +368,8 @@ Phases getBasicSpawnPhases() { "setPersonality", setPersonalityAction }, { "oomSacrifice", oomSacrificeAction }, { "setIDs", setIDsAction }, + { "setNoNewPrivs", setNoNewPrivsAction }, + { "addSeccompFilter", addSeccompFilterAction }, { "restoreSIGPIPE", restoreSIGPIPEAction }, { "setupSuccess", setupSuccessAction }, { "exec", execAction } }; @@ -773,6 +807,8 @@ Phases getCloneSpawnPhases() CloneSpawnContext.lockMountsMapAll = true. */ { "lockMounts", lockMountsAction }, { "setIDs", setIDsAction }, + { "setNoNewPrivs", setNoNewPrivsAction }, + { "addSeccompFilter", addSeccompFilterAction }, { "restoreSIGPIPE", restoreSIGPIPEAction }, { "setupSuccess", setupSuccessAction }, { "exec", execAction }}; diff --git a/nix/libutil/spawn.hh b/nix/libutil/spawn.hh index edc528312d..5e75bcfb09 100644 --- a/nix/libutil/spawn.hh +++ b/nix/libutil/spawn.hh @@ -3,6 +3,9 @@ #include <util.hh> #include <map> #include <stddef.h> +#ifdef __linux__ +#include <linux/filter.h> +#endif namespace nix { struct SpawnContext; /* Forward declaration */ @@ -57,6 +60,11 @@ struct SpawnContext { bool dropAmbientCapabilities = false; /* Whether to drop ambient * capabilities if on a system that * supports them. 
*/ + bool setNoNewPrivs = false; + bool addSeccompFilter = false; +#if __linux__ + std::vector<struct sock_filter> seccompFilter; +#endif bool doChroot = false; Path chrootRootDir; void * extraData; /* Extra user data */ @@ -118,6 +126,8 @@ Action closeMostFDsAction; Action setPersonalityAction; Action oomSacrificeAction; Action setIDsAction; +Action setNoNewPrivsAction; +Action addSeccompFilterAction; Action restoreSIGPIPEAction; Action setupSuccessAction; Action execAction; diff --git a/nix/local.mk b/nix/local.mk index 9f21550af2..7c1b81e9a6 100644 --- a/nix/local.mk +++ b/nix/local.mk @@ -57,7 +57,8 @@ libutil_a_SOURCES = \ %D%/libutil/serialise.cc \ %D%/libutil/util.cc \ %D%/libutil/hash.cc \ - %D%/libutil/spawn.cc + %D%/libutil/spawn.cc \ + %D%/libutil/seccomp.cc libutil_headers = \ %D%/libutil/affinity.hh \ @@ -66,7 +67,8 @@ libutil_headers = \ %D%/libutil/util.hh \ %D%/libutil/archive.hh \ %D%/libutil/types.hh \ - %D%/libutil/spawn.hh + %D%/libutil/spawn.hh \ + %D%/libutil/seccomp.hh libutil_a_CPPFLAGS = \ -I$(top_builddir)/nix \ |