summaryrefslogtreecommitdiff
path: root/gnu/build/linux-container.scm
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/build/linux-container.scm')
-rw-r--r--gnu/build/linux-container.scm241
1 files changed, 137 insertions, 104 deletions
diff --git a/gnu/build/linux-container.scm b/gnu/build/linux-container.scm
index 0df51c390b..ff5449d0b0 100644
--- a/gnu/build/linux-container.scm
+++ b/gnu/build/linux-container.scm
@@ -190,7 +190,10 @@ for the process."
(remount-read-only "/"))))
(define* (initialize-user-namespace pid host-uids
- #:key (guest-uid 0) (guest-gid 0))
+ #:key
+ (host-uid (getuid))
+ (host-gid (getgid))
+ (guest-uid 0) (guest-gid 0))
"Configure the user namespace for PID. HOST-UIDS specifies the number of
host user identifiers to map into the user namespace. GUEST-UID and GUEST-GID
specify the first UID (respectively GID) that host UIDs (respectively GIDs)
@@ -201,24 +204,21 @@ map to in the namespace."
(define (scope file)
(string-append proc-dir file))
- (let ((uid (getuid))
- (gid (getgid)))
-
- ;; Only root can write to the gid map without first disabling the
- ;; setgroups syscall.
- (unless (and (zero? uid) (zero? gid))
- (call-with-output-file (scope "/setgroups")
- (lambda (port)
- (display "deny" port))))
-
- ;; Map the user/group that created the container to the root user
- ;; within the container.
- (call-with-output-file (scope "/uid_map")
- (lambda (port)
- (format port "~d ~d ~d" guest-uid uid host-uids)))
- (call-with-output-file (scope "/gid_map")
+ ;; Only root can write to the gid map without first disabling the
+ ;; setgroups syscall.
+ (unless (and (zero? host-uid) (zero? host-gid))
+ (call-with-output-file (scope "/setgroups")
(lambda (port)
- (format port "~d ~d ~d" guest-gid gid host-uids)))))
+ (display "deny" port))))
+
+ ;; Map the user/group that created the container to the root user
+ ;; within the container.
+ (call-with-output-file (scope "/uid_map")
+ (lambda (port)
+ (format port "~d ~d ~d" guest-uid host-uid host-uids)))
+ (call-with-output-file (scope "/gid_map")
+ (lambda (port)
+ (format port "~d ~d ~d" guest-gid host-gid host-uids))))
(define (namespaces->bit-mask namespaces)
"Return the number suitable for the 'flags' argument of 'clone' that
@@ -239,12 +239,14 @@ corresponds to the symbols in NAMESPACES."
#:key (guest-uid 0) (guest-gid 0)
(populate-file-system (const #t))
(loopback-network? #t)
+ (lock-mounts? #t)
writable-root?)
"Run THUNK in a new container process and return its PID. ROOT specifies
the root directory for the container. MOUNTS is a list of <file-system>
objects that specify file systems to mount inside the container. NAMESPACES
is a list of symbols that correspond to the possible Linux namespaces: mnt,
-ipc, uts, user, and net.
+ipc, uts, user, and net. When LOCK-MOUNTS? is true, arrange so that none of
+MOUNTS can be unmounted or remounted individually from within THUNK.
When LOOPBACK-NETWORK? is true and 'net is amount NAMESPACES, set up the
loopback device (\"lo\") and a minimal /etc/hosts.
@@ -261,75 +263,93 @@ that host UIDs (respectively GIDs) map to in the namespace."
;; child process blocks until the parent writes to it.
(match (socketpair PF_UNIX (logior SOCK_CLOEXEC SOCK_STREAM) 0)
((child . parent)
- (let ((flags (namespaces->bit-mask namespaces)))
- (match (clone flags)
- (0
- (call-with-clean-exit
- (lambda ()
- (close-port parent)
- ;; Wait for parent to set things up.
- (match (read child)
- ('ready
- (purify-environment)
- (when (and (memq 'mnt namespaces)
- (not (string=? root "/")))
- (catch #t
- (lambda ()
- (mount-file-systems root mounts
- #:mount-/proc? (memq 'pid namespaces)
- #:mount-/sys? (memq 'net
- namespaces)
- #:populate-file-system
- (lambda ()
- (populate-file-system)
- (when (and (memq 'net namespaces)
- loopback-network?)
- (set-network-interface-up "lo")
+ (safe-clone
+ (namespaces->bit-mask namespaces)
+ (lambda ()
+ (call-with-clean-exit
+ (lambda ()
+ (close-port parent)
+ ;; Wait for parent to set things up.
+ (match (read child)
+ ('ready
+ (purify-environment)
+ (when (and (memq 'mnt namespaces)
+ (not (string=? root "/")))
+ (catch #t
+ (lambda ()
+ (mount-file-systems root mounts
+ #:mount-/proc? (memq 'pid namespaces)
+ #:mount-/sys? (memq 'net
+ namespaces)
+ #:populate-file-system
+ (lambda ()
+ (populate-file-system)
+ (when (and (memq 'net namespaces)
+ loopback-network?)
+ (set-network-interface-up "lo")
+
+ ;; When isolated from the
+ ;; network, provide a minimal
+ ;; /etc/hosts to resolve
+ ;; "localhost".
+ (mkdir-p "/etc")
+ (call-with-output-file "/etc/hosts"
+ (lambda (port)
+ (display "127.0.0.1 localhost\n" port)
+ (chmod port #o444)))))
+ #:writable-root?
+ (or writable-root?
+ (not (memq 'mnt namespaces)))))
+ (lambda args
+ ;; Forward the exception to the parent process.
+ ;; FIXME: SRFI-35 conditions and non-trivial objects
+ ;; cannot be 'read' so they shouldn't be written as is.
+ (write args child)
+ (primitive-exit 3))))
- ;; When isolated from the
- ;; network, provide a minimal
- ;; /etc/hosts to resolve
- ;; "localhost".
- (mkdir-p "/etc")
- (call-with-output-file "/etc/hosts"
- (lambda (port)
- (display "127.0.0.1 localhost\n" port)
- (chmod port #o444)))))
- #:writable-root?
- (or writable-root?
- (not (memq 'mnt namespaces)))))
- (lambda args
- ;; Forward the exception to the parent process.
- ;; FIXME: SRFI-35 conditions and non-trivial objects
- ;; cannot be 'read' so they shouldn't be written as is.
- (write args child)
- (primitive-exit 3))))
- ;; TODO: Manage capabilities.
- (write 'ready child)
- (close-port child)
- (thunk))
- (_ ;parent died or something
- (primitive-exit 2))))))
- (pid
- (close-port child)
- (when (memq 'user namespaces)
- (initialize-user-namespace pid host-uids
- #:guest-uid guest-uid
- #:guest-gid guest-gid))
- ;; TODO: Initialize cgroups.
- (write 'ready parent)
- (newline parent)
+ (when (and lock-mounts?
+ (memq 'mnt namespaces)
+ (memq 'user namespaces))
+ ;; Create a new mount namespace owned by a new user
+ ;; namespace to "lock" together previous mounts, such that
+ ;; they cannot be unmounted or remounted separately--see
+ ;; mount_namespaces(7).
+ (let ((uid (getuid)) (gid (getgid)))
+ (unshare (logior CLONE_NEWUSER CLONE_NEWNS))
+ (when (file-exists? "/proc/self")
+ (initialize-user-namespace (getpid)
+ host-uids
+ #:host-uid uid
+ #:host-gid gid
+ #:guest-uid guest-uid
+ #:guest-gid guest-gid))))
- ;; Check whether the child process' setup phase succeeded.
- (let ((message (read parent)))
- (close-port parent)
- (match message
- ('ready ;success
- pid)
- (((? symbol? key) args ...) ;exception
- (apply throw key args))
- (_ ;unexpected termination
- #f)))))))))
+ ;; TODO: Manage capabilities.
+ (write 'ready child)
+ (close-port child)
+ (thunk))
+ (_ ;parent died or something
+ (primitive-exit 2))))))
+ (lambda (pid)
+ (close-port child)
+ (when (memq 'user namespaces)
+ (initialize-user-namespace pid host-uids
+ #:guest-uid guest-uid
+ #:guest-gid guest-gid))
+ ;; TODO: Initialize cgroups.
+ (write 'ready parent)
+ (newline parent)
+
+ ;; Check whether the child process' setup phase succeeded.
+ (let ((message (read parent)))
+ (close-port parent)
+ (match message
+ ('ready ;success
+ pid)
+ (((? symbol? key) args ...) ;exception
+ (apply throw key args))
+ (_ ;unexpected termination
+ #f))))))))
;; FIXME: This is copied from (guix utils), which we cannot use because it
;; would pull (guix config) and all.
@@ -376,6 +396,7 @@ if there are no child processes left."
(define* (call-with-container mounts thunk #:key (namespaces %namespaces)
(host-uids 1) (guest-uid 0) (guest-gid 0)
+ (lock-mounts? #t)
(relayed-signals (list SIGINT SIGTERM))
(child-is-pid1? #t)
(populate-file-system (const #t))
@@ -460,6 +481,7 @@ load path must be adjusted as needed."
(call-with-temporary-directory
(lambda (root)
(let ((pid (run-container root mounts namespaces host-uids thunk*
+ #:lock-mounts? lock-mounts?
#:guest-uid guest-uid
#:guest-gid guest-gid
#:populate-file-system populate-file-system
@@ -480,24 +502,35 @@ return the exit status, an integer as returned by 'waitpid'."
(0
(call-with-clean-exit
(lambda ()
- (for-each (lambda (ns)
- (let ((source (namespace-file (getpid) ns))
- (target (namespace-file pid ns)))
- ;; Joining the namespace that the process already
- ;; belongs to would throw an error so avoid that.
- ;; XXX: This /proc interface leads to TOCTTOU.
- (unless (string=? (readlink source) (readlink target))
- (call-with-input-file source
- (lambda (current-ns-port)
- (call-with-input-file target
- (lambda (new-ns-port)
- (setns (fileno new-ns-port) 0))))))))
- ;; It's important that the user namespace is joined first,
- ;; so that the user will have the privileges to join the
- ;; other namespaces. Furthermore, it's important that the
- ;; mount namespace is joined last, otherwise the /proc mount
- ;; point would no longer be accessible.
- '("user" "ipc" "uts" "net" "pid" "mnt"))
+ ;; First, determine the user namespace that owns the pid namespace and
+ ;; join that user namespace (the assumption is that it also owns all
+ ;; the other namespaces). It's important that the user namespace is
+ ;; joined first, so that the user will have the privileges to join the
+ ;; other namespaces.
+ (let* ((pid-ns (open-fdes (namespace-file pid "pid")
+ (logior O_CLOEXEC O_RDONLY)))
+ (user-ns (get-user-ns pid-ns)))
+ (close-fdes pid-ns)
+ (unless (equal? (stat user-ns)
+ (stat (namespace-file (getpid) "user")))
+ (setns user-ns 0))
+ (close-fdes user-ns)
+
+ ;; Then join all the remaining namespaces.
+ (for-each (lambda (ns)
+ (let ((source (namespace-file (getpid) ns))
+ (target (namespace-file pid ns)))
+ ;; Joining the namespace that the process already
+ ;; belongs to would throw an error so avoid that.
+ ;; XXX: This /proc interface leads to TOCTTOU.
+ (unless (string=? (readlink source) (readlink target))
+ (call-with-input-file target
+ (lambda (new-ns-port)
+ (setns (fileno new-ns-port) 0))))))
+ ;; It's important that the mount namespace is joined last,
+ ;; otherwise the /proc mount point would no longer be
+ ;; accessible.
+ '("ipc" "uts" "net" "pid" "mnt")))
(purify-environment)
(chdir "/")