summaryrefslogtreecommitdiff
path: root/guix/scripts/offload.scm
diff options
context:
space:
mode:
Diffstat (limited to 'guix/scripts/offload.scm')
-rw-r--r--guix/scripts/offload.scm128
1 files changed, 82 insertions, 46 deletions
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm
index ee5857e16b..1e0ea1c4c6 100644
--- a/guix/scripts/offload.scm
+++ b/guix/scripts/offload.scm
@@ -321,6 +321,13 @@ hook."
(set-port-revealed! port 1)
port))
+(define (node-free-disk-space node)
+ "Return the free disk space, in bytes, in NODE's store."
+ (node-eval node
+ `(begin
+ (use-modules (guix build syscalls))
+ (free-disk-space ,(%store-prefix)))))
+
(define* (transfer-and-offload drv machine
#:key
(inputs '())
@@ -360,9 +367,19 @@ MACHINE."
(derivation-file-name drv)
(build-machine-name machine)
(nix-protocol-error-message c))
- ;; Use exit code 100 for a permanent build failure. The daemon
- ;; interprets other non-zero codes as transient build failures.
- (primitive-exit 100)))
+ (let* ((space (false-if-exception
+ (node-free-disk-space (make-node session)))))
+
+ ;; Use exit code 100 for a permanent build failure. The daemon
+ ;; interprets other non-zero codes as transient build failures.
+ (if (and space (< space (* 10 (expt 2 20))))
+ (begin
+ (format (current-error-port)
+ (G_ "build failure may have been caused by lack \
+of free disk space on '~a'~%")
+ (build-machine-name machine))
+ (primitive-exit 1))
+ (primitive-exit 100)))))
(parameterize ((current-build-output-port (build-log-port)))
(build-derivations store (list drv))))
@@ -392,33 +409,37 @@ MACHINE."
(build-requirements-features requirements)
(build-machine-features machine))))
-(define (machine-load machine)
- "Return the load of MACHINE, divided by the number of parallel builds
-allowed on MACHINE. Return +∞ if MACHINE is unreachable."
- ;; Note: This procedure is costly since it creates a new SSH session.
- (match (false-if-exception (open-ssh-session machine))
- ((? session? session)
- (let* ((pipe (open-remote-pipe* session OPEN_READ
- "cat" "/proc/loadavg"))
- (line (read-line pipe)))
- (close-port pipe)
- (disconnect! session)
+(define %minimum-disk-space
+ ;; Minimum disk space required on the build machine for a build to be
+ ;; offloaded. This keeps us from offloading to machines that are bound to
+ ;; run out of disk space.
+ (* 100 (expt 2 20))) ;100 MiB
- (if (eof-object? line)
- +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
- (match (string-tokenize line)
- ((one five fifteen . x)
- (let* ((raw (string->number one))
- (jobs (build-machine-parallel-builds machine))
- (normalized (/ raw jobs)))
- (format (current-error-port) "load on machine '~a' is ~s\
+(define (node-load node)
+ "Return the load on NODE. Return +∞ if NODE is misbehaving."
+ (let ((line (node-eval node
+ '(begin
+ (use-modules (ice-9 rdelim))
+ (call-with-input-file "/proc/loadavg"
+ read-string)))))
+ (if (eof-object? line)
+ +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded
+ (match (string-tokenize line)
+ ((one five fifteen . x)
+ (string->number one))
+ (x
+ +inf.0)))))
+
+(define (normalized-load machine load)
+ "Divide LOAD by the number of parallel builds of MACHINE."
+ (if (rational? load)
+ (let* ((jobs (build-machine-parallel-builds machine))
+ (normalized (/ load jobs)))
+ (format (current-error-port) "load on machine '~a' is ~s\
(normalized: ~s)~%"
- (build-machine-name machine) raw normalized)
- normalized))
- (x
- +inf.0))))) ;something's fishy about MACHINE, so avoid it
- (x
- +inf.0))) ;failed to connect to MACHINE, so avoid it
+ (build-machine-name machine) load normalized)
+ normalized)
+ load))
(define (machine-lock-file machine hint)
"Return the name of MACHINE's lock file for HINT."
@@ -484,21 +505,32 @@ slot (which must later be released with 'release-build-slot'), or #f and #f."
(match machines+slots
(((best slot) others ...)
;; Return the best machine unless it's already overloaded.
- ;; Note: We call 'machine-load' only as a last resort because it is
+ ;; Note: We call 'node-load' only as a last resort because it is
;; too costly to call it once for every machine.
- (if (< (machine-load best) 2.)
- (match others
- (((machines slots) ...)
- ;; Release slots from the uninteresting machines.
- (for-each release-build-slot slots)
+ (let* ((session (false-if-exception (open-ssh-session best)))
+ (node (and session (make-node session)))
+ (load (and node (normalized-load best (node-load node))))
+ (space (and node (node-free-disk-space node))))
+ (when session (disconnect! session))
+ (if (and node (< load 2.) (>= space %minimum-disk-space))
+ (match others
+ (((machines slots) ...)
+ ;; Release slots from the uninteresting machines.
+ (for-each release-build-slot slots)
- ;; The caller must keep SLOT to protect it from GC and to
- ;; eventually release it.
- (values best slot)))
- (begin
- ;; BEST is overloaded, so try the next one.
- (release-build-slot slot)
- (loop others))))
+ ;; The caller must keep SLOT to protect it from GC and to
+ ;; eventually release it.
+ (values best slot)))
+ (begin
+ ;; BEST is unsuitable, so try the next one.
+ (when (and space (< space %minimum-disk-space))
+ (format (current-error-port)
+ "skipping machine '~a' because it is low \
+on disk space (~,2f MiB free)~%"
+ (build-machine-name best)
+ (/ space (expt 2 20) 1.)))
+ (release-build-slot slot)
+ (loop others)))))
(()
(values #f #f))))))
@@ -689,16 +721,20 @@ machine."
(info (G_ "getting status of ~a build machines defined in '~a'...~%")
(length machines) machine-file)
(for-each (lambda (machine)
- (let* ((node (make-node (open-ssh-session machine)))
- (uts (node-eval node '(uname))))
+ (let* ((session (open-ssh-session machine))
+ (node (make-node session))
+ (uts (node-eval node '(uname)))
+ (load (node-load node))
+ (free (node-free-disk-space node)))
+ (disconnect! session)
(format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\
- host name: ~a~% normalized load: ~a~%"
+ host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%"
(build-machine-name machine)
(utsname:sysname uts) (utsname:release uts)
(utsname:machine uts)
(utsname:nodename uts)
- (parameterize ((current-error-port (%make-void-port "rw+")))
- (machine-load machine)))))
+ load
+ (/ free (expt 2 20) 1.))))
machines)))