diff options
Diffstat (limited to 'guix/scripts/offload.scm')
-rw-r--r-- | guix/scripts/offload.scm | 128 |
1 files changed, 82 insertions, 46 deletions
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm index ee5857e16b..1e0ea1c4c6 100644 --- a/guix/scripts/offload.scm +++ b/guix/scripts/offload.scm @@ -321,6 +321,13 @@ hook." (set-port-revealed! port 1) port)) +(define (node-free-disk-space node) + "Return the free disk space, in bytes, in NODE's store." + (node-eval node + `(begin + (use-modules (guix build syscalls)) + (free-disk-space ,(%store-prefix))))) + (define* (transfer-and-offload drv machine #:key (inputs '()) @@ -360,9 +367,19 @@ MACHINE." (derivation-file-name drv) (build-machine-name machine) (nix-protocol-error-message c)) - ;; Use exit code 100 for a permanent build failure. The daemon - ;; interprets other non-zero codes as transient build failures. - (primitive-exit 100))) + (let* ((space (false-if-exception + (node-free-disk-space (make-node session))))) + + ;; Use exit code 100 for a permanent build failure. The daemon + ;; interprets other non-zero codes as transient build failures. + (if (and space (< space (* 10 (expt 2 20)))) + (begin + (format (current-error-port) + (G_ "build failure may have been caused by lack \ +of free disk space on '~a'~%") + (build-machine-name machine)) + (primitive-exit 1)) + (primitive-exit 100))))) (parameterize ((current-build-output-port (build-log-port))) (build-derivations store (list drv)))) @@ -392,33 +409,37 @@ MACHINE." (build-requirements-features requirements) (build-machine-features machine)))) -(define (machine-load machine) - "Return the load of MACHINE, divided by the number of parallel builds -allowed on MACHINE. Return +∞ if MACHINE is unreachable." - ;; Note: This procedure is costly since it creates a new SSH session. - (match (false-if-exception (open-ssh-session machine)) - ((? session? session) - (let* ((pipe (open-remote-pipe* session OPEN_READ - "cat" "/proc/loadavg")) - (line (read-line pipe))) - (close-port pipe) - (disconnect! session) +(define %minimum-disk-space + ;; Minimum disk space required on the build machine for a build to be + ;; offloaded. This keeps us from offloading to machines that are bound to + ;; run out of disk space. + (* 100 (expt 2 20))) ;100 MiB - (if (eof-object? line) - +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded - (match (string-tokenize line) - ((one five fifteen . x) - (let* ((raw (string->number one)) - (jobs (build-machine-parallel-builds machine)) - (normalized (/ raw jobs))) - (format (current-error-port) "load on machine '~a' is ~s\ +(define (node-load node) + "Return the load on NODE. Return +∞ if NODE is misbehaving." + (let ((line (node-eval node + '(begin + (use-modules (ice-9 rdelim)) + (call-with-input-file "/proc/loadavg" + read-string))))) + (if (eof-object? line) + +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded + (match (string-tokenize line) + ((one five fifteen . x) + (string->number one)) + (x + +inf.0))))) + +(define (normalized-load machine load) + "Divide LOAD by the number of parallel builds of MACHINE." + (if (rational? load) + (let* ((jobs (build-machine-parallel-builds machine)) + (normalized (/ load jobs))) + (format (current-error-port) "load on machine '~a' is ~s\ (normalized: ~s)~%" - (build-machine-name machine) raw normalized) - normalized)) - (x - +inf.0))))) ;something's fishy about MACHINE, so avoid it - (x - +inf.0))) ;failed to connect to MACHINE, so avoid it + (build-machine-name machine) load normalized) + normalized) + load)) (define (machine-lock-file machine hint) "Return the name of MACHINE's lock file for HINT." @@ -484,21 +505,32 @@ slot (which must later be released with 'release-build-slot'), or #f and #f." (match machines+slots (((best slot) others ...) ;; Return the best machine unless it's already overloaded. - ;; Note: We call 'machine-load' only as a last resort because it is + ;; Note: We call 'node-load' only as a last resort because it is ;; too costly to call it once for every machine. - (if (< (machine-load best) 2.) - (match others - (((machines slots) ...) - ;; Release slots from the uninteresting machines. - (for-each release-build-slot slots) + (let* ((session (false-if-exception (open-ssh-session best))) + (node (and session (make-node session))) + (load (and node (normalized-load best (node-load node)))) + (space (and node (node-free-disk-space node)))) + (when session (disconnect! session)) + (if (and node (< load 2.) (>= space %minimum-disk-space)) + (match others + (((machines slots) ...) + ;; Release slots from the uninteresting machines. + (for-each release-build-slot slots) - ;; The caller must keep SLOT to protect it from GC and to - ;; eventually release it. - (values best slot))) - (begin - ;; BEST is overloaded, so try the next one. - (release-build-slot slot) - (loop others)))) + ;; The caller must keep SLOT to protect it from GC and to + ;; eventually release it. + (values best slot))) + (begin + ;; BEST is unsuitable, so try the next one. + (when (and space (< space %minimum-disk-space)) + (format (current-error-port) + "skipping machine '~a' because it is low \ +on disk space (~,2f MiB free)~%" + (build-machine-name best) + (/ space (expt 2 20) 1.))) + (release-build-slot slot) + (loop others))))) (() (values #f #f)))))) @@ -689,16 +721,20 @@ machine." (info (G_ "getting status of ~a build machines defined in '~a'...~%") (length machines) machine-file) (for-each (lambda (machine) - (let* ((node (make-node (open-ssh-session machine))) - (uts (node-eval node '(uname)))) + (let* ((session (open-ssh-session machine)) + (node (make-node session)) + (uts (node-eval node '(uname))) + (load (node-load node)) + (free (node-free-disk-space node))) + (disconnect! session) (format #t "~a~% kernel: ~a ~a~% architecture: ~a~%\ - host name: ~a~% normalized load: ~a~%" + host name: ~a~% normalized load: ~a~% free disk space: ~,2f MiB~%" (build-machine-name machine) (utsname:sysname uts) (utsname:release uts) (utsname:machine uts) (utsname:nodename uts) - (parameterize ((current-error-port (%make-void-port "rw+"))) - (machine-load machine))))) + load + (/ free (expt 2 20) 1.)))) machines))) |