From b2b9571935f9188086b2e7b434840eeda6c42805 Mon Sep 17 00:00:00 2001 From: Ludovic Courtès Date: Tue, 22 Nov 2022 07:17:17 +0100 Subject: offload: Gracefully handle 'guix repl' protocol errors. Fixes . Reported by Mathieu Othacehe . Previously, if a machine had a buggy 'guix repl', 'guix offload' would crash with a backtrace instead of just ignoring the machine. * guix/scripts/offload.scm (remote-inferior*): New procedure. (check-machine-availability)[if-true]: New procedure. Use 'remote-inferior*' and 'if-true'. (check-machine-status): Use 'remote-inferior*'. --- guix/scripts/offload.scm | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'guix/scripts/offload.scm') diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm index 925325ef5f..8ab393c0ac 100644 --- a/guix/scripts/offload.scm +++ b/guix/scripts/offload.scm @@ -25,7 +25,7 @@ #:autoload (ssh auth) (userauth-public-key!) #:autoload (ssh session) (make-session connect! get-error - disconnect! session-set!) + disconnect! session-set! session-get) #:autoload (ssh version) (zlib-support?) #:use-module (guix config) #:use-module (guix records) @@ -34,7 +34,8 @@ send-files retrieve-files retrieve-files* remote-inferior report-guile-error) #:use-module (guix store) - #:autoload (guix inferior) (inferior-eval close-inferior inferior?) + #:autoload (guix inferior) (inferior-eval close-inferior + inferior? inferior-protocol-error?) #:autoload (guix derivations) (read-derivation-from-file derivation-file-name build-derivations) @@ -473,6 +474,15 @@ logical cores available, to give a rough estimation of CPU usage. Return (vector-set! vec j (vector-ref vec (- i 1))) (loop (cons val result) (- i 1)))))))) +(define (remote-inferior* session) + "Like 'remote-inferior', but upon error return #f." + (or (guard (c ((inferior-protocol-error? c) #f)) + (remote-inferior session)) + (begin + (warning (G_ "failed to run 'guix repl' on machine '~a'~%") + (session-get session 'host)) + #f))) + (define (choose-build-machine machines) "Return two values: the best machine among MACHINES and its build slot (which must later be released with 'release-build-slot'), or #f and #f." @@ -511,7 +521,7 @@ slot (which must later be released with 'release-build-slot'), or #f and #f." ;; too costly to call it once for every machine. (let* ((session (false-if-exception (open-ssh-session best %short-timeout))) - (node (and session (remote-inferior session))) + (node (and session (remote-inferior* session))) (load (and node (node-load node))) (threshold (build-machine-overload-threshold best)) (space (and node (node-free-disk-space node)))) @@ -708,6 +718,11 @@ machine." (and (string=? (build-machine-name m1) (build-machine-name m2)) (= (build-machine-port m1) (build-machine-port m2)))) + (define (if-true proc) + (lambda args + (when (every ->bool args) + (apply proc args)))) + ;; A given build machine may appear several times (e.g., once for ;; "x86_64-linux" and a second time for "i686-linux"); test them only once. (let ((machines (filter pred @@ -718,12 +733,12 @@ machine." (let* ((names (map build-machine-name machines)) (sockets (map build-machine-daemon-socket machines)) (sessions (map (cut open-ssh-session <> %short-timeout) machines)) - (nodes (map remote-inferior sessions))) - (for-each assert-node-has-guix nodes names) - (for-each assert-node-repl nodes names) - (for-each assert-node-can-import sessions nodes names sockets) - (for-each assert-node-can-export sessions nodes names sockets) - (for-each close-inferior nodes) + (nodes (map remote-inferior* sessions))) + (for-each (if-true assert-node-has-guix) nodes names) + (for-each (if-true assert-node-repl) nodes names) + (for-each (if-true assert-node-can-import) sessions nodes names sockets) + (for-each (if-true assert-node-can-export) sessions nodes names sockets) + (for-each (if-true close-inferior) nodes) (for-each disconnect! sessions)))) (define (check-machine-status machine-file pred) @@ -743,10 +758,9 @@ machine." (define session (open-ssh-session machine %short-timeout)) - (match (remote-inferior session) + (match (remote-inferior* session) (#f - (warning (G_ "failed to run 'guix repl' on machine '~a'~%") - (build-machine-name machine))) + #f) ((? inferior? inferior) (let ((now (car (gettimeofday)))) (match (inferior-eval '(list (uname) -- cgit v1.2.3 From d33ed58169edc027cfb6c256ecabde87e59918ed Mon Sep 17 00:00:00 2001 From: Ludovic Courtès Date: Wed, 23 Nov 2022 19:32:04 +0100 Subject: offload: Increase default 'overload-threshold' value. When offloading to a single machine, the previous default value would lead 'guix offload' to wait possibly for several minutes between subsequent builds until normalized load would finally go below 0.6. Increasing it mitigates that. * guix/scripts/offload.scm ()[overload-threshold]: Bump to 0.8. * doc/guix.texi (Daemon Offload Setup): Likewise. --- doc/guix.texi | 2 +- guix/scripts/offload.scm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'guix/scripts/offload.scm') diff --git a/doc/guix.texi b/doc/guix.texi index c0cb24d709..f39a0aa700 100644 --- a/doc/guix.texi +++ b/doc/guix.texi @@ -1409,7 +1409,7 @@ when transferring files to and from build machines. File name of the Unix-domain socket @command{guix-daemon} is listening to on that machine. -@item @code{overload-threshold} (default: @code{0.6}) +@item @code{overload-threshold} (default: @code{0.8}) The load threshold above which a potential offload machine is disregarded by the offload scheduler. The value roughly translates to the total processor usage of the build machine, ranging from 0.0 (0%) to diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm index 8ab393c0ac..578b3b9888 100644 --- a/guix/scripts/offload.scm +++ b/guix/scripts/offload.scm @@ -112,7 +112,7 @@ ;; A #f value tells the offload scheduler to disregard the load of the build ;; machine when selecting the best offload machine. (overload-threshold build-machine-overload-threshold ; inexact real between - (default 0.6)) ; 0.0 and 1.0 | #f + (default 0.8)) ; 0.0 and 1.0 | #f (parallel-builds build-machine-parallel-builds ; number (default 1)) (speed build-machine-speed ; inexact real -- cgit v1.2.3