diff options
Diffstat (limited to 'guix/scripts/offload.scm')
-rw-r--r-- | guix/scripts/offload.scm | 411 |
1 files changed, 212 insertions, 199 deletions
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm index 2e0268020c..6a4ae28689 100644 --- a/guix/scripts/offload.scm +++ b/guix/scripts/offload.scm @@ -24,8 +24,10 @@ #:use-module (ssh popen) #:use-module (ssh dist) #:use-module (ssh dist node) + #:use-module (ssh version) #:use-module (guix config) #:use-module (guix records) + #:use-module (guix ssh) #:use-module (guix store) #:use-module (guix derivations) #:use-module ((guix serialization) @@ -74,6 +76,10 @@ (private-key build-machine-private-key ; file name (default (user-openssh-private-key))) (host-key build-machine-host-key) ; string + (compression build-machine-compression ; string + (default "zlib@openssh.com,zlib")) + (compression-level build-machine-compression-level ;integer + (default 3)) (daemon-socket build-machine-daemon-socket ; string (default "/var/guix/daemon-socket/socket")) (parallel-builds build-machine-parallel-builds ; number @@ -168,86 +174,53 @@ private key from '~a': ~a") (session (make-session #:user (build-machine-user machine) #:host (build-machine-name machine) #:port (build-machine-port machine) - #:timeout 5 ;seconds + #:timeout 10 ;seconds ;; #:log-verbosity 'protocol #:identity (build-machine-private-key machine) + ;; By default libssh reads ~/.ssh/known_hosts + ;; and uses that to adjust its choice of cipher + ;; suites, which changes the type of host key + ;; that the server sends (RSA vs. Ed25519, + ;; etc.). Opt for something reproducible and + ;; stateless instead. + #:knownhosts "/dev/null" + ;; We need lightweight compression when ;; exchanging full archives. - #:compression "zlib" - #:compression-level 3))) - (connect! session) - - ;; Authenticate the server. XXX: Guile-SSH 0.10.1 doesn't know about - ;; ed25519 keys and 'get-key-type' returns #f in that case. - (let-values (((server) (get-server-public-key session)) - ((type key) (host-key->type+key - (build-machine-host-key machine)))) - (unless (and (or (not (get-key-type server)) - (eq? (get-key-type server) type)) - (string=? (public-key->string server) key)) - ;; Key mismatch: something's wrong. XXX: It could be that the server - ;; provided its Ed25519 key when we where expecting its RSA key. - (leave (_ "server at '~a' returned host key '~a' of type '~a' \ + #:compression + (build-machine-compression machine) + #:compression-level + (build-machine-compression-level machine)))) + (match (connect! session) + ('ok + ;; Authenticate the server. XXX: Guile-SSH 0.10.1 doesn't know about + ;; ed25519 keys and 'get-key-type' returns #f in that case. + (let-values (((server) (get-server-public-key session)) + ((type key) (host-key->type+key + (build-machine-host-key machine)))) + (unless (and (or (not (get-key-type server)) + (eq? (get-key-type server) type)) + (string=? (public-key->string server) key)) + ;; Key mismatch: something's wrong. XXX: It could be that the server + ;; provided its Ed25519 key when we where expecting its RSA key. + (leave (_ "server at '~a' returned host key '~a' of type '~a' \ instead of '~a' of type '~a'~%") - (build-machine-name machine) - (public-key->string server) (get-key-type server) - key type))) - - (let ((auth (userauth-public-key! session private))) - (unless (eq? 'success auth) - (disconnect! session) - (leave (_ "SSH public key authentication failed for '~a': ~a~%") - (build-machine-name machine) (get-error session)))) - - session)) - -(define* (connect-to-remote-daemon session - #:optional - (socket-name "/var/guix/daemon-socket/socket")) - "Connect to the remote build daemon listening on SOCKET-NAME over SESSION, -an SSH session. Return a <nix-server> object." - (define redirect - ;; Code run in SESSION to redirect the remote process' stdin/stdout to the - ;; daemon's socket, à la socat. The SSH protocol supports forwarding to - ;; Unix-domain sockets but libssh doesn't have an API for that, hence this - ;; hack. - `(begin - (use-modules (ice-9 match) (rnrs io ports)) - - (let ((sock (socket AF_UNIX SOCK_STREAM 0)) - (stdin (current-input-port)) - (stdout (current-output-port))) - (setvbuf stdin _IONBF) - (setvbuf stdout _IONBF) - (connect sock AF_UNIX ,socket-name) - - (let loop () - (match (select (list stdin sock) '() (list stdin stdout sock)) - ((reads writes ()) - (when (memq stdin reads) - (match (get-bytevector-some stdin) - ((? eof-object?) - (primitive-exit 0)) - (bv - (put-bytevector sock bv)))) - (when (memq sock reads) - (match (get-bytevector-some sock) - ((? eof-object?) - (primitive-exit 0)) - (bv - (put-bytevector stdout bv)))) - (loop)) - (_ - (primitive-exit 1))))))) - - (let ((channel - (open-remote-pipe* session OPEN_BOTH - ;; Sort-of shell-quote REDIRECT. - "guile" "-c" - (object->string - (object->string redirect))))) - (open-connection #:port channel))) + (build-machine-name machine) + (public-key->string server) (get-key-type server) + key type))) + + (let ((auth (userauth-public-key! session private))) + (unless (eq? 'success auth) + (disconnect! session) + (leave (_ "SSH public key authentication failed for '~a': ~a~%") + (build-machine-name machine) (get-error session)))) + + session) + (x + ;; Connection failed or timeout expired. + (leave (_ "failed to connect to '~a': ~a~%") + (build-machine-name machine) (get-error session)))))) ;;; @@ -363,8 +336,9 @@ MACHINE." ;; Protect DRV from garbage collection. (add-temp-root store (derivation-file-name drv)) - (send-files (cons (derivation-file-name drv) inputs) - store) + (with-store local + (send-files local (cons (derivation-file-name drv) inputs) store + #:log-port (current-output-port))) (format (current-error-port) "offloading '~a' to '~a'...~%" (derivation-file-name drv) (build-machine-name machine)) (format (current-error-port) "@ build-remote ~a ~a~%" @@ -379,92 +353,20 @@ MACHINE." ;; Use exit code 100 for a permanent build failure. The daemon ;; interprets other non-zero codes as transient build failures. (primitive-exit 100))) - (build-derivations store (list drv))) + (parameterize ((current-build-output-port (build-log-port))) + (build-derivations store (list drv)))) - (retrieve-files outputs store) + (retrieve-files* outputs store) (format (current-error-port) "done with offloaded '~a'~%" (derivation-file-name drv))) -(define (store-import-channel session) - "Return an output port to which archives to be exported to SESSION's store -can be written." - ;; Using the 'import-paths' RPC on a remote store would be slow because it - ;; makes a round trip every time 32 KiB have been transferred. This - ;; procedure instead opens a separate channel to use the remote - ;; 'import-paths' procedure, which consumes all the data in a single round - ;; trip. - (define import - `(begin - (use-modules (guix)) - - (with-store store - (setvbuf (current-input-port) _IONBF) - (import-paths store (current-input-port))))) - - (open-remote-output-pipe session - (string-join - `("guile" "-c" - ,(object->string - (object->string import)))))) - -(define (store-export-channel session files) - "Return an input port from which an export of FILES from SESSION's store can -be read." - ;; Same as above: this is more efficient than calling 'export-paths' on a - ;; remote store. - (define export - `(begin - (use-modules (guix)) - - (with-store store - (setvbuf (current-output-port) _IONBF) - (export-paths store ',files (current-output-port))))) - - (open-remote-input-pipe session - (string-join - `("guile" "-c" - ,(object->string - (object->string export)))))) - -(define (send-files files remote) - "Send the subset of FILES that's missing to REMOTE, a remote store." - (with-store store - ;; Compute the subset of FILES missing on SESSION, and send them in - ;; topologically sorted order so that they can actually be imported. - (let* ((sorted (topologically-sorted store files)) - (session (channel-get-session (nix-server-socket remote))) - (node (make-node session)) - (missing (node-eval node - `(begin - (use-modules (guix) - (srfi srfi-1) (srfi srfi-26)) - - (with-store store - (remove (cut valid-path? store <>) - ',sorted))))) - (port (store-import-channel session))) - (format #t (_ "sending ~a store files to '~a'...~%") - (length missing) (session-get session 'host)) - - (export-paths store missing port) - - ;; Tell the remote process that we're done. (In theory the - ;; end-of-archive mark of 'export-paths' would be enough, but in - ;; practice it's not.) - (channel-send-eof port) - - ;; Wait for completion of the remote process. - (let ((result (zero? (channel-get-exit-status port)))) - (close-port port) - result)))) - -(define (retrieve-files files remote) - "Retrieve FILES from SESSION's store, and import them." - (let* ((session (channel-get-session (nix-server-socket remote))) - (host (session-get session 'host)) - (port (store-export-channel session files))) - (format #t (_ "retrieving ~a files from '~a'...~%") - (length files) host) +(define (retrieve-files* files remote) + "Retrieve FILES from REMOTE and import them using 'restore-file-set'." + (let-values (((port count) + (file-retrieval-port files remote))) + (format #t (N_ "retrieving ~a store item from '~a'...~%" + "retrieving ~a store items from '~a'...~%" count) + count (remote-store-host remote)) ;; We cannot use the 'import-paths' RPC here because we already ;; hold the locks for FILES. @@ -489,37 +391,30 @@ be read." (define (machine-load machine) "Return the load of MACHINE, divided by the number of parallel builds -allowed on MACHINE." - (let* ((session (open-ssh-session machine)) - (pipe (open-remote-pipe* session OPEN_READ +allowed on MACHINE. Return +∞ if MACHINE is unreachable." + ;; Note: This procedure is costly since it creates a new SSH session. + (match (false-if-exception (open-ssh-session machine)) + ((? session? session) + (let* ((pipe (open-remote-pipe* session OPEN_READ "cat" "/proc/loadavg")) - (line (read-line pipe))) - (close-port pipe) - - (if (eof-object? line) - +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded - (match (string-tokenize line) - ((one five fifteen . _) - (let* ((raw (string->number five)) - (jobs (build-machine-parallel-builds machine)) - (normalized (/ raw jobs))) - (format (current-error-port) "load on machine '~a' is ~s\ + (line (read-line pipe))) + (close-port pipe) + + (if (eof-object? line) + +inf.0 ;MACHINE does not respond, so assume it is infinitely loaded + (match (string-tokenize line) + ((one five fifteen . _) + (let* ((raw (string->number five)) + (jobs (build-machine-parallel-builds machine)) + (normalized (/ raw jobs))) + (format (current-error-port) "load on machine '~a' is ~s\ (normalized: ~s)~%" - (build-machine-name machine) raw normalized) - normalized)) - (_ - +inf.0))))) ;something's fishy about MACHINE, so avoid it - -(define (machine-power-factor m) - "Return a factor that aggregates the speed and load of M. The higher the -better." - (/ (build-machine-speed m) - (+ 1 (machine-load m)))) - -(define (machine-less-loaded-or-faster? m1 m2) - "Return #t if M1 is either less loaded or faster than M2. (This relation -defines a total order on machines.)" - (> (machine-power-factor m1) (machine-power-factor m2))) + (build-machine-name machine) raw normalized) + normalized)) + (_ + +inf.0))))) ;something's fishy about MACHINE, so avoid it + (_ + +inf.0))) ;failed to connect to MACHINE, so avoid it (define (machine-lock-file machine hint) "Return the name of MACHINE's lock file for HINT." @@ -548,29 +443,39 @@ defines a total order on machines.)" ;; 5. Release the global machine-choice lock. (with-file-lock (machine-choice-lock-file) - (define machines+slots + (define machines+slots+loads (filter-map (lambda (machine) + ;; Call 'machine-load' from here to make sure it is called + ;; only once per machine (it is expensive). (let ((slot (acquire-build-slot machine))) - (and slot (list machine slot)))) + (and slot + (list machine slot (machine-load machine))))) machines)) (define (undecorate pred) (lambda (a b) (match a - ((machine1 slot1) + ((machine1 slot1 load1) (match b - ((machine2 slot2) - (pred machine1 machine2))))))) - - (let loop ((machines+slots - (sort machines+slots + ((machine2 slot2 load2) + (pred machine1 load1 machine2 load2))))))) + + (define (machine-less-loaded-or-faster? m1 l1 m2 l2) + ;; Return #t if M1 is either less loaded or faster than M2, with L1 + ;; being the load of M1 and L2 the load of M2. (This relation defines a + ;; total order on machines.) + (> (/ (build-machine-speed m1) (+ 1 l1)) + (/ (build-machine-speed m2) (+ 1 l2)))) + + (let loop ((machines+slots+loads + (sort machines+slots+loads (undecorate machine-less-loaded-or-faster?)))) - (match machines+slots - (((best slot) others ...) + (match machines+slots+loads + (((best slot load) others ...) ;; Return the best machine unless it's already overloaded. - (if (< (machine-load best) 2.) + (if (< load 2.) (match others - (((machines slots) ...) + (((machines slots loads) ...) ;; Release slots from the uninteresting machines. (for-each release-build-slot slots) @@ -618,6 +523,96 @@ defines a total order on machines.)" ;;; +;;; Installation tests. +;;; + +(define (assert-node-repl node name) + "Bail out if NODE is not running Guile." + (match (node-guile-version node) + (#f + (leave (_ "Guile could not be started on '~a'~%") + name)) + ((? string? version) + ;; Note: The version string already contains the word "Guile". + (info (_ "'~a' is running ~a~%") + name (node-guile-version node))))) + +(define (assert-node-has-guix node name) + "Bail out if NODE lacks the (guix) module, or if its daemon is not running." + (match (node-eval node + '(begin + (use-modules (guix)) + (with-store store + (add-text-to-store store "test" + "Hello, build machine!")))) + ((? string? str) + (info (_ "Guix is usable on '~a' (test returned ~s)~%") + name str)) + (x + (leave (_ "failed to use Guix module on '~a' (test returned ~s)~%") + name x)))) + +(define %random-state + (delay + (seed->random-state (logxor (getpid) (car (gettimeofday)))))) + +(define* (nonce #:optional (name (gethostname))) + (string-append name "-" + (number->string (random 1000000 (force %random-state))))) + +(define (assert-node-can-import node name daemon-socket) + "Bail out if NODE refuses to import our archives." + (let ((session (node-session node))) + (with-store store + (let* ((item (add-text-to-store store "export-test" (nonce))) + (remote (connect-to-remote-daemon session daemon-socket))) + (with-store local + (send-files local (list item) remote)) + + (if (valid-path? remote item) + (info (_ "'~a' successfully imported '~a'~%") + name item) + (leave (_ "'~a' was not properly imported on '~a'~%") + item name)))))) + +(define (assert-node-can-export node name daemon-socket) + "Bail out if we cannot import signed archives from NODE." + (let* ((session (node-session node)) + (remote (connect-to-remote-daemon session daemon-socket)) + (item (add-text-to-store remote "import-test" (nonce name)))) + (with-store store + (if (and (retrieve-files store (list item) remote) + (valid-path? store item)) + (info (_ "successfully imported '~a' from '~a'~%") + item name) + (leave (_ "failed to import '~a' from '~a'~%") + item name))))) + +(define (check-machine-availability machine-file pred) + "Check that each machine matching PRED in MACHINE-FILE is usable as a build +machine." + (define (build-machine=? m1 m2) + (and (string=? (build-machine-name m1) (build-machine-name m2)) + (= (build-machine-port m1) (build-machine-port m2)))) + + ;; A given build machine may appear several times (e.g., once for + ;; "x86_64-linux" and a second time for "i686-linux"); test them only once. + (let ((machines (filter pred + (delete-duplicates (build-machines machine-file) + build-machine=?)))) + (info (_ "testing ~a build machines defined in '~a'...~%") + (length machines) machine-file) + (let* ((names (map build-machine-name machines)) + (sockets (map build-machine-daemon-socket machines)) + (sessions (map open-ssh-session machines)) + (nodes (map make-node sessions))) + (for-each assert-node-repl nodes names) + (for-each assert-node-has-guix nodes names) + (for-each assert-node-can-import nodes names sockets) + (for-each assert-node-can-export nodes names sockets)))) + + +;;; ;;; Entry point. ;;; @@ -635,6 +630,12 @@ defines a total order on machines.)" (and=> (passwd:dir (getpw (getuid))) (cut setenv "HOME" <>)) + ;; We rely on protocol-level compression from libssh to optimize large data + ;; transfers. Warn if it's missing. + (unless (zlib-support?) + (warning (_ "Guile-SSH lacks zlib support")) + (warning (_ "data transfers will *not* be compressed!"))) + (match args ((system max-silent-time print-build-trace? build-timeout) (let ((max-silent-time (string->number max-silent-time)) @@ -660,6 +661,18 @@ defines a total order on machines.)" (else (leave (_ "invalid request line: ~s~%") line))) (loop (read-line))))))) + (("test" rest ...) + (with-error-handling + (let-values (((file pred) + (match rest + ((file regexp) + (values file + (compose (cut string-match regexp <>) + build-machine-name))) + ((file) (values file (const #t))) + (() (values %machine-file (const #t))) + (_ (leave (_ "wrong number of arguments~%")))))) + (check-machine-availability (or file %machine-file) pred)))) (("--version") (show-version-and-exit "guix offload")) (("--help") |