summaryrefslogtreecommitdiff
path: root/guix
diff options
context:
space:
mode:
author: Ludovic Courtès <ludo@gnu.org> 2021-11-13 21:47:15 +0100
committer: Ludovic Courtès <ludo@gnu.org> 2021-11-16 14:34:28 +0100
commit: 472a0e82a52a3d5d841e1dfad6b13e26082a5750 (patch)
tree: f159bd736c66d9cd7f23188fd4110727b0787637 /guix
parent: f39397b21041fe418247239f27473aff49a203c9 (diff)
daemon: Do not deduplicate files smaller than 8 KiB.
Files smaller than 8 KiB typically represent ~70% of the entries in
/gnu/store/.links but only contribute to ~4% of the space savings
afforded by deduplication.

Not considering these files for deduplication speeds up file insertion
in the store and, more importantly, leaves 'removeUnusedLinks' with
fewer entries to traverse, thereby speeding it up proportionally.

Partly fixes <https://issues.guix.gnu.org/24937>.

* config-daemon.ac: Remove symlink hard link check and CAN_LINK_SYMLINK
definition.
* guix/store/deduplication.scm (%deduplication-minimum-size): New
variable.
(deduplicate)[loop]: Do not recurse when FILE's size is below
%DEDUPLICATION-MINIMUM-SIZE.
(dump-port): New procedure.
(dump-file/deduplicate)[hash]: Turn into...
[dump-and-compute-hash]: ... this thunk. Call 'deduplicate' only when
SIZE is greater than or equal to %DEDUPLICATION-MINIMUM-SIZE; otherwise
call 'dump-port'.
* nix/libstore/gc.cc (LocalStore::removeUnusedLinks): Drop files where
st.st_size < deduplicationMinSize.
* nix/libstore/local-store.hh (deduplicationMinSize): New declaration.
* nix/libstore/optimise-store.cc (deduplicationMinSize): New variable.
(LocalStore::optimisePath_): Return when PATH is a symlink or smaller
than 'deduplicationMinSize'.
* tests/derivations.scm ("identical files are deduplicated"): Produce
files bigger than %DEDUPLICATION-MINIMUM-SIZE.
* tests/nar.scm ("restore-file-set with directories (signed, valid)"):
Likewise.
* tests/store-deduplication.scm ("deduplicate, below
%deduplication-minimum-size"): New test.
("deduplicate", "deduplicate, ENOSPC"): Produce files bigger than
%DEDUPLICATION-MINIMUM-SIZE.
* tests/store.scm ("substitute, deduplication"): Likewise.
Diffstat (limited to 'guix')
-rw-r--r-- guix/store/deduplication.scm 69
1 files changed, 60 insertions, 9 deletions
diff --git a/guix/store/deduplication.scm b/guix/store/deduplication.scm
index cd9660174c..370df4a74c 100644
--- a/guix/store/deduplication.scm
+++ b/guix/store/deduplication.scm
@@ -1,6 +1,6 @@
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2017 Caleb Ristvedt <caleb.ristvedt@cune.org>
-;;; Copyright © 2018, 2019, 2020 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2018-2021 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
@@ -22,12 +22,13 @@
(define-module (guix store deduplication)
#:use-module (gcrypt hash)
- #:use-module (guix build utils)
+ #:use-module ((guix build utils) #:hide (dump-port))
#:use-module (guix build syscalls)
#:use-module (guix base32)
#:use-module (srfi srfi-11)
#:use-module (srfi srfi-34)
#:use-module (srfi srfi-35)
+ #:use-module (rnrs bytevectors)
#:use-module (rnrs io ports)
#:use-module (ice-9 ftw)
#:use-module (ice-9 match)
@@ -37,6 +38,31 @@
dump-file/deduplicate
copy-file/deduplicate))
+;; TODO: Remove once 'dump-port' in (guix build utils) has an optional 'len'
+;; parameter.
+(define* (dump-port in out
+ #:optional len
+ #:key (buffer-size 16384))
+ "Read LEN bytes from IN (or as much as possible if LEN is #f) and write it
+to OUT, using chunks of BUFFER-SIZE bytes."
+ (define buffer ; scratch buffer, reused for every chunk
+ (make-bytevector buffer-size))
+
+ (let loop ((total 0) ; TOTAL: number of bytes written to OUT so far
+ (bytes (get-bytevector-n! in buffer 0 ; BYTES: size of the chunk just read, or EOF
+ (if len
+ (min len buffer-size) ; never ask for more than LEN bytes
+ buffer-size))))
+ (or (eof-object? bytes) ; stop: IN is exhausted
+ (and len (= total len)) ; stop: exactly LEN bytes were copied
+ (let ((total (+ total bytes)))
+ (put-bytevector out buffer 0 bytes)
+ (loop total
+ (get-bytevector-n! in buffer 0
+ (if len
+ (min (- len total) buffer-size) ; bytes still to copy, capped at one chunk
+ buffer-size)))))))
+
(define (nar-sha256 file)
"Gives the sha256 hash of a file and the size of the file in nar form."
(let-values (((port get-hash) (open-sha256-port)))
@@ -127,11 +153,27 @@ Note: TARGET, TO-REPLACE, and SWAP-DIRECTORY must be on the same file system."
(unless (= EMLINK (system-error-errno args))
(apply throw args)))))))
+(define %deduplication-minimum-size
+ ;; Size, in bytes, below which files are not deduplicated. This avoids adding
+ ;; too many entries to '.links', which would slow down 'removeUnusedLinks'
+ ;; while saving little space. Keep in sync with nix/libstore/optimise-store.cc.
+ 8192) ; 8 KiB
+
(define* (deduplicate path hash #:key (store (%store-directory)))
"Check if a store item with sha256 hash HASH already exists. If so,
replace PATH with a hardlink to the already-existing one. If not, register
PATH so that future duplicates can hardlink to it. PATH is assumed to be
under STORE."
+ ;; Lightweight promises.
+ (define-syntax-rule (delay exp)
+ (let ((value #f))
+ (lambda ()
+ (unless value
+ (set! value exp))
+ value)))
+ (define-syntax-rule (force promise)
+ (promise))
+
(define links-directory
(string-append store "/.links"))
@@ -144,13 +186,18 @@ under STORE."
((file . properties)
(unless (member file '("." ".."))
(let* ((file (string-append path "/" file))
+ (st (delay (lstat file)))
(type (match (assoc-ref properties 'type)
((or 'unknown #f)
- (stat:type (lstat file)))
+ (stat:type (force st)))
(type type))))
- (loop file type
- (and (not (eq? 'directory type))
- (nar-sha256 file)))))))
+ (when (or (eq? 'directory type)
+ (and (eq? 'regular type)
+ (>= (stat:size (force st))
+ %deduplication-minimum-size)))
+ (loop file type
+ (and (not (eq? 'directory type))
+ (nar-sha256 file))))))))
(scandir* path))
(let ((link-file (string-append links-directory "/"
(bytevector->nix-base32-string hash))))
@@ -222,9 +269,9 @@ OUTPUT as it goes."
This procedure is suitable as a #:dump-file argument to 'restore-file'. When
used that way, it deduplicates files on the fly as they are restored, thereby
-removing the need to a deduplication pass that would re-read all the files
+removing the need for a deduplication pass that would re-read all the files
down the road."
- (define hash
+ (define (dump-and-compute-hash)
(call-with-output-file file
(lambda (output)
(let-values (((hash-port get-hash)
@@ -236,7 +283,11 @@ down the road."
(close-port hash-port)
(get-hash)))))
- (deduplicate file hash #:store store))
+ (if (>= size %deduplication-minimum-size)
+ (deduplicate file (dump-and-compute-hash) #:store store)
+ (call-with-output-file file
+ (lambda (output)
+ (dump-port input output size)))))
(define* (copy-file/deduplicate source target
#:key (store (%store-directory)))