summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRicardo Wurmus <ricardo.wurmus@mdc-berlin.de>2015-12-03 16:12:09 +0100
committerRicardo Wurmus <ricardo.wurmus@mdc-berlin.de>2015-12-11 15:35:07 +0100
commit0f6b9e9828dfc269bfc4eade771efed1753e8c62 (patch)
tree90bd8ab26d32ff8c22bbf0a089690705f0486724
parentb6a222757bfebdbf3b907b39f1c3b42967aaa915 (diff)
import: cran: Parse DESCRIPTION instead of HTML.
* guix/import/cran.scm (description->alist, listify, beautify-description, description->package): New procedures. (table-datum, downloads->url, nodes->text, cran-sxml->sexp): Remove proceduces. (latest-release): Use parsed DESCRIPTION instead of SXML. * tests/cran.scm: Rewrite to match importer.
-rw-r--r--guix/import/cran.scm265
-rw-r--r--tests/cran.scm189
2 files changed, 209 insertions, 245 deletions
diff --git a/guix/import/cran.scm b/guix/import/cran.scm
index 43dc2c80b6..845ecb5832 100644
--- a/guix/import/cran.scm
+++ b/guix/import/cran.scm
@@ -20,26 +20,26 @@
(define-module (guix import cran)
#:use-module (ice-9 match)
#:use-module (ice-9 regex)
+ #:use-module ((ice-9 rdelim) #:select (read-string))
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-26)
- #:use-module (sxml simple)
- #:use-module (sxml match)
- #:use-module (sxml xpath)
#:use-module (guix http-client)
#:use-module (guix hash)
#:use-module (guix store)
#:use-module (guix base32)
#:use-module ((guix download) #:select (download-to-store))
#:use-module (guix import utils)
+ #:use-module ((guix build-system r) #:select (cran-uri))
#:use-module (guix upstream)
#:use-module (guix packages)
+ #:use-module (gnu packages)
#:export (cran->guix-package
%cran-updater))
;;; Commentary:
;;;
;;; Generate a package declaration template for the latest version of an R
-;;; package on CRAN, using the HTML description downloaded from
+;;; package on CRAN, using the DESCRIPTION file downloaded from
;;; cran.r-project.org.
;;;
;;; Code:
@@ -67,6 +67,31 @@
((lst ...) `(list ,@(map string->license lst)))
(_ #f)))
+
+(define (description->alist description)
+ "Convert a DESCRIPTION string into an alist."
+ (let ((lines (string-split description #\newline))
+ (parse (lambda (line acc)
+ (if (string-null? line) acc
+ ;; Keys usually start with a capital letter and end with
+ ;; ":". There are some exceptions, unfortunately (such
+ ;; as "biocViews"). There are no blanks in a key.
+ (if (string-match "^[A-Za-z][^ :]+:( |\n|$)" line)
+ ;; New key/value pair
+ (let* ((pos (string-index line #\:))
+ (key (string-take line pos))
+ (value (string-drop line (+ 1 pos))))
+ (cons (cons key
+ (string-trim-both value))
+ acc))
+ ;; This is a continuation of the previous pair
+ (match-let ((((key . value) . rest) acc))
+ (cons (cons key (string-join
+ (list value
+ (string-trim-both line))))
+ rest)))))))
+ (fold parse '() lines)))
+
(define (format-inputs names)
"Generate a sorted list of package inputs from a list of package NAMES."
(map (lambda (name)
@@ -82,125 +107,94 @@ package definition."
((package-inputs ...)
`((,type (,'quasiquote ,(format-inputs package-inputs)))))))
-(define (table-datum tree label)
- "Extract the datum node following a LABEL in the sxml table TREE. Only the
-first cell of a table row is considered a label cell."
- ((node-pos 1)
- ((sxpath `(xhtml:tr
- (xhtml:td 1) ; only first cell can contain label
- (equal? ,label)
- ,(node-parent tree) ; go up to label cell
- ,(node-parent tree) ; go up to matching row
- (xhtml:td 2))) ; select second cell
- tree)))
-
(define %cran-url "http://cran.r-project.org/web/packages/")
(define (cran-fetch name)
- "Return an sxml representation of the CRAN page for the R package NAME,
-or #f on failure. NAME is case-sensitive."
+ "Return an alist of the contents of the DESCRIPTION file for the R package
+NAME, or #f on failure. NAME is case-sensitive."
;; This API always returns the latest release of the module.
- (let ((cran-url (string-append %cran-url name "/")))
- (false-if-exception
- (xml->sxml (http-fetch cran-url)
- #:trim-whitespace? #t
- #:namespaces '((xhtml . "http://www.w3.org/1999/xhtml"))
- #:default-entity-handler
- (lambda (port name)
- (case name
- ((nbsp) " ")
- ((ge) ">=")
- ((gt) ">")
- ((lt) "<")
- (else
- (format (current-warning-port)
- "~a:~a:~a: undefined entitity: ~a\n"
- cran-url (port-line port) (port-column port)
- name)
- (symbol->string name))))))))
-
-(define (downloads->url downloads)
- "Extract from DOWNLOADS, the downloads item of the CRAN sxml tree, the
-download URL."
- (string-append "mirror://cran/"
- ;; Remove double dots, because we want an
- ;; absolute path.
- (regexp-substitute/global
- #f "\\.\\./"
- (string-join ((sxpath '((xhtml:a 1) @ href *text*))
- (table-datum downloads " Package source: ")))
- 'pre 'post)))
-
-(define (nodes->text nodeset)
- "Return the concatenation of the text nodes among NODESET."
- (string-join ((sxpath '(// *text*)) nodeset) " "))
-
-(define (cran-sxml->sexp sxml)
- "Return the `package' s-expression for a CRAN package from the SXML
-representation of the package page."
+ (let ((url (string-append %cran-url name "/DESCRIPTION")))
+ (description->alist (read-string (http-fetch url)))))
+
+(define (listify meta field)
+ "Look up FIELD in the alist META. If FIELD contains a comma-separated
+string, turn it into a list and strip off parenthetic expressions. Return the
+empty list when the FIELD cannot be found."
+ (let ((value (assoc-ref meta field)))
+ (if (not value)
+ '()
+ ;; Strip off parentheses
+ (let ((items (string-split (regexp-substitute/global
+ #f "( *\\([^\\)]+\\)) *"
+ value 'pre 'post)
+ #\,)))
+ ;; When there is whitespace inside of items it is probably because
+ ;; this was not an actual list to begin with.
+ (remove (cut string-any char-set:whitespace <>)
+ (map string-trim-both items))))))
+
+(define (beautify-description description)
+ "Improve the package DESCRIPTION by turning a beginning sentence fragment
+into a proper sentence and by using two spaces between sentences."
+ (let ((cleaned (if (string-prefix? "A " description)
+ (string-append "This package provides a"
+ (substring description 1))
+ description)))
+ ;; Use double spacing between sentences
+ (regexp-substitute/global #f "\\. \\b"
+ cleaned 'pre ". " 'post)))
+
+(define (description->package meta)
+ "Return the `package' s-expression for a CRAN package from the alist META,
+which was derived from the R package's DESCRIPTION file."
(define (guix-name name)
(if (string-prefix? "r-" name)
(string-downcase name)
(string-append "r-" (string-downcase name))))
- (sxml-match-let*
- (((*TOP* (xhtml:html
- ,head
- (xhtml:body
- (xhtml:h2 ,name-and-synopsis)
- (xhtml:p ,description)
- ,summary
- (xhtml:h4 "Downloads:") ,downloads
- . ,rest)))
- sxml))
- (let* ((name (match:prefix (string-match ": " name-and-synopsis)))
- (synopsis (match:suffix (string-match ": " name-and-synopsis)))
- (version (nodes->text (table-datum summary "Version:")))
- (license ((compose string->license nodes->text)
- (table-datum summary "License:")))
- (home-page (nodes->text ((sxpath '((xhtml:a 1)))
- (table-datum summary "URL:"))))
- (source-url (downloads->url downloads))
- (tarball (with-store store (download-to-store store source-url)))
- (sysdepends (map match:substring
- (list-matches
- "[^ ]+"
- ;; Strip off comma and parenthetical
- ;; expressions.
- (regexp-substitute/global
- #f "(,|\\([^\\)]+\\))"
- (nodes->text (table-datum summary
- "SystemRequirements:"))
- 'pre 'post))))
- (imports (map guix-name
- ((sxpath '(// xhtml:a *text*))
- (table-datum summary "Imports:")))))
- `(package
- (name ,(guix-name name))
- (version ,version)
- (source (origin
- (method url-fetch)
- (uri (cran-uri ,name version))
- (sha256
- (base32
- ,(bytevector->nix-base32-string (file-sha256 tarball))))))
- (build-system r-build-system)
- ,@(maybe-inputs sysdepends)
- ,@(maybe-inputs imports 'propagated-inputs)
- (home-page ,(if (string-null? home-page)
- (string-append %cran-url name)
- home-page))
- (synopsis ,synopsis)
- ;; Use double spacing
- (description ,(regexp-substitute/global #f "\\. \\b" description
- 'pre ". " 'post))
- (license ,license)))))
+ (let* ((name (assoc-ref meta "Package"))
+ (synopsis (assoc-ref meta "Title"))
+ (version (assoc-ref meta "Version"))
+ (license (string->license (assoc-ref meta "License")))
+ ;; Some packages have multiple home pages. Some have none.
+ (home-page (match (listify meta "URL")
+ ((url rest ...) url)
+ (_ (string-append %cran-url name))))
+ (source-url (match (cran-uri name version)
+ ((url rest ...) url)
+ (_ #f)))
+ (tarball (with-store store (download-to-store store source-url)))
+ (sysdepends (map string-downcase (listify meta "SystemRequirements")))
+ (propagate (map guix-name (lset-union equal?
+ (listify meta "Imports")
+ (listify meta "LinkingTo")
+ (delete "R"
+ (listify meta "Depends"))))))
+ `(package
+ (name ,(guix-name name))
+ (version ,version)
+ (source (origin
+ (method url-fetch)
+ (uri (cran-uri ,name version))
+ (sha256
+ (base32
+ ,(bytevector->nix-base32-string (file-sha256 tarball))))))
+ (properties ,`(,'quasiquote ((,'upstream-name . ,name))))
+ (build-system r-build-system)
+ ,@(maybe-inputs sysdepends)
+ ,@(maybe-inputs propagate 'propagated-inputs)
+ (home-page ,(if (string-null? home-page)
+ (string-append %cran-url name)
+ home-page))
+ (synopsis ,synopsis)
+ (description ,(beautify-description (assoc-ref meta "Description")))
+ (license ,license))))
(define (cran->guix-package package-name)
"Fetch the metadata for PACKAGE-NAME from cran.r-project.org, and return the
`package' s-expression corresponding to that package, or #f on failure."
(let ((module-meta (cran-fetch package-name)))
- (and=> module-meta cran-sxml->sexp)))
+ (and=> module-meta description->package)))
;;;
@@ -209,32 +203,33 @@ representation of the package page."
(define (latest-release package)
"Return an <upstream-source> for the latest release of PACKAGE."
- (define name
- (if (string-prefix? "r-" package)
- (string-drop package 2)
- package))
-
- (define sxml
- (cran-fetch name))
-
- (and sxml
- (sxml-match-let*
- (((*TOP* (xhtml:html
- ,head
- (xhtml:body
- (xhtml:h2 ,name-and-synopsis)
- (xhtml:p ,description)
- ,summary
- (xhtml:h4 "Downloads:") ,downloads
- . ,rest)))
- sxml))
- (let ((version (nodes->text (table-datum summary "Version:")))
- (url (downloads->url downloads)))
- ;; CRAN does not provide signatures.
- (upstream-source
- (package package)
- (version version)
- (urls (list url)))))))
+
+ (define (package->cran-name package)
+ (match (package-source package)
+ ((? origin? origin)
+ (match (origin-uri origin)
+ ((url rest ...)
+ (let ((end (string-rindex url #\_))
+ (start (string-rindex url #\/)))
+ ;; The URL ends on
+ ;; (string-append "/" name "_" version ".tar.gz")
+ (substring url start end)))
+ (_ #f)))
+ (_ #f)))
+
+ (define cran-name
+ (package->cran-name (specification->package package)))
+
+ (define meta
+ (cran-fetch cran-name))
+
+ (and meta
+ (let ((version (assoc-ref meta "Version")))
+ ;; CRAN does not provide signatures.
+ (upstream-source
+ (package package)
+ (version version)
+ (urls (cran-uri cran-name version))))))
(define (cran-package? package)
"Return true if PACKAGE is an R package from CRAN."
diff --git a/tests/cran.scm b/tests/cran.scm
index ba5699a133..0a4a2fdd8f 100644
--- a/tests/cran.scm
+++ b/tests/cran.scm
@@ -19,120 +19,84 @@
(define-module (test-cran)
#:use-module (guix import cran)
#:use-module (guix tests)
+ #:use-module (srfi srfi-1)
#:use-module (srfi srfi-64)
+ #:use-module (srfi srfi-26)
#:use-module (ice-9 match))
-(define sxml
- '(*TOP* (xhtml:html
- (xhtml:head
- (xhtml:title "CRAN - Package my-example-sxml"))
- (xhtml:body
- (xhtml:h2 "my-example-sxml: Short description")
- (xhtml:p "Long description")
- (xhtml:table
- (@ (summary "Package my-example-sxml summary"))
- (xhtml:tr
- (xhtml:td "Version:")
- (xhtml:td "1.2.3"))
- (xhtml:tr
- (xhtml:td "Depends:")
- (xhtml:td "R (>= 3.1.0)"))
- (xhtml:tr
- (xhtml:td "SystemRequirements:")
- (xhtml:td "cairo (>= 1.2 http://www.cairographics.org/)"))
- (xhtml:tr
- (xhtml:td "Imports:")
- (xhtml:td
- (xhtml:a (@ (href "../scales/index.html"))
- "scales")
- " (>= 0.2.3), "
- (xhtml:a (@ (href "../proto/index.html"))
- "proto")
- ", "
- (xhtml:a (@ (href "../Rcpp/index.html")) "Rcpp")
- " (>= 0.11.0)"))
- (xhtml:tr
- (xhtml:td "Suggests:")
- (xhtml:td
- (xhtml:a (@ (href "../some/index.html"))
- "some")
- ", "
- (xhtml:a (@ (href "../suggestions/index.html"))
- "suggestions")))
- (xhtml:tr
- (xhtml:td "License:")
- (xhtml:td
- (xhtml:a (@ (href "../../licenses/MIT")) "MIT")))
- (xhtml:tr
- (xhtml:td "URL:")
- (xhtml:td
- (xhtml:a (@ (href "http://gnu.org/s/my-example-sxml"))
- "http://gnu.org/s/my-example-sxml")
- ", "
- (xhtml:a (@ (href "http://alternative/home/page"))
- "http://alternative/home/page"))))
- (xhtml:h4 "Downloads:")
- (xhtml:table
- (@ (summary "Package my-example-sxml downloads"))
- (xhtml:tr
- (xhtml:td " Reference manual: ")
- (xhtml:td
- (xhtml:a (@ (href "my-example-sxml.pdf"))
- " my-example-sxml.pdf ")))
- (xhtml:tr
- (xhtml:td " Package source: ")
- (xhtml:td
- (xhtml:a
- (@ (href "../../../src/contrib/my-example-sxml_1.2.3.tar.gz"))
- " my-example-sxml_1.2.3.tar.gz "))))
- (xhtml:h4 "Reverse dependencies:")
- (xhtml:table
- (@ (summary "Package my-example-sxml reverse dependencies"))
- (xhtml:tr
- (xhtml:td "Reverse depends:")
- (xhtml:td "Too many."))
- (xhtml:tr
- (xhtml:td "Reverse imports:")
- (xhtml:td "Likewise."))
- (xhtml:tr
- (xhtml:td "Reverse suggests:")
- (xhtml:td "Uncountable.")))))))
+(define description "
+Package: My-Example
+Type: Package
+Title: Example package
+Version: 1.2.3
+Date: 2015-12-10
+Author: Ricardo Wurmus
+Maintainer: Guix Schmeeks <guix@gnu.org>
+URL: http://gnu.org/s/my-example
+Description: This is a long description
+spanning multiple lines: and it could confuse the parser that
+there is a colon : on the lines.
+ And: this line continues the description.
+biocViews: 0
+SystemRequirements: Cairo (>= 0)
+Depends: A C++11 compiler. Version 4.6.* of g++ (as
+ currently in Rtools) is insufficient; versions 4.8.*, 4.9.* or
+ later will be fine.
+License: GPL (>= 3)
+Imports: Rcpp (>= 0.11.5), proto, Scales
+LinkingTo: Rcpp, BH
+NeedsCompilation: yes
+Repository: CRAN
+Date/Publication: 2015-07-14 14:15:16
+")
-(define simple-table
- '(xhtml:table
- (xhtml:tr
- (xhtml:td "Numbers")
- (xhtml:td "123"))
- (xhtml:tr
- (@ (class "whatever"))
- (xhtml:td (@ (class "unimportant")) "Letters")
- (xhtml:td "abc"))
- (xhtml:tr
- (xhtml:td "Letters")
- (xhtml:td "xyz"))
- (xhtml:tr
- (xhtml:td "Single"))
- (xhtml:tr
- (xhtml:td "not a value")
- (xhtml:td "not a label")
- (xhtml:td "also not a label"))))
+(define description-alist
+ ((@@ (guix import cran) description->alist) description))
+
+(define simple-alist
+ '(("Key" . "Value")
+ ("SimpleList" . "R, Rcpp, something, whatever")
+ ("BadList" . "This is not a real list, you know?")
+ ("List" . "R (>= 2.2), BH (for no reason), GenomicRanges")))
(test-begin "cran")
-(test-equal "table-datum: return list of first table cell matching label"
- '((xhtml:td "abc"))
- ((@@ (guix import cran) table-datum) simple-table "Letters"))
+(test-assert "description->alist: contains all valid keys"
+ (let ((keys '("Package" "Type" "Title" "Version" "Date"
+ "Author" "Maintainer" "URL" "Description"
+ "SystemRequirements" "Depends" "License"
+ "Imports" "biocViews" "LinkingTo"
+ "NeedsCompilation" "Repository"
+ "Date/Publication")))
+ (lset= string=? keys (map car description-alist))))
-(test-equal "table-datum: return empty list if no match"
+(test-equal "listify: return empty list if key cannot be found"
'()
- ((@@ (guix import cran) table-datum) simple-table "Astronauts"))
+ ((@@ (guix import cran) listify) simple-alist "Letters"))
+
+(test-equal "listify: split comma-separated value into elements"
+ '("R" "Rcpp" "something" "whatever")
+ ((@@ (guix import cran) listify) simple-alist "SimpleList"))
-(test-equal "table-datum: only consider the first cell as a label cell"
+(test-equal "listify: strip off parentheses"
+ '("R" "BH" "GenomicRanges")
+ ((@@ (guix import cran) listify) simple-alist "List"))
+
+(test-equal "listify: ignore values that are no lists"
'()
- ((@@ (guix import cran) table-datum) simple-table "not a label"))
+ ((@@ (guix import cran) listify) simple-alist "BadList"))
+
+(test-equal "beautify-description: use double spacing"
+ "This is a package. It is great. Trust me Mr. Hendrix."
+ ((@@ (guix import cran) beautify-description)
+ "This is a package. It is great. Trust me Mr. Hendrix."))
+(test-equal "beautify-description: transform fragment into sentence"
+ "This package provides a function to establish world peace"
+ ((@@ (guix import cran) beautify-description)
+ "A function to establish world peace"))
-(test-assert "cran-sxml->sexp"
+(test-assert "description->package"
;; Replace network resources with sample data.
(mock ((guix build download) url-fetch
(lambda* (url file-name #:key (mirrors '()))
@@ -140,32 +104,37 @@
(lambda ()
(display
(match url
- ("mirror://cran/src/contrib/my-example-sxml_1.2.3.tar.gz"
+ ("mirror://cran/src/contrib/My-Example_1.2.3.tar.gz"
"source")
(_ (error "Unexpected URL: " url))))))))
- (match ((@@ (guix import cran) cran-sxml->sexp) sxml)
+ (match ((@@ (guix import cran) description->package) description-alist)
(('package
- ('name "r-my-example-sxml")
+ ('name "r-my-example")
('version "1.2.3")
('source ('origin
('method 'url-fetch)
- ('uri ('cran-uri "my-example-sxml" 'version))
+ ('uri ('cran-uri "My-Example" 'version))
('sha256
('base32
(? string? hash)))))
+ ('properties ('quasiquote (('upstream-name . "My-Example"))))
('build-system 'r-build-system)
('inputs
('quasiquote
(("cairo" ('unquote 'cairo)))))
('propagated-inputs
('quasiquote
- (("r-proto" ('unquote 'r-proto))
+ (("r-bh" ('unquote 'r-bh))
+ ("r-proto" ('unquote 'r-proto))
("r-rcpp" ('unquote 'r-rcpp))
("r-scales" ('unquote 'r-scales)))))
- ('home-page "http://gnu.org/s/my-example-sxml")
- ('synopsis "Short description")
- ('description "Long description")
- ('license 'x11)))
+ ('home-page "http://gnu.org/s/my-example")
+ ('synopsis "Example package")
+ ('description
+ "This is a long description spanning multiple lines: \
+and it could confuse the parser that there is a colon : on the \
+lines. And: this line continues the description.")
+ ('license 'gpl3+)))
(x
(begin
(format #t "~s\n" x)