diff options
author | Maxim Cournoyer <maxim.cournoyer@gmail.com> | 2022-05-03 16:05:36 -0400 |
---|---|---|
committer | Maxim Cournoyer <maxim.cournoyer@gmail.com> | 2022-05-31 14:52:33 -0400 |
commit | 4820a23521727091d0085e5b381aa5c44ebd2ecb (patch) | |
tree | e32659de5d2965cff077d3664aa2a9d143478e2a | |
parent | 0999af5b42540f2d5f4b52c65a7e350f071d2f3c (diff) |
gnu: Add python-extruct.
* gnu/packages/python-web.scm (python-extruct): New variable.
-rw-r--r-- | gnu/packages/python-web.scm | 47 |
1 files changed, 47 insertions, 0 deletions
```diff
diff --git a/gnu/packages/python-web.scm b/gnu/packages/python-web.scm
index 59828d7473..427994e22b 100644
--- a/gnu/packages/python-web.scm
+++ b/gnu/packages/python-web.scm
@@ -97,6 +97,7 @@
   #:use-module (gnu packages python-science)
   #:use-module (gnu packages python-xyz)
   #:use-module (gnu packages qt)
+  #:use-module (gnu packages rdf)
   #:use-module (gnu packages rpc)
   #:use-module (gnu packages serialization)
   #:use-module (gnu packages sphinx)
@@ -7441,3 +7442,49 @@ characters in a smarter, more visually pleasing style.")
 implementing the full Microformats2 (mf2) specification, including backward
 compatibility with Microformats1 (mf1).")
     (license license:expat)))
+
+(define-public python-extruct
+  (package
+    (name "python-extruct")
+    (version "0.13.0")
+    (source (origin
+              (method git-fetch)        ;for tests
+              (uri (git-reference
+                    (url "https://github.com/scrapinghub/extruct")
+                    (commit (string-append "v" version))))
+              (file-name (git-file-name name version))
+              (sha256
+               (base32
+                "075zldf3dqcc429z1vk2ngbmv034bnlyk6arh3rh30jbsvz9pzl5"))))
+    (build-system python-build-system)
+    (arguments
+     (list
+      #:phases
+      #~(modify-phases %standard-phases
+          (replace 'check
+            (lambda* (#:key tests? #:allow-other-keys)
+              (when tests?
+                (invoke "pytest" "-vv" "tests")))))))
+    (native-inputs (list python-pytest))
+    (propagated-inputs
+     (list python-html-text
+           python-jstyleson
+           python-lxml
+           python-mf2py
+           python-pyrdfa3
+           python-rdflib
+           python-rdflib-jsonld
+           python-w3lib))
+    (home-page "https://github.com/scrapinghub/extruct")
+    (synopsis "Extract embedded metadata from HTML markup")
+    (description "@code{extruct} is a Python library for extracting embedded
+metadata from HTML markup. Currently, extruct supports:
+@itemize
+@item W3C's HTML Microdata
+@item embedded JSON-LD
+@item Microformat via mf2py
+@item Facebook's Open Graph
+@item (experimental) RDFa via rdflib
+@item Dublin Core Metadata (DC-HTML-2003)
+@end itemize")
+    (license license:bsd-3)))
```