From 32ffbb16e8cbb9c5416274320a56885c45a88ebf Mon Sep 17 00:00:00 2001
From: Maxim Cournoyer
Date: Mon, 2 May 2022 00:39:09 -0400
Subject: gnu: Add python-html-text.

* gnu/packages/python-web.scm (python-html-text): New variable.
---
 gnu/packages/python-web.scm | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/gnu/packages/python-web.scm b/gnu/packages/python-web.scm
index cb52db3bbd..00fe560a36 100644
--- a/gnu/packages/python-web.scm
+++ b/gnu/packages/python-web.scm
@@ -7387,3 +7387,27 @@ mining to monitoring and automated testing.")
 Contrary to the standard Python @code{json} library, it understands js-style
 comments.  Trailing comma is also supported.")
     (license license:expat)))
+
+(define-public python-html-text
+  (package
+    (name "python-html-text")
+    (version "0.5.2")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "html_text" version))
+       (sha256
+        (base32 "1v9x171l3bmyayc1144nrkn9410lp4lhlrrjii54j7b5f2xipmmg"))))
+    (build-system python-build-system)
+    ;; Build-time only input; presumably used to run the test suite.
+    (native-inputs (list python-pytest))
+    ;; Propagated so that lxml gets installed alongside this package.
+    (propagated-inputs (list python-lxml))
+    (home-page "https://github.com/TeamHG-Memex/html-text")
+    (synopsis "Extract text from HTML")
+    (description "HTML to Text is a Python library for extracting text from
+HTML.  Contrary to other solutions such as LXML or Beautiful Soup, the text
+extracted with @code{html_text} does not contain elements such as JavaScript
+or inline styles not normally visible to users.  It also normalizes white
+space characters in a smarter, more visually pleasing style.")
+    (license license:expat)))
-- 
cgit v1.2.3