From d797a0d02c4328108895aa4a327a7d9c85ca449d Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 28 Dec 2022 17:01:05 +0100
Subject: [PATCH] Add module (geode packages encoding) containing various projects used to process input files

---
 geode/packages/encoding.scm | 112 ++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 geode/packages/encoding.scm

diff --git a/geode/packages/encoding.scm b/geode/packages/encoding.scm
new file mode 100644
index 0000000..04bcfa8
--- /dev/null
+++ b/geode/packages/encoding.scm
@@ -0,0 +1,112 @@
+;;; Guix packages for the tools used to process input files: the Haskell
+;;; libraries ghc-html5-entity and ghc-xmlfilter, the ProcessingLGE scripts and
+;;; the soprano ALTO text extractor.
+
+(define-module (geode packages encoding)
+  #:use-module ((gnu packages compression) #:select (unzip zip))
+  #:use-module ((gnu packages haskell-xyz) #:select (ghc-attoparsec
+                                                     ghc-edit-distance
+                                                     ghc-optparse-applicative
+                                                     ghc-pipes
+                                                     ghc-roman-numerals ; input of soprano
+                                                     ghc-xml))
+  #:use-module ((guix build-system gnu) #:select (gnu-build-system))
+  #:use-module ((guix build-system haskell) #:select (hackage-uri haskell-build-system))
+  #:use-module ((guix download) #:select (url-fetch))
+  #:use-module ((guix git-download) #:select (git-fetch git-reference))
+  #:use-module ((guix licenses) #:select (bsd-3))
+  #:use-module ((guix packages) #:select (package origin base32)))
+
+;; Library for looking up and validating HTML5 entities, fetched from Hackage.
+(define-public ghc-html5-entity
+  (package
+    (name "ghc-html5-entity")
+    (version "0.2.0.3")
+    (source (origin
+              (method url-fetch)
+              (uri (hackage-uri "html5-entity" version))
+              (sha256
+               (base32
+                "0bmmzshxanzw5y2y0hvgzz9yw18jqgv535i1xq2a5lf7w8wpj1if"))))
+    (build-system haskell-build-system)
+    (home-page "https://github.com/zudov/html5-entity/")
+    (synopsis "A library for looking up and validating HTML5 entities.")
+    (description
+     "This package provides a library for looking up and validating HTML5 entities.
+ The @url{http://html.spec.whatwg.org/multipage/entities.json, following} document is
+ used as an authoritative source of the valid entity names and their
+ corresponding codepoints. You can think of this library as about bindings to
+ the data from that file. For usage see the Text.Html5.Entity module.")
+    (license bsd-3)))
+
+;; Library reading XML-like documents as streams of markup events.
+(define-public ghc-xmlfilter
+  (package
+    (name "ghc-xmlfilter")
+    (version "0.3.1.0")
+    ;; The Git URL in the source field below reuses home-page.
+    (home-page "https://gitlab.huma-num.fr/alicebrenon/XMLFilter")
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference (url home-page) (commit version)))
+              (sha256
+               (base32
+                "1prsrhpzqx7wcbvzvmc4845f03bvxfazn3v2kfajnr1fnrcbb8nf"))))
+    (build-system haskell-build-system)
+    (inputs (list ghc-attoparsec))
+    (synopsis "SAX-based library to handle XML-like documents")
+    (description
+     "Reads the files as streams of markup events to allow altering the file
+ content on the fly and fixing broken files which aren't proper XML")
+    (license bsd-3)))
+
+;; Scripts and data for project DISCO.  Propagates soprano (defined later in
+;; this module) together with unzip and zip.
+(define-public processing-lge
+  (package
+    (name "ProcessingLGE")
+    (version "0.1.0.0")
+    (home-page "https://gitlab.huma-num.fr/disco-lge/processinglge")
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference (url home-page) (commit version)))
+              (sha256
+               (base32
+                "0hzlcwy2zdzgy5a2il19zk24159lwpgnbv4cqbi8jpxv5jwb0sww"))))
+    (build-system gnu-build-system)
+    (arguments
+     ;; The check phase is removed from the standard build phases.
+     '(#:phases (modify-phases %standard-phases
+                  (delete 'check))))
+    (propagated-inputs (list soprano unzip zip))
+    (synopsis "Scripts and data for project DISCO")
+    (description
+     "Scripts to process La Grande Encyclopédie and suggestions of filters to
+ clean the ALTO files.")
+    (license bsd-3)))
+
+;; Tool extracting textual content from ALTO files.
+(define-public soprano
+  (package
+    (name "soprano")
+    (version "0.2.0.1")
+    (home-page "https://gitlab.huma-num.fr/disco-lge/soprano")
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference (url home-page) (commit version)))
+              (sha256
+               (base32 "17njp16nn2lf7xldkdbkhm7p7vji61pmli3fsdbk9azys1k2yhvy"))))
+    (build-system haskell-build-system)
+    (inputs (list ghc-edit-distance ghc-pipes ghc-roman-numerals ghc-xml
+                  ghc-optparse-applicative))
+    (synopsis "A tool to extract textual content from ALTO files")
+    (description
+     "Soprano is a tool developed for project GÉODE
+(@url{https://geode-project.github.io/}) which studies the geographic discourse
+in encyclopedias. It was designed to extract encyclopedia articles from OCRed
+pages represented by a set of ALTO files.
+
+It lets one apply various filtering in the process, for instance specifying the
+type of ALTO blocks to retain or setting a quality threshold on OCR recognition.
+Articles can be output in raw text or encoded in XML-TEI.")
+    (license bsd-3)))
-- 
GitLab