From d797a0d02c4328108895aa4a327a7d9c85ca449d Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Wed, 28 Dec 2022 17:01:05 +0100
Subject: [PATCH] Add module (geode packages encoding) containing various
 projects used to process input files

---
 geode/packages/encoding.scm | 100 ++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 geode/packages/encoding.scm

diff --git a/geode/packages/encoding.scm b/geode/packages/encoding.scm
new file mode 100644
index 0000000..04bcfa8
--- /dev/null
+++ b/geode/packages/encoding.scm
@@ -0,0 +1,100 @@
+;; Each import is restricted with #:select to the bindings the packages below use.
+(define-module (geode packages encoding)
+  #:use-module ((gnu packages compression) #:select (unzip zip))
+  #:use-module ((gnu packages haskell-xyz)
+                #:select (ghc-attoparsec ghc-edit-distance
+                          ghc-optparse-applicative ghc-pipes
+                          ghc-roman-numerals ghc-xml))
+  #:use-module ((guix build-system gnu) #:select (gnu-build-system))
+  #:use-module ((guix build-system haskell) #:select (hackage-uri haskell-build-system))
+  #:use-module ((guix download) #:select (url-fetch))
+  #:use-module ((guix git-download) #:select (git-fetch git-reference))
+  #:use-module ((guix licenses) #:select (bsd-3))
+  #:use-module ((guix packages) #:select (package origin base32)))
+
+;; Haskell library of HTML5 entity names, fetched from Hackage.
+(define-public ghc-html5-entity
+  (package
+    (name "ghc-html5-entity")
+    (version "0.2.0.3")
+    (source (origin
+              (method url-fetch)
+              (uri (hackage-uri "html5-entity" version))
+              (sha256
+               (base32 "0bmmzshxanzw5y2y0hvgzz9yw18jqgv535i1xq2a5lf7w8wpj1if"))))
+    (build-system haskell-build-system)
+    (home-page "https://github.com/zudov/html5-entity/")
+    (synopsis "Library for looking up and validating HTML5 entities")
+    (description
+     "This package provides a library for looking up and validating HTML5 entities.
+The document at @url{http://html.spec.whatwg.org/multipage/entities.json} is
+used as an authoritative source of the valid entity names and their
+corresponding codepoints.  You can think of this library as bindings to
+the data from that file.  For usage see the @code{Text.Html5.Entity} module.")
+    (license bsd-3)))
+
+(define-public ghc-xmlfilter
+  (package
+    (name "ghc-xmlfilter")
+    (version "0.3.1.0")
+    ;; Kept ahead of `source': the git reference below reuses it as its URL.
+    (home-page "https://gitlab.huma-num.fr/alicebrenon/XMLFilter")
+    (build-system haskell-build-system)
+    (inputs (list ghc-attoparsec))
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference (commit version) (url home-page)))
+              (sha256
+               (base32 "1prsrhpzqx7wcbvzvmc4845f03bvxfazn3v2kfajnr1fnrcbb8nf"))))
+    (synopsis "SAX-based library to handle XML-like documents")
+    (description
+      "Reads the files as streams of markup events to allow altering the file
+      content on the fly and fixing broken files which aren't proper XML")
+    (license bsd-3)))
+
+;; Scripts and data of project DISCO, built with the GNU build system.
+(define-public processing-lge
+  (package
+    ;; NOTE(review): Guix package names are conventionally lowercase -- confirm.
+    (name "ProcessingLGE")
+    (version "0.1.0.0")
+    (home-page "https://gitlab.huma-num.fr/disco-lge/processinglge")
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference (url home-page) (commit version)))
+              (sha256
+               (base32 "0hzlcwy2zdzgy5a2il19zk24159lwpgnbv4cqbi8jpxv5jwb0sww"))))
+    (build-system gnu-build-system)
+    ;; No test suite is run (presumably upstream ships none -- TODO confirm).
+    (arguments '(#:tests? #f))
+    (propagated-inputs (list soprano unzip zip))
+    (synopsis "Scripts and data for project DISCO")
+    (description
+      "Scripts to process La Grande Encyclopédie and suggestions of filters to
+      clean the ALTO files.")
+    (license bsd-3)))
+
+;; Extracts text from ALTO files; Haskell program built from its git repository.
+(define-public soprano
+  (package
+    (name "soprano")
+    (version "0.2.0.1")
+    (home-page "https://gitlab.huma-num.fr/disco-lge/soprano")
+    (source (origin
+              (method git-fetch)
+              (uri (git-reference (url home-page) (commit version)))
+              (sha256
+                (base32 "17njp16nn2lf7xldkdbkhm7p7vji61pmli3fsdbk9azys1k2yhvy"))))
+    (build-system haskell-build-system)
+    (inputs (list ghc-edit-distance ghc-pipes ghc-roman-numerals ghc-xml
+                  ghc-optparse-applicative))
+    (synopsis "Extract textual content from ALTO files")
+    (description "Soprano is a tool developed for project GÉODE
+(@url{https://geode-project.github.io/}) which studies the geographic discourse
+in encyclopedias.  It was designed to extract encyclopedia articles from OCRed
+pages represented by a set of ALTO files.
+
+It lets one apply various filtering in the process, for instance specifying the
+type of ALTO blocks to retain or setting a quality threshold on OCR recognition.
+Articles can be output in raw text or encoded in XML-TEI.")
+    (license bsd-3)))
-- 
GitLab