From cb2344573c81e1fa019ed3dec6eda85cbbf0cc18 Mon Sep 17 00:00:00 2001
From: Alice BRENON <alice.brenon@ens-lyon.fr>
Date: Thu, 7 Sep 2023 18:50:36 +0200
Subject: [PATCH] Add a script to explode (Text) articles into paragraphs,
 outputing a files.tsv with a primary key for each of them in the process

---
 lib/GEODE/Metadata/File.hs                 |  6 +++
 lib/GEODE/Metadata/PrimaryKey/Paragraph.hs | 52 ++++++++++++++++++++++
 manifest.scm                               |  3 +-
 scripts/paragraphs.hs                      | 39 ++++++++++++++++
 4 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 lib/GEODE/Metadata/File.hs
 create mode 100644 lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
 create mode 100755 scripts/paragraphs.hs

diff --git a/lib/GEODE/Metadata/File.hs b/lib/GEODE/Metadata/File.hs
new file mode 100644
index 0000000..9565b0d
--- /dev/null
+++ b/lib/GEODE/Metadata/File.hs
@@ -0,0 +1,6 @@
+module GEODE.Metadata.File
+  ( File(..) ) where
+
+class File a where
+  uid :: a -> String
+  relativePath :: a -> String -> FilePath
diff --git a/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs b/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
new file mode 100644
index 0000000..e0d906f
--- /dev/null
+++ b/lib/GEODE/Metadata/PrimaryKey/Paragraph.hs
@@ -0,0 +1,52 @@
+{-# LANGUAGE NamedFieldPuns, OverloadedStrings #-}
+module GEODE.Metadata.PrimaryKey.Paragraph
+  ( ParagraphPK(..) ) where
+
+import Data.Aeson ((.=), ToJSON(..), object, pairs)
+import Data.Csv
+  ( (.:), FromNamedRecord(..), ToNamedRecord(..), namedField, namedRecord )
+import GEODE.Metadata.File (File(..))
+import GEODE.Metadata as Article
+  (DefaultFields(..), HasDefaultHeader(..), PrimaryKey(PrimaryKey, book, tome), uid, relativePath)
+import qualified GEODE.Metadata as Article (PrimaryKey(rank))
+import System.FilePath ((</>), (<.>), dropExtension)
+import Text.Printf (printf)
+
+data ParagraphPK = ParagraphPK
+  { article :: PrimaryKey
+  , rank :: Int } deriving (Eq, Ord, Show)
+
+instance FromNamedRecord ParagraphPK where
+  parseNamedRecord nr = ParagraphPK
+    <$> (PrimaryKey <$> nr .: "book" <*> nr .: "tome" <*> nr .: "article#")
+    <*> nr .: "paragraph#"
+
+instance ToNamedRecord ParagraphPK where
+  toNamedRecord (ParagraphPK {article, rank}) = namedRecord
+    [ namedField "book" (book article)
+    , namedField "tome" (tome article)
+    , namedField "article#" (Article.rank article)
+    , namedField "paragraph#" rank ]
+
+instance File ParagraphPK where
+  uid (ParagraphPK {article, rank}) = printf "%s_%d" (Article.uid article) rank
+
+  relativePath (ParagraphPK {article, rank}) extension =
+    articleDirectory </> (show rank) <.> extension
+    where
+      articleDirectory = dropExtension (Article.relativePath article "")
+
+instance HasDefaultHeader ParagraphPK where
+  defaultFields = DefaultFields (["book", "tome", "article#", "paragraph#"])
+
+instance ToJSON ParagraphPK where
+  toJSON (ParagraphPK {article, rank}) = object
+    [ "book" .= book article
+    , "tome" .= tome article
+    , "article#" .= Article.rank article
+    , "paragraph#" .= rank ]
+  toEncoding (ParagraphPK {article, rank}) = pairs
+    ( "book" .= book article
+    <> "tome" .= tome article
+    <> "article#" .= Article.rank article
+    <> "paragraph#" .= rank )
diff --git a/manifest.scm b/manifest.scm
index a8ead9b..bf7afd5 100644
--- a/manifest.scm
+++ b/manifest.scm
@@ -2,7 +2,7 @@
              ((geode packages models) #:select (stanza-fr))
              ((gnu packages commencement) #:select (gcc-toolchain))
              ((gnu packages haskell) #:select (ghc))
-             ((gnu packages haskell-web) #:select (ghc-hxt))
+             ((gnu packages haskell-web) #:select (ghc-aeson ghc-hxt))
              ((gnu packages haskell-xyz) #:select (ghc-cassava ghc-hs-conllu))
              ((gnu packages python) #:select (python))
              ((gnu packages python-science) #:select (python-pandas))
@@ -19,6 +19,7 @@
     ;edda-clinic ; fix and cut the EDdA
     gcc-toolchain ; running haskell
     ghc ; running haskell
+    ghc-aeson ; working with JSON in haskell
     ghc-cassava ; working with CSV in haskell
     ;ghc-geode ; handling corpus files
     ghc-hs-conllu ; working on syntax-annotated documents
diff --git a/scripts/paragraphs.hs b/scripts/paragraphs.hs
new file mode 100755
index 0000000..453b943
--- /dev/null
+++ b/scripts/paragraphs.hs
@@ -0,0 +1,39 @@
+#!/usr/bin/env -S runhaskell --ghc-arg="-Wall" --ghc-arg="-i lib"
+{-# LANGUAGE OverloadedStrings #-}
+import Data.List (dropWhileEnd)
+import Data.Text (Text, splitOn)
+import Data.Text.IO as Text (readFile, writeFile)
+import GEODE.Metadata.File as File (File(..))
+import GEODE.Metadata as Article (PrimaryKey, readNamedTsv, relativePath, tsvFile)
+import GEODE.Metadata.PrimaryKey.Paragraph (ParagraphPK(..))
+import System.Directory (createDirectoryIfMissing)
+import System.Environment (getArgs)
+import System.FilePath ((</>), (<.>), isPathSeparator, takeDirectory)
+import System.Script (syntax, try)
+import Text.Filter (Editable(..))
+import Text.Filter.Linearize (linearize)
+
+articleParagraphs :: FilePath -> IO [Text]
+articleParagraphs =
+  fmap (fmap (leave . linearize False . enter) . splitOn "\n\n") . Text.readFile
+
+to :: FilePath -> FilePath -> PrimaryKey -> IO [ParagraphPK]
+to source target article = do
+  createDirectoryIfMissing True (target </> Article.relativePath article "")
+  articleParagraphs articlePath >>= mapM create . number
+  where
+    articlePath = source </> Article.relativePath article "txt"
+    number = zip [1..]
+    create (rank, paragraphText) =
+      let paragraphPK = ParagraphPK {article, rank}
+          outputPath = target </> File.relativePath paragraphPK "txt" in
+      paragraphPK <$ Text.writeFile outputPath paragraphText
+
+main :: IO ()
+main = (fmap (dropWhileEnd isPathSeparator) <$> getArgs) >>= run
+  where
+    run [inputMeta, source, target] =
+        try (readNamedTsv inputMeta)
+      >>= mapM (source `to` target)
+      >>= tsvFile (takeDirectory target </> "files" <.> "tsv") . concat
+    run _ = syntax "INPUT_METADATA SOURCE_DIRECTORY TARGET_DIRECTORY"
-- 
GitLab